{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 529.5, "completions/mean_terminated_length": 496.5333557128906, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.0030864197530864196, "grad_norm": 4.74130083751676, "kl": NaN, "learning_rate": 0.0, "loss": -0.1875, "num_tokens": 23472.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 542.03125, "completions/mean_terminated_length": 509.9000244140625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.006172839506172839, "grad_norm": 4.716907773314693, "kl": NaN, "learning_rate": 1.020408163265306e-08, "loss": -0.1875, "num_tokens": 47325.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 2 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 638.625, "completions/mean_terminated_length": 510.16668701171875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.009259259259259259, "grad_norm": 5.525517617865517, "kl": NaN, "learning_rate": 2.040816326530612e-08, "loss": -0.1875, "num_tokens": 74461.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 3 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 482.125, "completions/mean_terminated_length": 482.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.012345679012345678, "grad_norm": 5.694040850544329, "kl": NaN, "learning_rate": 3.0612244897959183e-08, "loss": -0.1875, "num_tokens": 96257.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 4 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 514.28125, "completions/mean_terminated_length": 514.28125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.015432098765432098, "grad_norm": 9.88473138977109, "kl": NaN, "learning_rate": 4.081632653061224e-08, "loss": -0.0183, "num_tokens": 118998.0, "reward": -1.3969838619232178e-09, "reward_std": 0.18550412356853485, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.3592105805873871, "step": 5 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 507.625, "completions/mean_terminated_length": 473.20001220703125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.018518518518518517, "grad_norm": 5.90717969482145, "kl": NaN, "learning_rate": 5.1020408163265303e-08, "loss": -0.1875, "num_tokens": 141370.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 6 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 684.75, "completions/mean_terminated_length": 636.2857666015625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.021604938271604937, "grad_norm": 9.60087477291155, "kl": NaN, "learning_rate": 6.122448979591837e-08, "loss": -0.0, "num_tokens": 170098.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 7 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 563.28125, "completions/mean_terminated_length": 532.5667114257812, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.024691358024691357, "grad_norm": 0.0, "kl": NaN, "learning_rate": 7.142857142857142e-08, "loss": 0.0, "num_tokens": 194259.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 8 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 526.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.027777777777777776, "grad_norm": 0.0, "kl": NaN, "learning_rate": 8.163265306122448e-08, "loss": 0.0, "num_tokens": 217555.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 9 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 508.1875, "completions/mean_terminated_length": 473.8000183105469, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.030864197530864196, "grad_norm": 12.119144518696967, "kl": NaN, "learning_rate": 9.183673469387755e-08, "loss": -0.0, "num_tokens": 239785.0, "reward": -1.862645149230957e-09, "reward_std": 0.15130963921546936, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 10 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 539.78125, "completions/mean_terminated_length": 539.78125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.033950617283950615, "grad_norm": 4.187066212925879, "kl": NaN, "learning_rate": 1.0204081632653061e-07, "loss": -0.1875, "num_tokens": 263486.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 11 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 516.71875, "completions/mean_terminated_length": 500.3548278808594, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.037037037037037035, "grad_norm": 6.532980810899672, "kl": NaN, "learning_rate": 1.1224489795918366e-07, "loss": -0.0, "num_tokens": 286193.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 12 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 524.1875, "completions/mean_terminated_length": 524.1875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.040123456790123455, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.2244897959183673e-07, "loss": 0.0, "num_tokens": 309231.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 507.75, "completions/mean_terminated_length": 473.3333435058594, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.043209876543209874, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.326530612244898e-07, "loss": 0.0, "num_tokens": 331919.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 528.0, "completions/mean_terminated_length": 494.933349609375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.046296296296296294, "grad_norm": 9.610937197935552, "kl": NaN, "learning_rate": 1.4285714285714285e-07, "loss": -0.1703, "num_tokens": 355315.0, "reward": -3.725290298461914e-09, "reward_std": 0.12426391243934631, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 15 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 540.90625, "completions/mean_terminated_length": 525.3225708007812, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.04938271604938271, "grad_norm": 3.5559481335414636, "kl": NaN, "learning_rate": 1.5306122448979592e-07, "loss": -0.1875, "num_tokens": 379508.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 16 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 520.9375, "completions/mean_terminated_length": 504.70965576171875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.05246913580246913, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.6326530612244896e-07, "loss": 0.0, "num_tokens": 402982.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 452.90625, "completions/mean_terminated_length": 452.90625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.05555555555555555, "grad_norm": 0.0019381009413819638, "kl": NaN, "learning_rate": 1.7346938775510203e-07, "loss": 0.0, "num_tokens": 423743.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 513.03125, "completions/mean_terminated_length": 513.03125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.05864197530864197, "grad_norm": 3.8400628573522324, "kl": NaN, "learning_rate": 1.836734693877551e-07, "loss": -0.1875, "num_tokens": 446912.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 19 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 567.125, "completions/mean_terminated_length": 482.5185241699219, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.06172839506172839, "grad_norm": 3.969671416208202, "kl": NaN, "learning_rate": 1.9387755102040814e-07, "loss": -0.1875, "num_tokens": 471448.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 20 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 511.4375, "completions/mean_terminated_length": 511.4375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.06481481481481481, "grad_norm": 5.348841577705897, "kl": NaN, "learning_rate": 2.0408163265306121e-07, "loss": -0.1875, "num_tokens": 494254.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 21 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 550.875, "completions/mean_terminated_length": 535.6129150390625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.06790123456790123, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "num_tokens": 519170.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 520.09375, "completions/mean_terminated_length": 520.09375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.07098765432098765, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.2448979591836733e-07, "loss": 0.0, "num_tokens": 542389.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 494.8125, "completions/mean_terminated_length": 477.7419128417969, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.07407407407407407, "grad_norm": 5.883080414950403, "kl": NaN, "learning_rate": 2.346938775510204e-07, "loss": -0.1875, "num_tokens": 564475.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 24 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 543.6875, "completions/mean_terminated_length": 511.66668701171875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.07716049382716049, "grad_norm": 4.292052701984573, "kl": NaN, "learning_rate": 2.4489795918367347e-07, "loss": -0.1875, "num_tokens": 588373.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 25 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 458.1875, "completions/mean_terminated_length": 458.1875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.08024691358024691, "grad_norm": 5.34225123241201, "kl": NaN, "learning_rate": 2.551020408163265e-07, "loss": -0.1874, "num_tokens": 609663.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 499.46875, "completions/mean_terminated_length": 499.46875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.08333333333333333, "grad_norm": 7.9853862621483795, "kl": NaN, "learning_rate": 2.653061224489796e-07, "loss": -0.0, "num_tokens": 632082.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 27 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 544.0625, "completions/mean_terminated_length": 528.5806274414062, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.08641975308641975, "grad_norm": 4.542744467532849, "kl": NaN, "learning_rate": 2.755102040816326e-07, "loss": -0.1875, "num_tokens": 656264.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 28 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 485.0, "completions/mean_terminated_length": 467.6128845214844, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.08950617283950617, "grad_norm": 4.302072310973842, "kl": NaN, "learning_rate": 2.857142857142857e-07, "loss": -0.1875, "num_tokens": 677960.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 29 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 557.40625, "completions/mean_terminated_length": 557.40625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.09259259259259259, "grad_norm": 6.465867393978447, "kl": NaN, "learning_rate": 2.9591836734693874e-07, "loss": -0.0, "num_tokens": 702525.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 30 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 560.625, "completions/mean_terminated_length": 529.7333374023438, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.09567901234567901, "grad_norm": 9.204797689062591, "kl": NaN, "learning_rate": 3.0612244897959183e-07, "loss": 0.187, "num_tokens": 726877.0, "reward": -3.725290298461914e-09, "reward_std": 0.1350192129611969, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 31 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 483.65625, "completions/mean_terminated_length": 466.2257995605469, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.09876543209876543, "grad_norm": 5.047012153592003, "kl": NaN, "learning_rate": 3.163265306122449e-07, "loss": -0.1875, "num_tokens": 748570.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 32 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 614.09375, "completions/mean_terminated_length": 555.5357666015625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.10185185185185185, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.265306122448979e-07, "loss": 0.0, "num_tokens": 775373.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 33 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 542.53125, "completions/mean_terminated_length": 527.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.10493827160493827, "grad_norm": 5.930133727857815, "kl": NaN, "learning_rate": 3.3673469387755096e-07, "loss": -0.1875, "num_tokens": 798974.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 34 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 502.34375, "completions/mean_terminated_length": 485.51611328125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.10802469135802469, "grad_norm": 10.104299577868375, "kl": NaN, "learning_rate": 3.4693877551020406e-07, "loss": -0.1874, "num_tokens": 821625.0, "reward": 1.862645149230957e-09, "reward_std": 0.14388087391853333, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 35 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 464.15625, "completions/mean_terminated_length": 464.15625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.1111111111111111, "grad_norm": 7.848102759829451, "kl": NaN, "learning_rate": 3.5714285714285716e-07, "loss": -0.3749, "num_tokens": 842938.0, "reward": 0.02812499739229679, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 36 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 522.4375, "completions/mean_terminated_length": 522.4375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.11419753086419752, "grad_norm": 1.7514018540364926, "kl": NaN, "learning_rate": 3.673469387755102e-07, "loss": -0.0719, "num_tokens": 865932.0, "reward": 0.02812499925494194, "reward_std": 0.07739149034023285, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 37 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 616.0, "completions/mean_terminated_length": 557.7142944335938, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.11728395061728394, "grad_norm": 8.139154074899833, "kl": NaN, "learning_rate": 3.7755102040816324e-07, "loss": -0.1875, "num_tokens": 892432.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 38 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 514.34375, "completions/mean_terminated_length": 514.34375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.12037037037037036, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.877551020408163e-07, "loss": 0.0, "num_tokens": 915023.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 538.71875, "completions/mean_terminated_length": 488.5172424316406, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.12345679012345678, "grad_norm": 10.38047968051339, "kl": NaN, "learning_rate": 3.979591836734694e-07, "loss": -0.1875, "num_tokens": 939246.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 40 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 539.21875, "completions/mean_terminated_length": 489.0689697265625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.12654320987654322, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.0816326530612243e-07, "loss": 0.0, "num_tokens": 963425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 41 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 481.46875, "completions/mean_terminated_length": 425.3448181152344, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.12962962962962962, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.183673469387755e-07, "loss": 0.0, "num_tokens": 985176.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 577.1875, "completions/mean_terminated_length": 547.4000244140625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.13271604938271606, "grad_norm": 4.9499267033359216, "kl": NaN, "learning_rate": 4.285714285714285e-07, "loss": -0.1875, "num_tokens": 1010554.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 43 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 489.21875, "completions/mean_terminated_length": 489.21875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.13580246913580246, "grad_norm": 7.089035443815076, "kl": NaN, "learning_rate": 4.387755102040816e-07, "loss": -0.1875, "num_tokens": 1032629.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 44 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 473.34375, "completions/mean_terminated_length": 473.34375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.1388888888888889, "grad_norm": 5.925047387295865, "kl": NaN, "learning_rate": 4.4897959183673465e-07, "loss": -0.0, "num_tokens": 1054348.0, "reward": -1.862645149230957e-09, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 45 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 594.375, "completions/mean_terminated_length": 514.8148193359375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1419753086419753, "grad_norm": 8.248861691563496, "kl": NaN, "learning_rate": 4.5918367346938775e-07, "loss": -0.1875, "num_tokens": 1080012.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 46 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 525.75, "completions/mean_terminated_length": 492.5333557128906, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.14506172839506173, "grad_norm": 3.3479654261151626, "kl": NaN, "learning_rate": 4.693877551020408e-07, "loss": -0.1875, "num_tokens": 1103256.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 47 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 551.75, "completions/mean_terminated_length": 502.89654541015625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.14814814814814814, "grad_norm": 4.134182401036001, "kl": NaN, "learning_rate": 4.795918367346938e-07, "loss": -0.1875, "num_tokens": 1127392.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 48 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 560.875, "completions/mean_terminated_length": 530.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.15123456790123457, "grad_norm": 7.640595895124005, "kl": NaN, "learning_rate": 4.897959183673469e-07, "loss": -0.0, "num_tokens": 1152032.0, "reward": -1.862645149230957e-09, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 49 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 559.15625, "completions/mean_terminated_length": 492.7500305175781, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.15432098765432098, "grad_norm": 4.335900513254463, "kl": NaN, "learning_rate": 5e-07, "loss": -0.1875, "num_tokens": 1176601.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 50 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 461.59375, "completions/mean_terminated_length": 461.59375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.1574074074074074, "grad_norm": 6.874486086709533, "kl": NaN, "learning_rate": 4.999995001298037e-07, "loss": 0.1054, "num_tokens": 1197684.0, "reward": -1.862645149230957e-09, "reward_std": 0.14500659704208374, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 51 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 555.09375, "completions/mean_terminated_length": 539.9677124023438, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.16049382716049382, "grad_norm": 5.888160885729557, "kl": NaN, "learning_rate": 4.99998000521214e-07, "loss": 0.1873, "num_tokens": 1222175.0, "reward": -3.725290298461914e-09, "reward_std": 0.13756419718265533, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 52 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 523.8125, "completions/mean_terminated_length": 490.4667053222656, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.16358024691358025, "grad_norm": 6.895723747555437, "kl": NaN, "learning_rate": 4.999955011802275e-07, "loss": -0.1875, "num_tokens": 1245153.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 53 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 497.96875, "completions/mean_terminated_length": 481.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.16666666666666666, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.999920021168393e-07, "loss": 0.0, "num_tokens": 1267236.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 506.59375, "completions/mean_terminated_length": 506.59375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1697530864197531, "grad_norm": 11.542699057812628, "kl": NaN, "learning_rate": 4.999875033450417e-07, "loss": -0.2016, "num_tokens": 1290135.0, "reward": -3.725290298461914e-09, "reward_std": 0.23362484574317932, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 55 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 580.3125, "completions/mean_terminated_length": 550.7333374023438, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.1728395061728395, "grad_norm": 6.082929826276223, "kl": NaN, "learning_rate": 4.999820048828253e-07, "loss": -0.0, "num_tokens": 1316181.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 56 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 537.40625, "completions/mean_terminated_length": 487.0689697265625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.17592592592592593, "grad_norm": 3.300891508456188, "kl": NaN, "learning_rate": 4.999755067521781e-07, "loss": -0.1875, "num_tokens": 1339862.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 57 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 510.28125, "completions/mean_terminated_length": 476.0333557128906, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.17901234567901234, "grad_norm": 10.718095689791374, "kl": NaN, "learning_rate": 4.999680089790861e-07, "loss": -0.1875, "num_tokens": 1363267.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 58 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 480.3125, "completions/mean_terminated_length": 480.3125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.18209876543209877, "grad_norm": 5.638279253022429, "kl": NaN, "learning_rate": 4.999595115935325e-07, "loss": -0.0, "num_tokens": 1385133.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 59 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 489.03125, "completions/mean_terminated_length": 489.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.18518518518518517, "grad_norm": 5.361759756119009, "kl": NaN, "learning_rate": 4.999500146294979e-07, "loss": -0.3749, "num_tokens": 1406898.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 60 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 609.25, "completions/mean_terminated_length": 595.8709716796875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.1882716049382716, "grad_norm": 8.180312606246089, "kl": NaN, "learning_rate": 4.999395181249604e-07, "loss": 0.187, "num_tokens": 1433194.0, "reward": 0.0, "reward_std": 0.15781110525131226, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 61 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 475.28125, "completions/mean_terminated_length": 457.58062744140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.19135802469135801, "grad_norm": 9.370486688856312, "kl": NaN, "learning_rate": 4.99928022121895e-07, "loss": 0.1869, "num_tokens": 1454883.0, "reward": -3.725290298461914e-09, "reward_std": 0.13469690084457397, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 62 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 527.875, "completions/mean_terminated_length": 476.5517272949219, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.19444444444444445, "grad_norm": 4.488958880852314, "kl": NaN, "learning_rate": 4.99915526666274e-07, "loss": -0.0793, "num_tokens": 1478499.0, "reward": -3.725290298461914e-09, "reward_std": 0.12216030061244965, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 63 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 556.65625, "completions/mean_terminated_length": 525.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.19753086419753085, "grad_norm": 8.177889428278384, "kl": NaN, "learning_rate": 4.999020318080661e-07, "loss": 0.2708, "num_tokens": 1502564.0, "reward": -3.725290298461914e-09, "reward_std": 0.23764848709106445, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909375190735, "step": 64 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 442.84375, "completions/mean_terminated_length": 442.84375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2006172839506173, "grad_norm": 8.354373096484785, "kl": NaN, "learning_rate": 4.998875376012368e-07, "loss": 0.1869, "num_tokens": 1522991.0, "reward": -7.450580596923828e-09, "reward_std": 0.13481050729751587, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.7939677238464355e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 65 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 517.96875, "completions/mean_terminated_length": 484.2333679199219, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2037037037037037, "grad_norm": 9.443968749041115, "kl": NaN, "learning_rate": 4.998720441037479e-07, "loss": 0.1873, "num_tokens": 1546154.0, "reward": 1.862645149230957e-09, "reward_std": 0.1590980887413025, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 66 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 582.15625, "completions/mean_terminated_length": 519.0357666015625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.20679012345679013, "grad_norm": 6.238242069432459, "kl": NaN, "learning_rate": 4.99855551377557e-07, "loss": 0.0607, "num_tokens": 1571659.0, "reward": -3.725290298461914e-09, "reward_std": 0.14501814544200897, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 67 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 459.90625, "completions/mean_terminated_length": 441.70965576171875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.20987654320987653, "grad_norm": 6.357087874651633, "kl": NaN, "learning_rate": 4.998380594886182e-07, "loss": 0.0735, "num_tokens": 1592716.0, "reward": -7.450580596923828e-09, "reward_std": 0.12894144654273987, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 68 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 518.375, "completions/mean_terminated_length": 518.375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.21296296296296297, "grad_norm": 8.525097137526107, "kl": NaN, "learning_rate": 4.998195685068808e-07, "loss": -0.1602, "num_tokens": 1615664.0, "reward": 0.0, "reward_std": 0.22717738151550293, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 69 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 549.21875, "completions/mean_terminated_length": 517.5667114257812, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.21604938271604937, "grad_norm": 11.023532351660702, "kl": NaN, "learning_rate": 4.998000785062895e-07, "loss": -0.0997, "num_tokens": 1640415.0, "reward": 0.0, "reward_std": 0.15491779148578644, "rewards/format_reward_func/mean": -2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 70 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 589.1875, "completions/mean_terminated_length": 527.0714721679688, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.2191358024691358, "grad_norm": 9.95859480120045, "kl": NaN, "learning_rate": 4.997795895647841e-07, "loss": -0.0, "num_tokens": 1666257.0, "reward": -9.313225746154785e-10, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 71 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 427.78125, "completions/mean_terminated_length": 408.5483703613281, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2222222222222222, "grad_norm": 9.711779424497152, "kl": NaN, "learning_rate": 4.997581017642991e-07, "loss": -0.3742, "num_tokens": 1686250.0, "reward": -3.725290298461914e-09, "reward_std": 0.18278822302818298, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 72 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 555.96875, "completions/mean_terminated_length": 447.9615478515625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.22530864197530864, "grad_norm": 12.908124001807106, "kl": NaN, "learning_rate": 4.997356151907633e-07, "loss": 0.1446, "num_tokens": 1710849.0, "reward": -4.6566128730773926e-09, "reward_std": 0.22358503937721252, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 73 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 442.53125, "completions/mean_terminated_length": 423.774169921875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.22839506172839505, "grad_norm": 8.08954496023087, "kl": NaN, "learning_rate": 4.997121299340997e-07, "loss": 0.3373, "num_tokens": 1731202.0, "reward": -3.725290298461914e-09, "reward_std": 0.1365959793329239, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 74 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 527.40625, "completions/mean_terminated_length": 494.3000183105469, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.23148148148148148, "grad_norm": 7.5287129512560345, "kl": NaN, "learning_rate": 4.99687646088225e-07, "loss": 0.4762, "num_tokens": 1754607.0, "reward": 1.1175870895385742e-08, "reward_std": 0.22805829346179962, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 75 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 444.34375, "completions/mean_terminated_length": 425.6451416015625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2345679012345679, "grad_norm": 6.557388492091582, "kl": NaN, "learning_rate": 4.996621637510491e-07, "loss": 0.2711, "num_tokens": 1775542.0, "reward": -3.725290298461914e-09, "reward_std": 0.14887697994709015, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 76 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 525.8125, "completions/mean_terminated_length": 509.7419128417969, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.23765432098765432, "grad_norm": 5.279569116437037, "kl": NaN, "learning_rate": 4.996356830244749e-07, "loss": -0.1931, "num_tokens": 1799104.0, "reward": 3.725290298461914e-09, "reward_std": 0.19772586226463318, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 5.587935447692871e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 77 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 483.03125, "completions/mean_terminated_length": 465.58062744140625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.24074074074074073, "grad_norm": 7.532480100642041, "kl": NaN, "learning_rate": 4.996082040143977e-07, "loss": 0.0932, "num_tokens": 1820565.0, "reward": 7.450580596923828e-09, "reward_std": 0.21790876984596252, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 78 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 525.40625, "completions/mean_terminated_length": 454.1785888671875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.24382716049382716, "grad_norm": 2.2175718648911444, "kl": NaN, "learning_rate": 4.995797268307051e-07, "loss": -0.0517, "num_tokens": 1843894.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 79 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 552.6875, "completions/mean_terminated_length": 465.40740966796875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.24691358024691357, "grad_norm": 2.7112926160139468, "kl": NaN, "learning_rate": 4.995502515872763e-07, "loss": -0.0311, "num_tokens": 1868112.0, "reward": -9.313225746154785e-10, "reward_std": 0.06587611883878708, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 80 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 493.15625, "completions/mean_terminated_length": 476.0322570800781, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.25, "grad_norm": 7.540066725909991, "kl": NaN, "learning_rate": 4.995197784019818e-07, "loss": -0.374, "num_tokens": 1890461.0, "reward": 0.0, "reward_std": 0.17774741351604462, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 81 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 553.46875, "completions/mean_terminated_length": 504.7930908203125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.25308641975308643, "grad_norm": 8.277666974684571, "kl": NaN, "learning_rate": 4.994883073966823e-07, "loss": 0.0969, "num_tokens": 1914820.0, "reward": 0.0, "reward_std": 0.1777360737323761, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 82 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 537.15625, "completions/mean_terminated_length": 521.4515991210938, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.25617283950617287, "grad_norm": 6.7740350517861865, "kl": NaN, "learning_rate": 4.994558386972295e-07, "loss": -0.4215, "num_tokens": 1938793.0, "reward": 7.450580596923828e-09, "reward_std": 0.21613164246082306, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 83 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 458.1875, "completions/mean_terminated_length": 420.4666748046875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.25925925925925924, "grad_norm": 2.6867147276116556, "kl": NaN, "learning_rate": 4.994223724334643e-07, "loss": -0.055, "num_tokens": 1959735.0, "reward": -5.587935447692871e-09, "reward_std": 0.1328701227903366, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 84 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 527.21875, "completions/mean_terminated_length": 475.82757568359375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2623456790123457, "grad_norm": 5.6286038551084685, "kl": NaN, "learning_rate": 4.99387908739217e-07, "loss": -0.3749, "num_tokens": 1983126.0, "reward": 0.02812499739229679, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 85 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 522.34375, "completions/mean_terminated_length": 488.9000244140625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.2654320987654321, "grad_norm": 9.699949499271524, "kl": NaN, "learning_rate": 4.993524477523067e-07, "loss": -0.0421, "num_tokens": 2006269.0, "reward": -1.862645149230957e-09, "reward_std": 0.2729633152484894, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 86 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 624.6875, "completions/mean_terminated_length": 550.74072265625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.26851851851851855, "grad_norm": 6.430087493862934, "kl": NaN, "learning_rate": 4.993159896145405e-07, "loss": 0.1025, "num_tokens": 2033575.0, "reward": -3.725290298461914e-09, "reward_std": 0.1897057294845581, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 87 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 481.5625, "completions/mean_terminated_length": 445.4000244140625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2716049382716049, "grad_norm": 11.45326297061015, "kl": NaN, "learning_rate": 4.99278534471713e-07, "loss": -0.0001, "num_tokens": 2055329.0, "reward": 0.0, "reward_std": 0.1972888708114624, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 88 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 581.125, "completions/mean_terminated_length": 499.1111145019531, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.27469135802469136, "grad_norm": 10.331998309422328, "kl": NaN, "learning_rate": 4.992400824736059e-07, "loss": 0.3698, "num_tokens": 2080901.0, "reward": 0.0, "reward_std": 0.17625850439071655, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 89 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 594.5, "completions/mean_terminated_length": 514.9629516601562, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2777777777777778, "grad_norm": 11.165598362989805, "kl": NaN, "learning_rate": 4.992006337739874e-07, "loss": -0.0311, "num_tokens": 2106693.0, "reward": 1.4901161193847656e-08, "reward_std": 0.31283944845199585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 90 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 519.8125, "completions/mean_terminated_length": 486.20001220703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2808641975308642, "grad_norm": 13.002113964292453, "kl": NaN, "learning_rate": 4.991601885306111e-07, "loss": 0.3115, "num_tokens": 2129763.0, "reward": 1.4901161193847656e-08, "reward_std": 0.2638370990753174, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 91 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 555.15625, "completions/mean_terminated_length": 488.1785888671875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2839506172839506, "grad_norm": 12.791846118361295, "kl": NaN, "learning_rate": 4.991187469052162e-07, "loss": 0.2108, "num_tokens": 2154216.0, "reward": 7.450580596923828e-09, "reward_std": 0.2904241681098938, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.5388159155845642, "step": 92 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 536.5, "completions/mean_terminated_length": 504.0000305175781, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.28703703703703703, "grad_norm": 2.792875412014937, "kl": NaN, "learning_rate": 4.99076309063526e-07, "loss": -0.0838, "num_tokens": 2177772.0, "reward": -3.725290298461914e-09, "reward_std": 0.2394038587808609, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 93 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 547.1875, "completions/mean_terminated_length": 497.862060546875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.29012345679012347, "grad_norm": 9.757143632177169, "kl": NaN, "learning_rate": 4.99032875175248e-07, "loss": -0.1874, "num_tokens": 2201578.0, "reward": -3.725290298461914e-09, "reward_std": 0.18489786982536316, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.3969838619232178e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 94 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 550.875, "completions/mean_terminated_length": 463.25927734375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.2932098765432099, "grad_norm": 8.9755366295973, "kl": NaN, "learning_rate": 4.989884454140724e-07, "loss": 0.4945, "num_tokens": 2225358.0, "reward": -1.862645149230957e-09, "reward_std": 0.16876746714115143, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 95 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 471.71875, "completions/mean_terminated_length": 453.9031982421875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.2962962962962963, "grad_norm": 9.55185606564022, "kl": NaN, "learning_rate": 4.989430199576722e-07, "loss": 0.2836, "num_tokens": 2246301.0, "reward": -1.862645149230957e-09, "reward_std": 0.29134178161621094, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.5080004930496216, "step": 96 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 512.84375, "completions/mean_terminated_length": 478.7666931152344, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.2993827160493827, "grad_norm": 7.801768149510328, "kl": NaN, "learning_rate": 4.988965989877022e-07, "loss": -0.1874, "num_tokens": 2268696.0, "reward": -7.450580596923828e-09, "reward_std": 0.16211099922657013, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 97 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 547.46875, "completions/mean_terminated_length": 515.7000122070312, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.30246913580246915, "grad_norm": 8.110425669427034, "kl": NaN, "learning_rate": 4.988491826897978e-07, "loss": 0.2677, "num_tokens": 2292971.0, "reward": -7.450580596923828e-09, "reward_std": 0.1473100185394287, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 98 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 499.5, "completions/mean_terminated_length": 499.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3055555555555556, "grad_norm": 3.1063567716739673, "kl": NaN, "learning_rate": 4.988007712535752e-07, "loss": -0.0527, "num_tokens": 2315347.0, "reward": -2.7939677238464355e-09, "reward_std": 0.1585049033164978, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 99 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 574.71875, "completions/mean_terminated_length": 510.5357360839844, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.30864197530864196, "grad_norm": 5.620277176386703, "kl": NaN, "learning_rate": 4.987513648726298e-07, "loss": -0.247, "num_tokens": 2340430.0, "reward": 0.02812499739229679, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 100 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 551.3125, "completions/mean_terminated_length": 483.7857360839844, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3117283950617284, "grad_norm": 11.872955258019072, "kl": NaN, "learning_rate": 4.987009637445358e-07, "loss": 0.1215, "num_tokens": 2364524.0, "reward": 1.862645149230957e-09, "reward_std": 0.3255133628845215, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 101 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 566.84375, "completions/mean_terminated_length": 519.5516967773438, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3148148148148148, "grad_norm": 11.232002516340126, "kl": NaN, "learning_rate": 4.986495680708453e-07, "loss": 0.584, "num_tokens": 2389303.0, "reward": 0.0, "reward_std": 0.25867360830307007, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 102 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 550.03125, "completions/mean_terminated_length": 482.3214416503906, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.31790123456790126, "grad_norm": 9.777628123938609, "kl": NaN, "learning_rate": 4.985971780570878e-07, "loss": -0.4652, "num_tokens": 2413348.0, "reward": 7.450580596923828e-09, "reward_std": 0.35007143020629883, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 103 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 524.625, "completions/mean_terminated_length": 491.3333740234375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.32098765432098764, "grad_norm": 9.424241070510574, "kl": NaN, "learning_rate": 4.985437939127687e-07, "loss": -0.3748, "num_tokens": 2437404.0, "reward": -5.122274160385132e-09, "reward_std": 0.1748603880405426, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 104 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 625.5, "completions/mean_terminated_length": 533.5384521484375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.32407407407407407, "grad_norm": 6.171494848229457, "kl": NaN, "learning_rate": 4.984894158513696e-07, "loss": 0.0388, "num_tokens": 2464428.0, "reward": 2.7939677238464355e-09, "reward_std": 0.24299165606498718, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 105 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 495.9375, "completions/mean_terminated_length": 460.7333679199219, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.3271604938271605, "grad_norm": 11.006528150853264, "kl": NaN, "learning_rate": 4.984340440903456e-07, "loss": 0.0792, "num_tokens": 2486418.0, "reward": 3.725290298461914e-09, "reward_std": 0.34472817182540894, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 106 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 529.3125, "completions/mean_terminated_length": 478.137939453125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.33024691358024694, "grad_norm": 8.69266245923211, "kl": NaN, "learning_rate": 4.983776788511268e-07, "loss": 0.4456, "num_tokens": 2509496.0, "reward": -3.725290298461914e-09, "reward_std": 0.18249543011188507, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 107 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 531.8125, "completions/mean_terminated_length": 515.9354858398438, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3333333333333333, "grad_norm": 6.409955823006029, "kl": NaN, "learning_rate": 4.983203203591154e-07, "loss": 0.3638, "num_tokens": 2532630.0, "reward": 7.450580596923828e-09, "reward_std": 0.15304797887802124, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 108 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 566.53125, "completions/mean_terminated_length": 551.774169921875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.33641975308641975, "grad_norm": 7.052145533314692, "kl": NaN, "learning_rate": 4.982619688436859e-07, "loss": -0.1236, "num_tokens": 2557651.0, "reward": -3.725290298461914e-09, "reward_std": 0.2175247073173523, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 109 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 554.15625, "completions/mean_terminated_length": 554.15625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3395061728395062, "grad_norm": 5.6505366827006736, "kl": NaN, "learning_rate": 4.982026245381837e-07, "loss": 0.1873, "num_tokens": 2581932.0, "reward": -3.725290298461914e-09, "reward_std": 0.12099941819906235, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 110 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 520.875, "completions/mean_terminated_length": 468.82757568359375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3425925925925926, "grad_norm": 4.990668417433553, "kl": NaN, "learning_rate": 4.981422876799244e-07, "loss": -0.125, "num_tokens": 2605524.0, "reward": 7.450580596923828e-09, "reward_std": 0.36357975006103516, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 111 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 447.4375, "completions/mean_terminated_length": 447.4375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.345679012345679, "grad_norm": 8.097445924579832, "kl": NaN, "learning_rate": 4.980809585101927e-07, "loss": 0.1437, "num_tokens": 2626234.0, "reward": 0.0, "reward_std": 0.24270951747894287, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909375190735, "step": 112 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 607.0, "completions/mean_terminated_length": 510.7692565917969, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3487654320987654, "grad_norm": 9.486949821902869, "kl": NaN, "learning_rate": 4.980186372742417e-07, "loss": -0.4536, "num_tokens": 2652698.0, "reward": 0.0, "reward_std": 0.2161448448896408, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4751909375190735, "step": 113 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 560.90625, "completions/mean_terminated_length": 530.0333862304688, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.35185185185185186, "grad_norm": 8.181338165034866, "kl": NaN, "learning_rate": 4.979553242212917e-07, "loss": 0.1245, "num_tokens": 2677363.0, "reward": 0.0, "reward_std": 0.27589231729507446, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 114 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 602.90625, "completions/mean_terminated_length": 485.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.3549382716049383, "grad_norm": 7.060827852615143, "kl": NaN, "learning_rate": 4.978910196045291e-07, "loss": -0.1508, "num_tokens": 2702860.0, "reward": 6.51925802230835e-09, "reward_std": 0.26213163137435913, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 115 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 523.59375, "completions/mean_terminated_length": 471.82757568359375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.35802469135802467, "grad_norm": 5.58331507303849, "kl": NaN, "learning_rate": 4.978257236811055e-07, "loss": -0.0721, "num_tokens": 2725663.0, "reward": -7.450580596923828e-09, "reward_std": 0.13394224643707275, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 116 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 580.0, "completions/mean_terminated_length": 534.0689697265625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3611111111111111, "grad_norm": 5.442727785309853, "kl": NaN, "learning_rate": 4.977594367121369e-07, "loss": -0.0725, "num_tokens": 2750763.0, "reward": -3.725290298461914e-09, "reward_std": 0.20845584571361542, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 117 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 608.40625, "completions/mean_terminated_length": 492.03997802734375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.36419753086419754, "grad_norm": 7.343243332926292, "kl": NaN, "learning_rate": 4.976921589627021e-07, "loss": -0.0781, "num_tokens": 2776880.0, "reward": -7.450580596923828e-09, "reward_std": 0.23553408682346344, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 118 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 542.5625, "completions/mean_terminated_length": 527.0322265625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.36728395061728397, "grad_norm": 5.909938660860529, "kl": NaN, "learning_rate": 4.976238907018427e-07, "loss": 0.3108, "num_tokens": 2800338.0, "reward": 0.0, "reward_std": 0.1802201271057129, "rewards/format_reward_func/mean": -2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096293926239, "step": 119 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 556.0, "completions/mean_terminated_length": 507.5862121582031, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.37037037037037035, "grad_norm": 7.084304579069176, "kl": NaN, "learning_rate": 4.975546322025605e-07, "loss": 0.1766, "num_tokens": 2824806.0, "reward": 0.0, "reward_std": 0.19519078731536865, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 120 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 534.125, "completions/mean_terminated_length": 518.3225708007812, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.3734567901234568, "grad_norm": 8.40579312165444, "kl": NaN, "learning_rate": 4.974843837418175e-07, "loss": -0.2458, "num_tokens": 2848610.0, "reward": -3.725290298461914e-09, "reward_std": 0.30575814843177795, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 121 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 602.5, "completions/mean_terminated_length": 542.2857666015625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3765432098765432, "grad_norm": 5.7832704011193865, "kl": NaN, "learning_rate": 4.974131456005349e-07, "loss": 0.2561, "num_tokens": 2874374.0, "reward": -1.4901161193847656e-08, "reward_std": 0.20695728063583374, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 122 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 674.90625, "completions/mean_terminated_length": 610.25927734375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.37962962962962965, "grad_norm": 5.532237037603806, "kl": NaN, "learning_rate": 4.973409180635911e-07, "loss": -0.1752, "num_tokens": 2903135.0, "reward": -3.725290298461914e-09, "reward_std": 0.18403783440589905, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 123 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 619.9375, "completions/mean_terminated_length": 526.6923217773438, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.38271604938271603, "grad_norm": 7.290631465314015, "kl": NaN, "learning_rate": 4.972677014198213e-07, "loss": -0.3747, "num_tokens": 2929477.0, "reward": -2.7939677238464355e-09, "reward_std": 0.19752418994903564, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 124 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 555.5, "completions/mean_terminated_length": 524.2667236328125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.38580246913580246, "grad_norm": 5.688696440304526, "kl": NaN, "learning_rate": 4.97193495962016e-07, "loss": 0.1632, "num_tokens": 2953441.0, "reward": -7.450580596923828e-09, "reward_std": 0.20944499969482422, "rewards/format_reward_func/mean": -3.725290298461914e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 125 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 595.34375, "completions/mean_terminated_length": 534.107177734375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.3888888888888889, "grad_norm": 4.705895167184754, "kl": NaN, "learning_rate": 4.971183019869201e-07, "loss": -0.1042, "num_tokens": 2978668.0, "reward": -4.656612873077393e-10, "reward_std": 0.17375023663043976, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 126 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 598.125, "completions/mean_terminated_length": 537.2857666015625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.39197530864197533, "grad_norm": 7.694546935881251, "kl": NaN, "learning_rate": 4.970421197952311e-07, "loss": 0.0792, "num_tokens": 3004416.0, "reward": 0.0, "reward_std": 0.1776627004146576, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 127 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 544.46875, "completions/mean_terminated_length": 544.46875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3950617283950617, "grad_norm": 6.548991620439674, "kl": NaN, "learning_rate": 4.969649496915991e-07, "loss": -0.2703, "num_tokens": 3028467.0, "reward": -3.725290298461914e-09, "reward_std": 0.3327018618583679, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5679618120193481, "step": 128 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 559.71875, "completions/mean_terminated_length": 544.741943359375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.39814814814814814, "grad_norm": 8.554204995566714, "kl": NaN, "learning_rate": 4.96886791984624e-07, "loss": -0.0874, "num_tokens": 3053234.0, "reward": 0.0, "reward_std": 0.2549504041671753, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 129 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 570.6875, "completions/mean_terminated_length": 540.4666748046875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.4012345679012346, "grad_norm": 8.127682526083769, "kl": NaN, "learning_rate": 4.968076469868558e-07, "loss": 0.3081, "num_tokens": 3078396.0, "reward": -3.725290298461914e-09, "reward_std": 0.20096953213214874, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 130 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 489.375, "completions/mean_terminated_length": 472.1290283203125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.404320987654321, "grad_norm": 6.821984180324421, "kl": NaN, "learning_rate": 4.967275150147921e-07, "loss": -0.3931, "num_tokens": 3100128.0, "reward": -1.862645149230957e-09, "reward_std": 0.1411939263343811, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 131 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 613.0, "completions/mean_terminated_length": 570.4827270507812, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.4074074074074074, "grad_norm": 6.367082404169811, "kl": NaN, "learning_rate": 4.966463963888775e-07, "loss": 0.2158, "num_tokens": 3125992.0, "reward": 0.0, "reward_std": 0.23708997666835785, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -7.450580596923828e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 132 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 481.8125, "completions/mean_terminated_length": 464.32257080078125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4104938271604938, "grad_norm": 8.869594271037249, "kl": NaN, "learning_rate": 4.965642914335025e-07, "loss": -0.0809, "num_tokens": 3147466.0, "reward": 0.0, "reward_std": 0.26418328285217285, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 133 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 600.34375, "completions/mean_terminated_length": 556.5172119140625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.41358024691358025, "grad_norm": 5.855795974935996, "kl": NaN, "learning_rate": 4.964812004770013e-07, "loss": -0.0634, "num_tokens": 3172973.0, "reward": 3.725290298461914e-09, "reward_std": 0.26164084672927856, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909375190735, "step": 134 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 659.1875, "completions/mean_terminated_length": 537.5833740234375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4166666666666667, "grad_norm": 5.513208877965179, "kl": NaN, "learning_rate": 4.963971238516519e-07, "loss": -0.0041, "num_tokens": 3200571.0, "reward": -1.862645149230957e-09, "reward_std": 0.18096524477005005, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 135 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 640.4375, "completions/mean_terminated_length": 533.0399780273438, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.41975308641975306, "grad_norm": 8.118947700231773, "kl": NaN, "learning_rate": 4.963120618936732e-07, "loss": 0.113, "num_tokens": 3227781.0, "reward": 0.0, "reward_std": 0.1903054118156433, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 136 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 616.59375, "completions/mean_terminated_length": 589.433349609375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.4228395061728395, "grad_norm": 7.065393529232521, "kl": NaN, "learning_rate": 4.962260149432247e-07, "loss": 0.0773, "num_tokens": 3254508.0, "reward": 0.0, "reward_std": 0.2295312136411667, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096889972687, "step": 137 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 583.96875, "completions/mean_terminated_length": 538.4483032226562, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.42592592592592593, "grad_norm": 4.457811645650375, "kl": NaN, "learning_rate": 4.96138983344405e-07, "loss": -0.2813, "num_tokens": 3279387.0, "reward": -3.725290298461914e-09, "reward_std": 0.18175974488258362, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 138 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 556.15625, "completions/mean_terminated_length": 489.3214416503906, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.42901234567901236, "grad_norm": 6.333275510929497, "kl": NaN, "learning_rate": 4.9605096744525e-07, "loss": -0.2476, "num_tokens": 3303340.0, "reward": -5.122274160385132e-09, "reward_std": 0.1977911740541458, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 139 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 531.65625, "completions/mean_terminated_length": 515.774169921875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.43209876543209874, "grad_norm": 5.0422237424646745, "kl": NaN, "learning_rate": 4.95961967597732e-07, "loss": -0.1874, "num_tokens": 3326357.0, "reward": -5.587935447692871e-09, "reward_std": 0.15961746871471405, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 140 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 600.25, "completions/mean_terminated_length": 521.7777709960938, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.4351851851851852, "grad_norm": 6.722046369710051, "kl": NaN, "learning_rate": 4.958719841577579e-07, "loss": -0.1097, "num_tokens": 3351869.0, "reward": 0.0, "reward_std": 0.24473561346530914, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 141 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 604.03125, "completions/mean_terminated_length": 544.0357666015625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4382716049382716, "grad_norm": 8.091365730794006, "kl": NaN, "learning_rate": 4.957810174851679e-07, "loss": -0.1557, "num_tokens": 3377718.0, "reward": -5.587935447692871e-09, "reward_std": 0.2584119439125061, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 142 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 593.96875, "completions/mean_terminated_length": 580.0967407226562, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.44135802469135804, "grad_norm": 5.928453859642679, "kl": NaN, "learning_rate": 4.956890679437345e-07, "loss": -0.1874, "num_tokens": 3403101.0, "reward": 0.0, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 143 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 611.71875, "completions/mean_terminated_length": 552.8214721679688, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.4444444444444444, "grad_norm": 4.435751443104133, "kl": NaN, "learning_rate": 4.955961359011601e-07, "loss": 0.3901, "num_tokens": 3428984.0, "reward": -3.725290298461914e-09, "reward_std": 0.2163362205028534, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 144 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 639.15625, "completions/mean_terminated_length": 567.888916015625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.44753086419753085, "grad_norm": 4.566569677013355, "kl": NaN, "learning_rate": 4.955022217290766e-07, "loss": -0.0864, "num_tokens": 3455297.0, "reward": 0.0, "reward_std": 0.1600709855556488, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 145 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 681.9375, "completions/mean_terminated_length": 586.1599731445312, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4506172839506173, "grad_norm": 4.42410325272051, "kl": NaN, "learning_rate": 4.954073258030431e-07, "loss": -0.408, "num_tokens": 3483491.0, "reward": -3.725290298461914e-09, "reward_std": 0.11825646460056305, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 146 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 729.96875, "completions/mean_terminated_length": 596.3181762695312, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4537037037037037, "grad_norm": 7.009213791039503, "kl": NaN, "learning_rate": 4.953114485025446e-07, "loss": -0.116, "num_tokens": 3513938.0, "reward": -3.725290298461914e-09, "reward_std": 0.1721172034740448, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 147 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 763.8125, "completions/mean_terminated_length": 662.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4567901234567901, "grad_norm": 3.751126006916233, "kl": NaN, "learning_rate": 4.95214590210991e-07, "loss": 0.0975, "num_tokens": 3545116.0, "reward": 0.0, "reward_std": 0.1716199815273285, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 148 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 620.1875, "completions/mean_terminated_length": 607.1612548828125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.45987654320987653, "grad_norm": 5.421698468082972, "kl": NaN, "learning_rate": 4.951167513157147e-07, "loss": -0.5259, "num_tokens": 3571158.0, "reward": -3.725290298461914e-09, "reward_std": 0.1502964347600937, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 149 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 604.59375, "completions/mean_terminated_length": 526.9259033203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.46296296296296297, "grad_norm": 2.3897268658356694, "kl": NaN, "learning_rate": 4.950179322079697e-07, "loss": -0.0652, "num_tokens": 3597305.0, "reward": 0.0, "reward_std": 0.15569782257080078, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 150 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 764.625, "completions/mean_terminated_length": 609.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.4660493827160494, "grad_norm": 2.1609092045248084, "kl": NaN, "learning_rate": 4.949181332829299e-07, "loss": -0.0077, "num_tokens": 3629017.0, "reward": 4.656612873077393e-10, "reward_std": 0.15826013684272766, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 151 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 649.0625, "completions/mean_terminated_length": 544.0800170898438, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.4691358024691358, "grad_norm": 4.2898623150959425, "kl": NaN, "learning_rate": 4.948173549396873e-07, "loss": 0.0112, "num_tokens": 3656203.0, "reward": -5.587935447692871e-09, "reward_std": 0.1922873556613922, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592105805873871, "step": 152 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 621.71875, "completions/mean_terminated_length": 547.2222290039062, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.4722222222222222, "grad_norm": 4.471696890343921, "kl": NaN, "learning_rate": 4.947155975812506e-07, "loss": -0.2802, "num_tokens": 3682434.0, "reward": -4.656612873077393e-10, "reward_std": 0.12737129628658295, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 153 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 770.875, "completions/mean_terminated_length": 655.8181762695312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.47530864197530864, "grad_norm": 2.908866923221855, "kl": NaN, "learning_rate": 4.946128616145436e-07, "loss": -0.015, "num_tokens": 3713682.0, "reward": 6.51925802230835e-09, "reward_std": 0.26219430565834045, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 154 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 695.84375, "completions/mean_terminated_length": 648.9642944335938, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.4783950617283951, "grad_norm": 5.082271214668638, "kl": NaN, "learning_rate": 4.945091474504037e-07, "loss": 0.0762, "num_tokens": 3742373.0, "reward": 3.725290298461914e-09, "reward_std": 0.35248899459838867, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -5.587935447692871e-09, "rewards/logprob_reward/std": 0.5679618120193481, "step": 155 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 713.6875, "completions/mean_terminated_length": 642.0769653320312, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.48148148148148145, "grad_norm": 5.098516528190889, "kl": NaN, "learning_rate": 4.944044555035793e-07, "loss": 0.0775, "num_tokens": 3771843.0, "reward": -5.587935447692871e-09, "reward_std": 0.24149447679519653, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.4399413466453552, "step": 156 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 636.03125, "completions/mean_terminated_length": 595.8965454101562, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4845679012345679, "grad_norm": 15.07093103042608, "kl": NaN, "learning_rate": 4.9429878619273e-07, "loss": -0.4165, "num_tokens": 3798908.0, "reward": 3.725290298461914e-09, "reward_std": 0.25131285190582275, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 157 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 594.3125, "completions/mean_terminated_length": 514.74072265625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.4876543209876543, "grad_norm": 5.683678798871177, "kl": NaN, "learning_rate": 4.941921399404232e-07, "loss": -0.6527, "num_tokens": 3824110.0, "reward": 2.3283064365386963e-09, "reward_std": 0.18798649311065674, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 158 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 735.59375, "completions/mean_terminated_length": 584.5238037109375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.49074074074074076, "grad_norm": 3.0657245224698473, "kl": NaN, "learning_rate": 4.940845171731329e-07, "loss": -0.1424, "num_tokens": 3854177.0, "reward": -7.450580596923828e-09, "reward_std": 0.19326482713222504, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 159 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 698.34375, "completions/mean_terminated_length": 651.8214721679688, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.49382716049382713, "grad_norm": 6.151965793074086, "kl": NaN, "learning_rate": 4.939759183212388e-07, "loss": -0.0262, "num_tokens": 3883380.0, "reward": 1.862645149230957e-09, "reward_std": 0.3883397579193115, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.5388159155845642, "step": 160 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 638.625, "completions/mean_terminated_length": 598.7586059570312, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.49691358024691357, "grad_norm": 2.3541404509512844, "kl": NaN, "learning_rate": 4.938663438190232e-07, "loss": -0.0192, "num_tokens": 3910280.0, "reward": 7.450580596923828e-09, "reward_std": 0.26380684971809387, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 161 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 661.8125, "completions/mean_terminated_length": 610.0714721679688, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5, "grad_norm": 4.8024894565277005, "kl": NaN, "learning_rate": 4.937557941046705e-07, "loss": -0.1263, "num_tokens": 3938054.0, "reward": 1.862645149230957e-09, "reward_std": 0.2903626561164856, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 162 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 634.71875, "completions/mean_terminated_length": 622.1612548828125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5030864197530864, "grad_norm": 5.909263865787711, "kl": NaN, "learning_rate": 4.936442696202648e-07, "loss": -0.4076, "num_tokens": 3965021.0, "reward": -7.450580596923828e-09, "reward_std": 0.20059458911418915, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 163 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 648.9375, "completions/mean_terminated_length": 543.9199829101562, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5061728395061729, "grad_norm": 5.1300141105548915, "kl": NaN, "learning_rate": 4.935317708117881e-07, "loss": -0.3341, "num_tokens": 3992259.0, "reward": 0.0, "reward_std": 0.26898545026779175, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 164 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 659.3125, "completions/mean_terminated_length": 607.2142944335938, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5092592592592593, "grad_norm": 5.913967314016694, "kl": NaN, "learning_rate": 4.934182981291187e-07, "loss": 0.4965, "num_tokens": 4020249.0, "reward": -9.313225746154785e-09, "reward_std": 0.16201332211494446, "rewards/format_reward_func/mean": -3.725290298461914e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.5029294192790985e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 165 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 730.6875, "completions/mean_terminated_length": 597.3636474609375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5123456790123457, "grad_norm": 5.898192428301864, "kl": NaN, "learning_rate": 4.933038520260299e-07, "loss": 0.0776, "num_tokens": 4050363.0, "reward": 7.450580596923828e-09, "reward_std": 0.317568302154541, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 693.90625, "completions/mean_terminated_length": 601.47998046875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.5154320987654321, "grad_norm": 7.045947141689735, "kl": 0.061279296875, "learning_rate": 4.931884329601869e-07, "loss": -0.6902, "num_tokens": 4078984.0, "reward": -1.4901161193847656e-08, "reward_std": 0.3645022511482239, "rewards/format_reward_func/mean": -2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5679618716239929, "step": 167 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 696.03125, "completions/mean_terminated_length": 604.2000122070312, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5185185185185185, "grad_norm": 4.299896977851904, "kl": NaN, "learning_rate": 4.930720413931463e-07, "loss": -0.2749, "num_tokens": 4107801.0, "reward": -7.450580596923828e-09, "reward_std": 0.1616957187652588, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 168 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 709.84375, "completions/mean_terminated_length": 567.0454711914062, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5216049382716049, "grad_norm": 7.697107711394947, "kl": NaN, "learning_rate": 4.929546777903534e-07, "loss": -0.026, "num_tokens": 4136728.0, "reward": 7.450580596923828e-09, "reward_std": 0.3467341661453247, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 169 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 653.03125, "completions/mean_terminated_length": 584.3333129882812, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5246913580246914, "grad_norm": 5.383314154215838, "kl": NaN, "learning_rate": 4.928363426211407e-07, "loss": 0.225, "num_tokens": 4163881.0, "reward": 1.862645149230957e-09, "reward_std": 0.19019806385040283, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 170 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 760.8125, "completions/mean_terminated_length": 673.0833740234375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5277777777777778, "grad_norm": 5.183491143280448, "kl": NaN, "learning_rate": 4.927170363587262e-07, "loss": -0.2029, "num_tokens": 4194395.0, "reward": -1.862645149230957e-09, "reward_std": 0.1886770874261856, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 171 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 658.5625, "completions/mean_terminated_length": 574.2307739257812, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5308641975308642, "grad_norm": 3.313019723031103, "kl": NaN, "learning_rate": 4.925967594802109e-07, "loss": -0.0884, "num_tokens": 4221761.0, "reward": -1.3969838619232178e-09, "reward_std": 0.20295457541942596, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 777.09375, "completions/mean_terminated_length": 585.0555419921875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.5339506172839507, "grad_norm": 4.536784003490762, "kl": 0.0555419921875, "learning_rate": 4.924755124665774e-07, "loss": -0.1002, "num_tokens": 4253128.0, "reward": 0.0, "reward_std": 0.20221075415611267, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 173 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 715.96875, "completions/mean_terminated_length": 644.8846435546875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.5370370370370371, "grad_norm": 4.803885137174439, "kl": NaN, "learning_rate": 4.923532958026878e-07, "loss": -0.1473, "num_tokens": 4282771.0, "reward": -4.656612873077393e-10, "reward_std": 0.26001739501953125, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 174 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 677.5, "completions/mean_terminated_length": 597.5385131835938, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5401234567901234, "grad_norm": 2.882320477201461, "kl": NaN, "learning_rate": 4.922301099772821e-07, "loss": -0.315, "num_tokens": 4311107.0, "reward": 0.0, "reward_std": 0.2995503544807434, "rewards/format_reward_func/mean": -2.60770320892334e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5388159155845642, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 735.71875, "completions/mean_terminated_length": 622.9130249023438, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.5432098765432098, "grad_norm": 3.759131324924152, "kl": 0.05584716796875, "learning_rate": 4.921059554829753e-07, "loss": -0.5024, "num_tokens": 4341462.0, "reward": -7.450580596923828e-09, "reward_std": 0.2578310966491699, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 176 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 560.6400146484375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5462962962962963, "grad_norm": 4.695136669827626, "kl": NaN, "learning_rate": 4.91980832816257e-07, "loss": -0.0648, "num_tokens": 4369162.0, "reward": -7.450580596923828e-09, "reward_std": 0.1863657832145691, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -7.450580596923828e-09, "rewards/logprob_reward/std": 0.5080004930496216, "step": 177 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 809.21875, "completions/mean_terminated_length": 662.26318359375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.5493827160493827, "grad_norm": 2.106420448094769, "kl": NaN, "learning_rate": 4.918547424774873e-07, "loss": 0.0886, "num_tokens": 4402133.0, "reward": -7.450580596923828e-09, "reward_std": 0.17791201174259186, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 178 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 752.34375, "completions/mean_terminated_length": 610.047607421875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5524691358024691, "grad_norm": 3.4961102804099884, "kl": NaN, "learning_rate": 4.917276849708972e-07, "loss": 0.1677, "num_tokens": 4432724.0, "reward": 1.862645149230957e-09, "reward_std": 0.2326449453830719, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 727.1875, "completions/mean_terminated_length": 644.0799560546875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.5555555555555556, "grad_norm": 4.2019127086886785, "kl": 0.061187744140625, "learning_rate": 4.915996608045842e-07, "loss": -0.1858, "num_tokens": 4462386.0, "reward": 7.450580596923828e-09, "reward_std": 0.3091117739677429, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 180 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 654.21875, "completions/mean_terminated_length": 568.8846435546875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.558641975308642, "grad_norm": 3.469894708774101, "kl": NaN, "learning_rate": 4.914706704905125e-07, "loss": -0.208, "num_tokens": 4489217.0, "reward": 1.862645149230957e-09, "reward_std": 0.19741347432136536, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 181 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 747.1875, "completions/mean_terminated_length": 654.9166870117188, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.5617283950617284, "grad_norm": 2.43344670166162, "kl": NaN, "learning_rate": 4.913407145445093e-07, "loss": -0.2519, "num_tokens": 4519575.0, "reward": 7.450580596923828e-09, "reward_std": 0.2075708508491516, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 182 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 768.5625, "completions/mean_terminated_length": 683.4166870117188, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5648148148148148, "grad_norm": 3.416727072334397, "kl": NaN, "learning_rate": 4.912097934862632e-07, "loss": -0.3776, "num_tokens": 4550805.0, "reward": 7.450580596923828e-09, "reward_std": 0.283363938331604, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 183 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 786.34375, "completions/mean_terminated_length": 678.3181762695312, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5679012345679012, "grad_norm": 2.449164100227983, "kl": NaN, "learning_rate": 4.910779078393228e-07, "loss": -0.1256, "num_tokens": 4582372.0, "reward": 0.0, "reward_std": 0.15146484971046448, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 184 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 737.15625, "completions/mean_terminated_length": 606.7727661132812, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5709876543209876, "grad_norm": 4.024706935369811, "kl": NaN, "learning_rate": 4.909450581310935e-07, "loss": -0.0544, "num_tokens": 4612405.0, "reward": 1.862645149230957e-09, "reward_std": 0.262442946434021, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 185 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 707.0, "completions/mean_terminated_length": 633.84619140625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.5740740740740741, "grad_norm": 2.6405622879977413, "kl": NaN, "learning_rate": 4.908112448928363e-07, "loss": 0.0276, "num_tokens": 4641465.0, "reward": -3.725290298461914e-09, "reward_std": 0.23802822828292847, "rewards/format_reward_func/mean": -3.725290298461914e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 186 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 734.375, "completions/mean_terminated_length": 602.727294921875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.5771604938271605, "grad_norm": 3.206208208105168, "kl": NaN, "learning_rate": 4.906764686596651e-07, "loss": -0.133, "num_tokens": 4670741.0, "reward": 3.725290298461914e-09, "reward_std": 0.20190681517124176, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 187 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 690.375, "completions/mean_terminated_length": 596.9599609375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5802469135802469, "grad_norm": 4.086080046056693, "kl": NaN, "learning_rate": 4.90540729970545e-07, "loss": 0.2032, "num_tokens": 4699437.0, "reward": 0.0, "reward_std": 0.1954423040151596, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 188 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 715.8125, "completions/mean_terminated_length": 644.6923217773438, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.5833333333333334, "grad_norm": 2.8338398303145413, "kl": NaN, "learning_rate": 4.904040293682897e-07, "loss": -0.1297, "num_tokens": 4728543.0, "reward": -9.313225746154785e-09, "reward_std": 0.20422792434692383, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 790.78125, "completions/mean_terminated_length": 650.8500366210938, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5864197530864198, "grad_norm": 2.8438341579660102, "kl": 0.066864013671875, "learning_rate": 4.902663673995597e-07, "loss": 0.1729, "num_tokens": 4760248.0, "reward": 7.450580596923828e-09, "reward_std": 0.23384305834770203, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 831.65625, "completions/mean_terminated_length": 700.0526123046875, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.5895061728395061, "grad_norm": 2.780619605697074, "kl": 0.0731201171875, "learning_rate": 4.9012774461486e-07, "loss": -0.089, "num_tokens": 4793725.0, "reward": 0.0, "reward_std": 0.28481680154800415, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 191 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 812.90625, "completions/mean_terminated_length": 702.3333740234375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5925925925925926, "grad_norm": 2.6545162579981834, "kl": NaN, "learning_rate": 4.899881615685376e-07, "loss": 0.0325, "num_tokens": 4826738.0, "reward": -1.4901161193847656e-08, "reward_std": 0.29407086968421936, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 192 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 806.0625, "completions/mean_terminated_length": 691.90478515625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.595679012345679, "grad_norm": 4.497736207531074, "kl": NaN, "learning_rate": 4.898476188187798e-07, "loss": -0.0457, "num_tokens": 4858916.0, "reward": -3.725290298461914e-09, "reward_std": 0.18742044270038605, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 193 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 722.96875, "completions/mean_terminated_length": 653.5, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.5987654320987654, "grad_norm": 2.304468478057239, "kl": NaN, "learning_rate": 4.897061169276118e-07, "loss": -0.2346, "num_tokens": 4888071.0, "reward": -8.381903171539307e-09, "reward_std": 0.14934472739696503, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 194 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 780.4375, "completions/mean_terminated_length": 652.857177734375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.6018518518518519, "grad_norm": 1.663862730374436, "kl": NaN, "learning_rate": 4.895636564608942e-07, "loss": -0.0296, "num_tokens": 4919649.0, "reward": -6.51925802230835e-09, "reward_std": 0.15778718888759613, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 733.4375, "completions/mean_terminated_length": 652.0799560546875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6049382716049383, "grad_norm": 6.552853168590596, "kl": 0.091400146484375, "learning_rate": 4.894202379883206e-07, "loss": 0.0893, "num_tokens": 4949807.0, "reward": -3.725290298461914e-09, "reward_std": 0.19026055932044983, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 196 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 744.9375, "completions/mean_terminated_length": 693.25927734375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.6080246913580247, "grad_norm": 2.0717863669209438, "kl": NaN, "learning_rate": 4.892758620834165e-07, "loss": -0.0606, "num_tokens": 4980221.0, "reward": -1.1175870895385742e-08, "reward_std": 0.2551858425140381, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909375190735, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 732.4375, "completions/mean_terminated_length": 650.7999877929688, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.6111111111111112, "grad_norm": 3.4708064378122376, "kl": 0.0716552734375, "learning_rate": 4.891305293235351e-07, "loss": -0.1915, "num_tokens": 5009911.0, "reward": -1.4901161193847656e-08, "reward_std": 0.3596891760826111, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 198 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 788.65625, "completions/mean_terminated_length": 665.3809814453125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.6141975308641975, "grad_norm": 1.6743537613630881, "kl": NaN, "learning_rate": 4.889842402898569e-07, "loss": -0.0209, "num_tokens": 5042052.0, "reward": 7.450580596923828e-09, "reward_std": 0.26783668994903564, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 199 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 857.5625, "completions/mean_terminated_length": 757.7000122070312, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6172839506172839, "grad_norm": 1.7681695511576603, "kl": NaN, "learning_rate": 4.888369955673858e-07, "loss": -0.0328, "num_tokens": 5076666.0, "reward": 5.587935447692871e-09, "reward_std": 0.25736480951309204, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 768.71875, "completions/mean_terminated_length": 652.6818237304688, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6203703703703703, "grad_norm": 2.423013369291697, "kl": 0.089080810546875, "learning_rate": 4.88688795744948e-07, "loss": -0.3225, "num_tokens": 5108017.0, "reward": 5.587935447692871e-09, "reward_std": 0.2015441507101059, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 753.125, "completions/mean_terminated_length": 647.1304321289062, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6234567901234568, "grad_norm": 3.878827640491604, "kl": 0.09527587890625, "learning_rate": 4.885396414151888e-07, "loss": 0.1059, "num_tokens": 5138557.0, "reward": 0.0, "reward_std": 0.26025640964508057, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 202 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 633.28125, "completions/mean_terminated_length": 577.4642944335938, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6265432098765432, "grad_norm": 5.296543435346333, "kl": NaN, "learning_rate": 4.883895331745707e-07, "loss": -0.4315, "num_tokens": 5164726.0, "reward": -1.1175870895385742e-08, "reward_std": 0.2554783821105957, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 739.15625, "completions/mean_terminated_length": 609.6818237304688, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.6296296296296297, "grad_norm": 4.665205117691723, "kl": 0.0711669921875, "learning_rate": 4.882384716233709e-07, "loss": -0.5091, "num_tokens": 5194635.0, "reward": -3.725290298461914e-09, "reward_std": 0.2635560929775238, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 204 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 767.21875, "completions/mean_terminated_length": 650.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6327160493827161, "grad_norm": 2.9408032043750487, "kl": NaN, "learning_rate": 4.880864573656785e-07, "loss": -0.2056, "num_tokens": 5226358.0, "reward": -7.450580596923828e-09, "reward_std": 0.29013127088546753, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 205 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 739.0625, "completions/mean_terminated_length": 609.5454711914062, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.6358024691358025, "grad_norm": 3.213343905069938, "kl": NaN, "learning_rate": 4.879334910093926e-07, "loss": -0.264, "num_tokens": 5256600.0, "reward": 0.0, "reward_std": 0.2007269412279129, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 814.625, "completions/mean_terminated_length": 605.25, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.6388888888888888, "grad_norm": 3.6603473739931105, "kl": 0.078094482421875, "learning_rate": 4.877795731662202e-07, "loss": -0.3169, "num_tokens": 5289044.0, "reward": -1.862645149230957e-09, "reward_std": 0.20887130498886108, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 207 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 810.5625, "completions/mean_terminated_length": 682.5, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.6419753086419753, "grad_norm": 3.005784406698742, "kl": NaN, "learning_rate": 4.876247044516724e-07, "loss": -0.1685, "num_tokens": 5321606.0, "reward": -1.862645149230957e-09, "reward_std": 0.17495723068714142, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 812.15625, "completions/mean_terminated_length": 701.1904907226562, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.6450617283950617, "grad_norm": 3.055435065655736, "kl": 0.0987548828125, "learning_rate": 4.874688854850635e-07, "loss": -0.2587, "num_tokens": 5354467.0, "reward": -5.587935447692871e-09, "reward_std": 0.2891131043434143, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 209 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 678.71875, "completions/mean_terminated_length": 563.625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6481481481481481, "grad_norm": 2.8793279918757797, "kl": NaN, "learning_rate": 4.873121168895075e-07, "loss": -0.1438, "num_tokens": 5382422.0, "reward": 1.4901161193847656e-08, "reward_std": 0.3393701910972595, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 757.0, "completions/mean_terminated_length": 668.0, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.6512345679012346, "grad_norm": 2.533179208177293, "kl": 0.080718994140625, "learning_rate": 4.87154399291916e-07, "loss": -0.192, "num_tokens": 5413010.0, "reward": 9.313225746154785e-10, "reward_std": 0.1891198754310608, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 211 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 829.34375, "completions/mean_terminated_length": 677.9444580078125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.654320987654321, "grad_norm": 2.978035054435234, "kl": NaN, "learning_rate": 4.869957333229955e-07, "loss": -0.2636, "num_tokens": 5445613.0, "reward": 1.862645149230957e-09, "reward_std": 0.23661711812019348, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096889972687, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 678.75, "completions/mean_terminated_length": 614.8148193359375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6574074074074074, "grad_norm": 3.2081156207665313, "kl": 0.109619140625, "learning_rate": 4.868361196172453e-07, "loss": 0.1124, "num_tokens": 5473581.0, "reward": 0.0, "reward_std": 0.2787058353424072, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 213 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 729.125, "completions/mean_terminated_length": 630.8333740234375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6604938271604939, "grad_norm": 2.21977273685207, "kl": NaN, "learning_rate": 4.866755588129542e-07, "loss": -0.1336, "num_tokens": 5503061.0, "reward": -4.6566128730773926e-09, "reward_std": 0.19088414311408997, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 214 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 812.8125, "completions/mean_terminated_length": 626.4705810546875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.6635802469135802, "grad_norm": 3.396468325787721, "kl": NaN, "learning_rate": 4.86514051552199e-07, "loss": -0.303, "num_tokens": 5535647.0, "reward": 0.0, "reward_std": 0.19001758098602295, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 787.09375, "completions/mean_terminated_length": 644.9500122070312, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6666666666666666, "grad_norm": 3.70603151927325, "kl": 0.0989990234375, "learning_rate": 4.863515984808408e-07, "loss": 0.0477, "num_tokens": 5567206.0, "reward": 7.450580596923828e-09, "reward_std": 0.33198004961013794, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909673213959, "step": 216 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 713.125, "completions/mean_terminated_length": 609.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6697530864197531, "grad_norm": 4.981690107695676, "kl": NaN, "learning_rate": 4.861882002485234e-07, "loss": -0.5205, "num_tokens": 5596386.0, "reward": -1.862645149230957e-09, "reward_std": 0.20860743522644043, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 741.5, "completions/mean_terminated_length": 593.5238037109375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6728395061728395, "grad_norm": 2.9858354697398206, "kl": 0.1287841796875, "learning_rate": 4.860238575086699e-07, "loss": -0.2922, "num_tokens": 5627022.0, "reward": 1.862645149230957e-09, "reward_std": 0.1880928874015808, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 218 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 738.4375, "completions/mean_terminated_length": 626.6956787109375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6759259259259259, "grad_norm": 2.2240921510399967, "kl": NaN, "learning_rate": 4.858585709184806e-07, "loss": -0.0813, "num_tokens": 5656968.0, "reward": 0.0, "reward_std": 0.2712445855140686, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 790.90625, "completions/mean_terminated_length": 631.4210815429688, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.6790123456790124, "grad_norm": 1.3628167364247696, "kl": 0.10546875, "learning_rate": 4.856923411389302e-07, "loss": -0.002, "num_tokens": 5688765.0, "reward": 0.0, "reward_std": 0.1343029886484146, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 777.34375, "completions/mean_terminated_length": 695.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6820987654320988, "grad_norm": 2.426558623160803, "kl": 0.10418701171875, "learning_rate": 4.855251688347653e-07, "loss": -0.1912, "num_tokens": 5720220.0, "reward": 0.0, "reward_std": 0.25110799074172974, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 221 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 855.0625, "completions/mean_terminated_length": 739.4736938476562, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.6851851851851852, "grad_norm": 2.133269936151635, "kl": NaN, "learning_rate": 4.853570546745014e-07, "loss": -0.0728, "num_tokens": 5754410.0, "reward": 9.313225746154785e-10, "reward_std": 0.21861866116523743, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.1641532182693481e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 783.96875, "completions/mean_terminated_length": 639.9500122070312, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6882716049382716, "grad_norm": 3.7839727813702058, "kl": 0.085906982421875, "learning_rate": 4.851879993304208e-07, "loss": 0.0654, "num_tokens": 5785749.0, "reward": 0.0, "reward_std": 0.23473702371120453, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 223 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 809.9375, "completions/mean_terminated_length": 681.5, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.691358024691358, "grad_norm": 1.826492925221081, "kl": NaN, "learning_rate": 4.850180034785691e-07, "loss": -0.1136, "num_tokens": 5818415.0, "reward": -4.656612873077393e-10, "reward_std": 0.07151594012975693, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 780.90625, "completions/mean_terminated_length": 635.0499877929688, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.6944444444444444, "grad_norm": 1.9202875738575542, "kl": NaN, "learning_rate": 4.848470677987532e-07, "loss": -0.113, "num_tokens": 5850032.0, "reward": 7.450580596923828e-09, "reward_std": 0.22210678458213806, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 225 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 805.53125, "completions/mean_terminated_length": 691.0952758789062, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6975308641975309, "grad_norm": 2.017837093796174, "kl": NaN, "learning_rate": 4.846751929745383e-07, "loss": -0.0029, "num_tokens": 5882289.0, "reward": 3.725290298461914e-09, "reward_std": 0.19541588425636292, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 728.71875, "completions/mean_terminated_length": 574.047607421875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7006172839506173, "grad_norm": 3.1385720720247985, "kl": 0.1187744140625, "learning_rate": 4.845023796932454e-07, "loss": -0.204, "num_tokens": 5911532.0, "reward": -9.313225746154785e-10, "reward_std": 0.22258394956588745, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 737.125, "completions/mean_terminated_length": 624.8695678710938, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.7037037037037037, "grad_norm": 2.0482393741551417, "kl": 0.0987548828125, "learning_rate": 4.84328628645948e-07, "loss": 0.0905, "num_tokens": 5941608.0, "reward": 9.313225746154785e-10, "reward_std": 0.1891579031944275, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.7939677238464355e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 228 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 788.0625, "completions/mean_terminated_length": 604.5555419921875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7067901234567902, "grad_norm": 3.4958822091772714, "kl": NaN, "learning_rate": 4.841539405274698e-07, "loss": -0.1844, "num_tokens": 5973686.0, "reward": -3.725290298461914e-09, "reward_std": 0.24194923043251038, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 775.46875, "completions/mean_terminated_length": 645.2857055664062, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7098765432098766, "grad_norm": 3.1229631584357502, "kl": 0.095977783203125, "learning_rate": 4.839783160363821e-07, "loss": -0.193, "num_tokens": 6005525.0, "reward": -3.725290298461914e-09, "reward_std": 0.2672354578971863, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 731.71875, "completions/mean_terminated_length": 617.3478393554688, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7129629629629629, "grad_norm": 2.2642799708066583, "kl": 0.105316162109375, "learning_rate": 4.838017558750004e-07, "loss": -0.293, "num_tokens": 6035812.0, "reward": 0.0, "reward_std": 0.173539400100708, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 231 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 795.53125, "completions/mean_terminated_length": 658.4500122070312, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7160493827160493, "grad_norm": 5.1671418250930055, "kl": NaN, "learning_rate": 4.836242607493819e-07, "loss": 0.0455, "num_tokens": 6068105.0, "reward": 7.450580596923828e-09, "reward_std": 0.34079915285110474, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4751909375190735, "step": 232 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 723.0625, "completions/mean_terminated_length": 622.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7191358024691358, "grad_norm": 2.0798496267906352, "kl": NaN, "learning_rate": 4.834458313693228e-07, "loss": 0.0239, "num_tokens": 6097931.0, "reward": 1.862645149230957e-09, "reward_std": 0.2358597218990326, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 696.5, "completions/mean_terminated_length": 604.7999877929688, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.7222222222222222, "grad_norm": 4.333848724052473, "kl": 0.11224365234375, "learning_rate": 4.832664684483555e-07, "loss": -0.1502, "num_tokens": 6126531.0, "reward": -9.313225746154785e-10, "reward_std": 0.24924881756305695, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592105805873871, "step": 234 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 783.25, "completions/mean_terminated_length": 570.8235473632812, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7253086419753086, "grad_norm": 1.5885116011981246, "kl": NaN, "learning_rate": 4.830861727037453e-07, "loss": 0.0281, "num_tokens": 6158191.0, "reward": -1.862645149230957e-09, "reward_std": 0.07657893747091293, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 836.6875, "completions/mean_terminated_length": 649.375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.7283950617283951, "grad_norm": 4.699831311502976, "kl": 0.123870849609375, "learning_rate": 4.82904944856488e-07, "loss": -0.4437, "num_tokens": 6191801.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 706.0625, "completions/mean_terminated_length": 617.0399780273438, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.7314814814814815, "grad_norm": 3.296723607037876, "kl": 0.12939453125, "learning_rate": 4.827227856313066e-07, "loss": -0.297, "num_tokens": 6220767.0, "reward": 1.862645149230957e-09, "reward_std": 0.27311262488365173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 237 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 746.34375, "completions/mean_terminated_length": 637.6956787109375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7345679012345679, "grad_norm": 3.3829561076219923, "kl": NaN, "learning_rate": 4.825396957566491e-07, "loss": 0.1273, "num_tokens": 6251782.0, "reward": -1.862645149230957e-09, "reward_std": 0.18932479619979858, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 649.71875, "completions/mean_terminated_length": 580.4074096679688, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7376543209876543, "grad_norm": 4.971394904707194, "kl": 0.12762451171875, "learning_rate": 4.823556759646847e-07, "loss": -0.1317, "num_tokens": 6278893.0, "reward": -7.450580596923828e-09, "reward_std": 0.2861737608909607, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 731.9375, "completions/mean_terminated_length": 617.6521606445312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7407407407407407, "grad_norm": 3.9598679602177236, "kl": 0.1295166015625, "learning_rate": 4.821707269913016e-07, "loss": -0.3395, "num_tokens": 6309431.0, "reward": 3.725290298461914e-09, "reward_std": 0.25012826919555664, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 677.46875, "completions/mean_terminated_length": 561.9583740234375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7438271604938271, "grad_norm": 3.493835067695898, "kl": 0.1341552734375, "learning_rate": 4.819848495761037e-07, "loss": -0.2769, "num_tokens": 6337758.0, "reward": 4.656612873077393e-10, "reward_std": 0.21241632103919983, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 241 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 688.375, "completions/mean_terminated_length": 576.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7469135802469136, "grad_norm": 4.054686799800771, "kl": NaN, "learning_rate": 4.817980444624076e-07, "loss": -0.2903, "num_tokens": 6366362.0, "reward": 1.862645149230957e-09, "reward_std": 0.15422162413597107, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 748.375, "completions/mean_terminated_length": 656.5, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.75, "grad_norm": 3.2645994838077947, "kl": 0.09912109375, "learning_rate": 4.816103123972395e-07, "loss": 0.1701, "num_tokens": 6396726.0, "reward": 6.51925802230835e-09, "reward_std": 0.3210039734840393, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 758.6875, "completions/mean_terminated_length": 654.8695678710938, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7530864197530864, "grad_norm": 3.4538214639853657, "kl": 0.13458251953125, "learning_rate": 4.814216541313329e-07, "loss": 0.0184, "num_tokens": 6427664.0, "reward": -4.6566128730773926e-09, "reward_std": 0.2180139422416687, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.0954757928848267e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 727.5, "completions/mean_terminated_length": 628.6666870117188, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7561728395061729, "grad_norm": 3.294021912055393, "kl": 0.12322998046875, "learning_rate": 4.812320704191252e-07, "loss": 0.1031, "num_tokens": 6457412.0, "reward": -1.862645149230957e-09, "reward_std": 0.14231424033641815, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 245 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 749.0625, "completions/mean_terminated_length": 624.0909423828125, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.7592592592592593, "grad_norm": 2.315221828899277, "kl": NaN, "learning_rate": 4.81041562018754e-07, "loss": -0.2133, "num_tokens": 6487486.0, "reward": -1.862645149230957e-09, "reward_std": 0.21590662002563477, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 246 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 736.6875, "completions/mean_terminated_length": 624.2608642578125, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7623456790123457, "grad_norm": 3.3242674540332513, "kl": NaN, "learning_rate": 4.808501296920552e-07, "loss": -0.1676, "num_tokens": 6517144.0, "reward": 0.0, "reward_std": 0.20270907878875732, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 717.75, "completions/mean_terminated_length": 647.0769653320312, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.7654320987654321, "grad_norm": 3.1084229164813078, "kl": 0.1446533203125, "learning_rate": 4.806577742045593e-07, "loss": -0.2875, "num_tokens": 6546780.0, "reward": -9.313225746154785e-10, "reward_std": 0.16301563382148743, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 248 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 782.34375, "completions/mean_terminated_length": 672.5, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.7685185185185185, "grad_norm": 2.2457243077069826, "kl": NaN, "learning_rate": 4.804644963254887e-07, "loss": -0.1534, "num_tokens": 6578251.0, "reward": 3.725290298461914e-09, "reward_std": 0.1873370110988617, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.3969838619232178e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 249 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 740.21875, "completions/mean_terminated_length": 629.1739501953125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7716049382716049, "grad_norm": 1.807821247523498, "kl": NaN, "learning_rate": 4.80270296827754e-07, "loss": -0.1799, "num_tokens": 6608766.0, "reward": 0.028124995529651642, "reward_std": 0.10673221200704575, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 725.59375, "completions/mean_terminated_length": 642.0399780273438, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7746913580246914, "grad_norm": 1.8298842724913364, "kl": 0.143798828125, "learning_rate": 4.800751764879516e-07, "loss": -0.1244, "num_tokens": 6638337.0, "reward": 0.0, "reward_std": 0.16211052238941193, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 740.84375, "completions/mean_terminated_length": 661.5599975585938, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7777777777777778, "grad_norm": 2.268148157743323, "kl": 0.1407470703125, "learning_rate": 4.798791360863602e-07, "loss": -0.1694, "num_tokens": 6668616.0, "reward": 3.725290298461914e-09, "reward_std": 0.21965095400810242, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 741.90625, "completions/mean_terminated_length": 613.6818237304688, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7808641975308642, "grad_norm": 3.3267154691316025, "kl": 0.172393798828125, "learning_rate": 4.796821764069378e-07, "loss": -0.2182, "num_tokens": 6698989.0, "reward": 0.02812499739229679, "reward_std": 0.12776592373847961, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 789.71875, "completions/mean_terminated_length": 698.0435180664062, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.7839506172839507, "grad_norm": 2.8463300517998205, "kl": 0.13848876953125, "learning_rate": 4.794842982373188e-07, "loss": -0.3921, "num_tokens": 6730656.0, "reward": 1.862645149230957e-09, "reward_std": 0.30081069469451904, "rewards/format_reward_func/mean": 3.725290298461914e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 774.1875, "completions/mean_terminated_length": 660.6364135742188, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.7870370370370371, "grad_norm": 2.6787479352006054, "kl": 0.15087890625, "learning_rate": 4.7928550236881e-07, "loss": -0.1829, "num_tokens": 6761922.0, "reward": -3.725290298461914e-09, "reward_std": 0.19329413771629333, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 692.96875, "completions/mean_terminated_length": 600.2799682617188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7901234567901234, "grad_norm": 2.9002315054338776, "kl": 0.13751220703125, "learning_rate": 4.790857895963888e-07, "loss": -0.4516, "num_tokens": 6789985.0, "reward": 9.313225746154785e-09, "reward_std": 0.2881210148334503, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 746.34375, "completions/mean_terminated_length": 600.90478515625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.7932098765432098, "grad_norm": 4.284324758344148, "kl": 0.15911865234375, "learning_rate": 4.788851607186988e-07, "loss": -0.4735, "num_tokens": 6820148.0, "reward": 0.0, "reward_std": 0.15029644966125488, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 736.0625, "completions/mean_terminated_length": 640.0833740234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7962962962962963, "grad_norm": 5.010981769482883, "kl": 0.14849853515625, "learning_rate": 4.786836165380472e-07, "loss": -0.6461, "num_tokens": 6849914.0, "reward": 0.02812499925494194, "reward_std": 0.14930030703544617, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 726.875, "completions/mean_terminated_length": 591.8181762695312, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7993827160493827, "grad_norm": 3.814852394046285, "kl": 0.13836669921875, "learning_rate": 4.784811578604013e-07, "loss": 0.3629, "num_tokens": 6879306.0, "reward": -1.862645149230957e-09, "reward_std": 0.22981852293014526, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 708.65625, "completions/mean_terminated_length": 565.3181762695312, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.8024691358024691, "grad_norm": 6.197667422340778, "kl": 0.172607421875, "learning_rate": 4.782777854953857e-07, "loss": -0.2984, "num_tokens": 6908743.0, "reward": -1.862645149230957e-09, "reward_std": 0.1861894428730011, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 737.84375, "completions/mean_terminated_length": 657.719970703125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8055555555555556, "grad_norm": 1.4501911673247054, "kl": 0.1632080078125, "learning_rate": 4.780735002562785e-07, "loss": -0.0344, "num_tokens": 6938870.0, "reward": 0.02812500298023224, "reward_std": 0.12218310683965683, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 711.5, "completions/mean_terminated_length": 589.2174072265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.808641975308642, "grad_norm": 2.4850563544987287, "kl": 0.1593017578125, "learning_rate": 4.778683029600089e-07, "loss": -0.1642, "num_tokens": 6968186.0, "reward": 0.0, "reward_std": 0.2453797608613968, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 262 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 707.1875, "completions/mean_terminated_length": 634.0769653320312, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8117283950617284, "grad_norm": 2.638204819765845, "kl": NaN, "learning_rate": 4.776621944271526e-07, "loss": -0.2079, "num_tokens": 6997596.0, "reward": -1.862645149230957e-09, "reward_std": 0.201801598072052, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 702.75, "completions/mean_terminated_length": 612.7999877929688, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8148148148148148, "grad_norm": 4.922844441620661, "kl": 0.138427734375, "learning_rate": 4.774551754819299e-07, "loss": -0.4543, "num_tokens": 7026360.0, "reward": -9.313225746154785e-10, "reward_std": 0.22252653539180756, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 753.1875, "completions/mean_terminated_length": 662.9166870117188, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8179012345679012, "grad_norm": 2.4659294983487468, "kl": 0.14898681640625, "learning_rate": 4.772472469522015e-07, "loss": -0.1614, "num_tokens": 7057062.0, "reward": 5.587935447692871e-09, "reward_std": 0.27855920791625977, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 788.0, "completions/mean_terminated_length": 664.3809814453125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.8209876543209876, "grad_norm": 3.1817469660601505, "kl": 0.14874267578125, "learning_rate": 4.770384096694658e-07, "loss": -0.0496, "num_tokens": 7088598.0, "reward": 5.587935447692871e-09, "reward_std": 0.27550405263900757, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.561137080192566e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 747.21875, "completions/mean_terminated_length": 683.34619140625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8240740740740741, "grad_norm": 2.648277388892835, "kl": 0.14556884765625, "learning_rate": 4.7682866446885475e-07, "loss": -0.387, "num_tokens": 7119097.0, "reward": -3.725290298461914e-09, "reward_std": 0.18597835302352905, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 812.28125, "completions/mean_terminated_length": 600.5625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8271604938271605, "grad_norm": 1.895103808274615, "kl": 0.16876220703125, "learning_rate": 4.766180121891316e-07, "loss": 0.0752, "num_tokens": 7151826.0, "reward": -3.725290298461914e-09, "reward_std": 0.15634265542030334, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 737.46875, "completions/mean_terminated_length": 657.239990234375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8302469135802469, "grad_norm": 2.4635694188564927, "kl": 0.1478271484375, "learning_rate": 4.7640645367268663e-07, "loss": -0.2284, "num_tokens": 7181905.0, "reward": -1.862645149230957e-09, "reward_std": 0.1880928874015808, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 810.8125, "completions/mean_terminated_length": 682.9000244140625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.8333333333333334, "grad_norm": 1.7543640852318774, "kl": 0.1400146484375, "learning_rate": 4.761939897655343e-07, "loss": -0.1302, "num_tokens": 7214247.0, "reward": 0.0, "reward_std": 0.18571428954601288, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 820.5625, "completions/mean_terminated_length": 662.3333129882812, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8364197530864198, "grad_norm": 1.9597368769750425, "kl": 0.1575927734375, "learning_rate": 4.7598062131730943e-07, "loss": -0.0789, "num_tokens": 7246569.0, "reward": 0.02812499925494194, "reward_std": 0.142043799161911, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 745.84375, "completions/mean_terminated_length": 619.4091186523438, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.8395061728395061, "grad_norm": 2.964069719910258, "kl": 0.16131591796875, "learning_rate": 4.757663491812644e-07, "loss": -0.4122, "num_tokens": 7276844.0, "reward": -1.862645149230957e-09, "reward_std": 0.1861894428730011, "rewards/format_reward_func/mean": -3.725290298461914e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 795.4375, "completions/mean_terminated_length": 658.2999877929688, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.8425925925925926, "grad_norm": 1.3766206164921555, "kl": 0.1619873046875, "learning_rate": 4.755511742142652e-07, "loss": -0.0572, "num_tokens": 7308802.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 737.1875, "completions/mean_terminated_length": 624.95654296875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.845679012345679, "grad_norm": 2.0849285872933416, "kl": 0.14141845703125, "learning_rate": 4.753350972767883e-07, "loss": 0.0073, "num_tokens": 7338484.0, "reward": -3.725290298461914e-09, "reward_std": 0.20247438549995422, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 819.84375, "completions/mean_terminated_length": 712.90478515625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.8487654320987654, "grad_norm": 1.632688373463398, "kl": 0.149658203125, "learning_rate": 4.75118119232917e-07, "loss": -0.1197, "num_tokens": 7371347.0, "reward": -1.862645149230957e-09, "reward_std": 0.16597789525985718, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 798.40625, "completions/mean_terminated_length": 695.8636474609375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8518518518518519, "grad_norm": 1.4641771387201727, "kl": 0.16363525390625, "learning_rate": 4.749002409503382e-07, "loss": -0.0562, "num_tokens": 7403812.0, "reward": 3.725290298461914e-09, "reward_std": 0.17815831303596497, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 831.71875, "completions/mean_terminated_length": 700.1578979492188, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.8549382716049383, "grad_norm": 1.6203607169742502, "kl": 0.15106201171875, "learning_rate": 4.7468146330033874e-07, "loss": -0.0896, "num_tokens": 7436911.0, "reward": 0.0, "reward_std": 0.20206104218959808, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 752.5, "completions/mean_terminated_length": 662.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8580246913580247, "grad_norm": 1.927049495922596, "kl": 0.1982421875, "learning_rate": 4.7446178715780213e-07, "loss": -0.0179, "num_tokens": 7467319.0, "reward": 0.0, "reward_std": 0.199102520942688, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 770.75, "completions/mean_terminated_length": 655.6364135742188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.8611111111111112, "grad_norm": 2.520690747868014, "kl": 0.17913818359375, "learning_rate": 4.742412134012047e-07, "loss": -0.1682, "num_tokens": 7498379.0, "reward": 0.02812499925494194, "reward_std": 0.11317629367113113, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 760.09375, "completions/mean_terminated_length": 672.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8641975308641975, "grad_norm": 1.3364082658906518, "kl": 0.1563720703125, "learning_rate": 4.740197429126125e-07, "loss": -0.1614, "num_tokens": 7529446.0, "reward": 0.02812499925494194, "reward_std": 0.12499846518039703, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 719.59375, "completions/mean_terminated_length": 600.478271484375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8672839506172839, "grad_norm": 1.6900766726674665, "kl": 0.1710205078125, "learning_rate": 4.7379737657767745e-07, "loss": -0.1112, "num_tokens": 7558605.0, "reward": 0.02812499925494194, "reward_std": 0.11206148564815521, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 787.21875, "completions/mean_terminated_length": 645.1500244140625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.8703703703703703, "grad_norm": 1.5534181757884256, "kl": 0.1671142578125, "learning_rate": 4.7357411528563393e-07, "loss": -0.0699, "num_tokens": 7590464.0, "reward": 0.02812499925494194, "reward_std": 0.11586824059486389, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 755.875, "completions/mean_terminated_length": 694.0, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8734567901234568, "grad_norm": 1.7407050533198927, "kl": 0.190673828125, "learning_rate": 4.733499599292955e-07, "loss": -0.1111, "num_tokens": 7620896.0, "reward": 0.02812500298023224, "reward_std": 0.13413286209106445, "rewards/format_reward_func/mean": 3.3527612686157227e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 782.875, "completions/mean_terminated_length": 727.2307739257812, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8765432098765432, "grad_norm": 2.3587940594191217, "kl": 0.202392578125, "learning_rate": 4.7312491140505064e-07, "loss": -0.1237, "num_tokens": 7652752.0, "reward": -3.725290298461914e-09, "reward_std": 0.23611551523208618, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 765.875, "completions/mean_terminated_length": 648.5454711914062, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.8796296296296297, "grad_norm": 1.4397587646268084, "kl": 0.1761474609375, "learning_rate": 4.7289897061285965e-07, "loss": -0.0864, "num_tokens": 7683580.0, "reward": -3.725290298461914e-09, "reward_std": 0.16567710041999817, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 755.8125, "completions/mean_terminated_length": 650.8695678710938, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8827160493827161, "grad_norm": 1.3844343604203304, "kl": 0.169921875, "learning_rate": 4.726721384562513e-07, "loss": -0.0201, "num_tokens": 7714302.0, "reward": 0.028124995529651642, "reward_std": 0.13184288144111633, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 804.28125, "completions/mean_terminated_length": 731.0416870117188, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8858024691358025, "grad_norm": 1.830878149514015, "kl": 0.202392578125, "learning_rate": 4.724444158423185e-07, "loss": -0.1293, "num_tokens": 7746879.0, "reward": 0.0, "reward_std": 0.17877720296382904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 861.96875, "completions/mean_terminated_length": 719.0, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.8888888888888888, "grad_norm": 1.1348754400802896, "kl": 0.1612548828125, "learning_rate": 4.722158036817154e-07, "loss": -0.0218, "num_tokens": 7780942.0, "reward": -1.862645149230957e-09, "reward_std": 0.07559289038181305, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 822.03125, "completions/mean_terminated_length": 683.8421020507812, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.8919753086419753, "grad_norm": 1.6500618392864028, "kl": 0.224365234375, "learning_rate": 4.7198630288865304e-07, "loss": -0.0926, "num_tokens": 7814191.0, "reward": 0.02812499925494194, "reward_std": 0.1277659386396408, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 764.90625, "completions/mean_terminated_length": 705.1154174804688, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8950617283950617, "grad_norm": 1.8972891537926457, "kl": 0.188232421875, "learning_rate": 4.7175591438089646e-07, "loss": -0.2144, "num_tokens": 7845476.0, "reward": 0.0, "reward_std": 0.07440169155597687, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 797.625, "completions/mean_terminated_length": 679.047607421875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.8981481481481481, "grad_norm": 1.7510276018240747, "kl": 0.20361328125, "learning_rate": 4.7152463907976024e-07, "loss": -0.0901, "num_tokens": 7878244.0, "reward": -3.259629011154175e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 750.5, "completions/mean_terminated_length": 643.478271484375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9012345679012346, "grad_norm": 1.9643840722039991, "kl": 0.2164306640625, "learning_rate": 4.7129247791010563e-07, "loss": -0.1134, "num_tokens": 7908864.0, "reward": -1.862645149230957e-09, "reward_std": 0.14452563226222992, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 786.625, "completions/mean_terminated_length": 707.5, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.904320987654321, "grad_norm": 1.7647614881243303, "kl": 0.187744140625, "learning_rate": 4.710594318003361e-07, "loss": -0.179, "num_tokens": 7940780.0, "reward": 3.725290298461914e-09, "reward_std": 0.1901835799217224, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 810.125, "completions/mean_terminated_length": 738.8333740234375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.9074074074074074, "grad_norm": 1.7120283526762177, "kl": 0.187744140625, "learning_rate": 4.7082550168239423e-07, "loss": -0.1556, "num_tokens": 7972860.0, "reward": 0.02812500298023224, "reward_std": 0.12490952014923096, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 673.75, "completions/mean_terminated_length": 623.7142944335938, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9104938271604939, "grad_norm": 2.6500365855705112, "kl": 0.1890869140625, "learning_rate": 4.705906884917573e-07, "loss": -0.2749, "num_tokens": 8000448.0, "reward": 0.02812500111758709, "reward_std": 0.12204517424106598, "rewards/format_reward_func/mean": 3.3527612686157227e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 792.46875, "completions/mean_terminated_length": 715.2916870117188, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.9135802469135802, "grad_norm": 2.1181226483949627, "kl": 0.2181396484375, "learning_rate": 4.703549931674345e-07, "loss": -0.0669, "num_tokens": 8032479.0, "reward": 0.0, "reward_std": 0.1861894428730011, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 683.3125, "completions/mean_terminated_length": 634.6428833007812, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9166666666666666, "grad_norm": 1.6382130671490476, "kl": 0.1922607421875, "learning_rate": 4.7011841665196227e-07, "loss": -0.1543, "num_tokens": 8060317.0, "reward": 1.862645149230957e-09, "reward_std": 0.2640279233455658, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 799.5, "completions/mean_terminated_length": 736.6399536132812, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.9197530864197531, "grad_norm": 1.7764343185319038, "kl": 0.1827392578125, "learning_rate": 4.6988095989140096e-07, "loss": -0.1012, "num_tokens": 8092253.0, "reward": 6.51925802230835e-09, "reward_std": 0.15763449668884277, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 750.625, "completions/mean_terminated_length": 687.5385131835938, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.9228395061728395, "grad_norm": 1.7056692070578798, "kl": 0.2401123046875, "learning_rate": 4.6964262383533114e-07, "loss": -0.0217, "num_tokens": 8122829.0, "reward": 3.725290298461914e-09, "reward_std": 0.22097495198249817, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 764.4375, "completions/mean_terminated_length": 704.5385131835938, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.9259259259259259, "grad_norm": 1.065816113120465, "kl": 0.1832275390625, "learning_rate": 4.694034094368495e-07, "loss": -0.0082, "num_tokens": 8153715.0, "reward": 0.02812500111758709, "reward_std": 0.11321557313203812, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 806.0, "completions/mean_terminated_length": 706.9091186523438, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9290123456790124, "grad_norm": 2.2304382121233277, "kl": 0.2069091796875, "learning_rate": 4.691633176525651e-07, "loss": -0.0853, "num_tokens": 8186447.0, "reward": -3.725290298461914e-09, "reward_std": 0.27420520782470703, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 841.59375, "completions/mean_terminated_length": 790.5199584960938, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.9320987654320988, "grad_norm": 1.7871214196940068, "kl": 0.189208984375, "learning_rate": 4.689223494425959e-07, "loss": -0.2181, "num_tokens": 8219918.0, "reward": 3.725290298461914e-09, "reward_std": 0.27329564094543457, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 694.5625, "completions/mean_terminated_length": 647.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9351851851851852, "grad_norm": 1.26431278276871, "kl": 0.2164306640625, "learning_rate": 4.686805057705645e-07, "loss": -0.0975, "num_tokens": 8248556.0, "reward": 0.028124993667006493, "reward_std": 0.12057674676179886, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 802.3125, "completions/mean_terminated_length": 715.5652465820312, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.9382716049382716, "grad_norm": 1.6362305459893027, "kl": 0.189697265625, "learning_rate": 4.684377876035944e-07, "loss": -0.1168, "num_tokens": 8280638.0, "reward": 0.0, "reward_std": 0.08027060329914093, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 806.15625, "completions/mean_terminated_length": 692.047607421875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.941358024691358, "grad_norm": 1.361930129716925, "kl": 0.18896484375, "learning_rate": 4.681941959123063e-07, "loss": 0.0299, "num_tokens": 8312699.0, "reward": 0.028124995529651642, "reward_std": 0.10160572826862335, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 808.15625, "completions/mean_terminated_length": 747.719970703125, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.9444444444444444, "grad_norm": 1.4194116880842143, "kl": 0.1986083984375, "learning_rate": 4.6794973167081397e-07, "loss": -0.1375, "num_tokens": 8345244.0, "reward": -1.862645149230957e-09, "reward_std": 0.18914230167865753, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 767.5625, "completions/mean_terminated_length": 730.9285888671875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.9475308641975309, "grad_norm": 1.3125333171183138, "kl": 0.208251953125, "learning_rate": 4.6770439585672046e-07, "loss": -0.0046, "num_tokens": 8375882.0, "reward": 0.028124993667006493, "reward_std": 0.10480768978595734, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 769.25, "completions/mean_terminated_length": 697.9199829101562, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.9506172839506173, "grad_norm": 1.2496521635499362, "kl": 0.2099609375, "learning_rate": 4.6745818945111426e-07, "loss": -0.0655, "num_tokens": 8407654.0, "reward": 0.02812499925494194, "reward_std": 0.10788977146148682, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 681.375, "completions/mean_terminated_length": 632.4285888671875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.9537037037037037, "grad_norm": 3.192894771680404, "kl": 0.2269287109375, "learning_rate": 4.6721111343856547e-07, "loss": -0.2669, "num_tokens": 8435982.0, "reward": 0.02812499739229679, "reward_std": 0.13851019740104675, "rewards/format_reward_func/mean": 2.60770320892334e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 799.65625, "completions/mean_terminated_length": 697.6818237304688, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.9567901234567902, "grad_norm": 1.7182349275263777, "kl": 0.2216796875, "learning_rate": 4.669631688071214e-07, "loss": -0.1529, "num_tokens": 8468247.0, "reward": 0.0, "reward_std": 0.09106835722923279, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 780.9375, "completions/mean_terminated_length": 712.8800048828125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.9598765432098766, "grad_norm": 2.0344590490945773, "kl": 0.2119140625, "learning_rate": 4.667143565483032e-07, "loss": -0.1942, "num_tokens": 8499581.0, "reward": -3.725290298461914e-09, "reward_std": 0.10064592957496643, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 850.5625, "completions/mean_terminated_length": 771.727294921875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.9629629629629629, "grad_norm": 1.387639138328901, "kl": 0.205810546875, "learning_rate": 4.664646776571015e-07, "loss": -0.0767, "num_tokens": 8532843.0, "reward": 0.02812499925494194, "reward_std": 0.11077355593442917, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 778.53125, "completions/mean_terminated_length": 682.478271484375, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.9660493827160493, "grad_norm": 1.453385688103117, "kl": 0.1954345703125, "learning_rate": 4.662141331319726e-07, "loss": -0.0833, "num_tokens": 8563888.0, "reward": 3.725290298461914e-09, "reward_std": 0.18955133855342865, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 805.28125, "completions/mean_terminated_length": 744.0399780273438, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9691358024691358, "grad_norm": 1.5846198018953086, "kl": 0.237060546875, "learning_rate": 4.6596272397483445e-07, "loss": -0.0878, "num_tokens": 8596345.0, "reward": 0.02812499925494194, "reward_std": 0.14731834828853607, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 802.65625, "completions/mean_terminated_length": 728.875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9722222222222222, "grad_norm": 1.4165274726711845, "kl": 0.18896484375, "learning_rate": 4.657104511910626e-07, "loss": -0.1436, "num_tokens": 8628278.0, "reward": 4.656612873077393e-10, "reward_std": 0.15477585792541504, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 739.9375, "completions/mean_terminated_length": 674.3846435546875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9753086419753086, "grad_norm": 1.4267112007514666, "kl": 0.207275390625, "learning_rate": 4.654573157894861e-07, "loss": -0.1971, "num_tokens": 8658600.0, "reward": -3.725290298461914e-09, "reward_std": 0.07367793470621109, "rewards/format_reward_func/mean": -2.60770320892334e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 835.5625, "completions/mean_terminated_length": 761.8261108398438, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.9783950617283951, "grad_norm": 1.2429530468159484, "kl": 0.207275390625, "learning_rate": 4.652033187823838e-07, "loss": 0.0084, "num_tokens": 8692498.0, "reward": 0.028124995529651642, "reward_std": 0.1097278892993927, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 810.1875, "completions/mean_terminated_length": 713.0, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.9814814814814815, "grad_norm": 1.3244109968980724, "kl": 0.1849365234375, "learning_rate": 4.6494846118548e-07, "loss": -0.0008, "num_tokens": 8725076.0, "reward": 0.0, "reward_std": 0.18432721495628357, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 847.21875, "completions/mean_terminated_length": 788.2916870117188, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.9845679012345679, "grad_norm": 1.214473210632424, "kl": 0.2095947265625, "learning_rate": 4.6469274401794044e-07, "loss": -0.0535, "num_tokens": 8758731.0, "reward": 0.028124995529651642, "reward_std": 0.11348775029182434, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 828.25, "completions/mean_terminated_length": 751.6521606445312, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.9876543209876543, "grad_norm": 1.8287834018417068, "kl": 0.2266845703125, "learning_rate": 4.6443616830236823e-07, "loss": -0.0622, "num_tokens": 8791855.0, "reward": 3.725290298461914e-09, "reward_std": 0.16360794007778168, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 825.6875, "completions/mean_terminated_length": 759.5833740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.9907407407407407, "grad_norm": 1.2917476118963442, "kl": 0.2122802734375, "learning_rate": 4.641787350647997e-07, "loss": -0.018, "num_tokens": 8824525.0, "reward": 0.028124995529651642, "reward_std": 0.12261858582496643, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 720.71875, "completions/mean_terminated_length": 677.3928833007812, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9938271604938271, "grad_norm": 2.315236983754253, "kl": 0.2188720703125, "learning_rate": 4.6392044533470053e-07, "loss": -0.0938, "num_tokens": 8854212.0, "reward": 0.0, "reward_std": 0.18609295785427094, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 793.46875, "completions/mean_terminated_length": 760.5357666015625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.9969135802469136, "grad_norm": 1.4607945026342088, "kl": 0.2222900390625, "learning_rate": 4.636613001449615e-07, "loss": 0.0201, "num_tokens": 8885343.0, "reward": 0.02812500298023224, "reward_std": 0.12921050190925598, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 783.78125, "completions/mean_terminated_length": 739.2963256835938, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 1.0, "grad_norm": 1.2742956031622807, "kl": 0.1968994140625, "learning_rate": 4.6340130053189417e-07, "loss": -0.0566, "num_tokens": 8916640.0, "reward": 0.028124995529651642, "reward_std": 0.09852586686611176, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 766.53125, "completions/mean_terminated_length": 718.8518676757812, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.0030864197530864, "grad_norm": 1.3386985658527497, "kl": 0.2254638671875, "learning_rate": 4.6314044753522703e-07, "loss": -0.0188, "num_tokens": 8947637.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 823.65625, "completions/mean_terminated_length": 745.2608642578125, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.0061728395061729, "grad_norm": 1.045012885803152, "kl": 0.225341796875, "learning_rate": 4.6287874219810117e-07, "loss": -0.0212, "num_tokens": 8980434.0, "reward": -4.656612873077393e-10, "reward_std": 0.05696558207273483, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 760.21875, "completions/mean_terminated_length": 672.2916870117188, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 1.0092592592592593, "grad_norm": 2.070829051629741, "kl": 0.200439453125, "learning_rate": 4.626161855670663e-07, "loss": -0.1193, "num_tokens": 9011305.0, "reward": 2.7939677238464355e-09, "reward_std": 0.18401594460010529, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 705.0435180664062, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 1.0123456790123457, "grad_norm": 1.391068659844964, "kl": 0.260986328125, "learning_rate": 4.623527786920761e-07, "loss": -0.0676, "num_tokens": 9043449.0, "reward": -3.725290298461914e-09, "reward_std": 0.06405126303434372, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 751.03125, "completions/mean_terminated_length": 700.4815063476562, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 1.0154320987654322, "grad_norm": 1.9555365607649131, "kl": 0.24267578125, "learning_rate": 4.620885226264847e-07, "loss": -0.1742, "num_tokens": 9074510.0, "reward": -4.656612873077393e-10, "reward_std": 0.07151594012975693, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 757.53125, "completions/mean_terminated_length": 708.1851806640625, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 1.0185185185185186, "grad_norm": 1.2916364207005597, "kl": 0.225341796875, "learning_rate": 4.6182341842704177e-07, "loss": -0.0527, "num_tokens": 9105459.0, "reward": 0.02812499925494194, "reward_std": 0.11321558058261871, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 800.5, "completions/mean_terminated_length": 713.0435180664062, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.021604938271605, "grad_norm": 1.8860469273746234, "kl": 0.24658203125, "learning_rate": 4.6155746715388903e-07, "loss": -0.1312, "num_tokens": 9137911.0, "reward": -2.7939677238464355e-09, "reward_std": 0.09607689082622528, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 797.59375, "completions/mean_terminated_length": 694.6818237304688, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 1.0246913580246915, "grad_norm": 0.8196547752979229, "kl": 0.232421875, "learning_rate": 4.6129066987055533e-07, "loss": 0.0111, "num_tokens": 9170582.0, "reward": 0.028124995529651642, "reward_std": 0.07786067575216293, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 800.0, "completions/mean_terminated_length": 698.1818237304688, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 1.0277777777777777, "grad_norm": 1.6017538171568595, "kl": 0.2315673828125, "learning_rate": 4.610230276439526e-07, "loss": -0.134, "num_tokens": 9202658.0, "reward": -2.7939677238464355e-09, "reward_std": 0.06900564581155777, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 753.15625, "completions/mean_terminated_length": 714.4642944335938, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.0308641975308641, "grad_norm": 1.3638967708903083, "kl": 0.2313232421875, "learning_rate": 4.607545415443721e-07, "loss": -0.1003, "num_tokens": 9232935.0, "reward": 0.02812499925494194, "reward_std": 0.12381581962108612, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 791.9375, "completions/mean_terminated_length": 738.3846435546875, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 1.0339506172839505, "grad_norm": 1.3389285558071031, "kl": 0.2313232421875, "learning_rate": 4.604852126454792e-07, "loss": 0.0027, "num_tokens": 9264865.0, "reward": 0.02812499925494194, "reward_std": 0.13184289634227753, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 782.6875, "completions/mean_terminated_length": 738.0, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 1.037037037037037, "grad_norm": 1.0859184709201632, "kl": 0.2322998046875, "learning_rate": 4.6021504202430983e-07, "loss": -0.0282, "num_tokens": 9296891.0, "reward": -1.862645149230957e-09, "reward_std": 0.059618230909109116, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 761.0, "completions/mean_terminated_length": 673.3333740234375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.0401234567901234, "grad_norm": 1.5540858164555094, "kl": 0.2288818359375, "learning_rate": 4.599440307612661e-07, "loss": 0.0036, "num_tokens": 9327651.0, "reward": 0.028124995529651642, "reward_std": 0.1387912631034851, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 826.90625, "completions/mean_terminated_length": 771.719970703125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.0432098765432098, "grad_norm": 1.0394281470351463, "kl": 0.2412109375, "learning_rate": 4.5967217994011144e-07, "loss": 0.0169, "num_tokens": 9360664.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 779.09375, "completions/mean_terminated_length": 683.2608642578125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 1.0462962962962963, "grad_norm": 1.0995136848421587, "kl": 0.221923828125, "learning_rate": 4.593994906479669e-07, "loss": -0.0187, "num_tokens": 9392319.0, "reward": 0.0, "reward_std": 0.14918872714042664, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 806.75, "completions/mean_terminated_length": 692.952392578125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 1.0493827160493827, "grad_norm": 1.5452541460119205, "kl": 0.2412109375, "learning_rate": 4.591259639753066e-07, "loss": -0.1518, "num_tokens": 9424551.0, "reward": 0.0, "reward_std": 0.18690168857574463, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 755.8125, "completions/mean_terminated_length": 706.1481323242188, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.0524691358024691, "grad_norm": 1.6743990686258594, "kl": 0.239990234375, "learning_rate": 4.588516010159529e-07, "loss": -0.1598, "num_tokens": 9455101.0, "reward": 0.0, "reward_std": 0.1832430213689804, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 807.78125, "completions/mean_terminated_length": 723.1739501953125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.0555555555555556, "grad_norm": 1.421654947289587, "kl": 0.245361328125, "learning_rate": 4.58576402867073e-07, "loss": -0.0177, "num_tokens": 9487994.0, "reward": -2.7939677238464355e-09, "reward_std": 0.09607689082622528, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 775.84375, "completions/mean_terminated_length": 729.888916015625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 1.058641975308642, "grad_norm": 1.851952885497318, "kl": 0.2291259765625, "learning_rate": 4.5830037062917373e-07, "loss": -0.0178, "num_tokens": 9519473.0, "reward": 3.725290298461914e-09, "reward_std": 0.18941769003868103, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 806.8125, "completions/mean_terminated_length": 721.8261108398438, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 1.0617283950617284, "grad_norm": 0.9903122061679567, "kl": 0.22900390625, "learning_rate": 4.580235054060971e-07, "loss": -0.0547, "num_tokens": 9552079.0, "reward": -9.313225746154785e-10, "reward_std": 0.056965578347444534, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 742.1875, "completions/mean_terminated_length": 677.1538696289062, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.0648148148148149, "grad_norm": 1.017191215633938, "kl": 0.2313232421875, "learning_rate": 4.5774580830501685e-07, "loss": -0.0368, "num_tokens": 9582397.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 807.78125, "completions/mean_terminated_length": 639.6111450195312, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.0679012345679013, "grad_norm": 1.271190440324145, "kl": 0.3494873046875, "learning_rate": 4.574672804364329e-07, "loss": 0.0452, "num_tokens": 9614622.0, "reward": 0.0, "reward_std": 0.14905758202075958, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 799.46875, "completions/mean_terminated_length": 724.625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.0709876543209877, "grad_norm": 1.2963272797093504, "kl": 0.246826171875, "learning_rate": 4.571879229141674e-07, "loss": -0.0761, "num_tokens": 9646701.0, "reward": -7.450580596923828e-09, "reward_std": 0.15223580598831177, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 729.34375, "completions/mean_terminated_length": 698.862060546875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.074074074074074, "grad_norm": 1.8582353993105871, "kl": 0.2420654296875, "learning_rate": 4.5690773685536037e-07, "loss": -0.0516, "num_tokens": 9676248.0, "reward": 3.725290298461914e-09, "reward_std": 0.18624797463417053, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 736.75, "completions/mean_terminated_length": 670.4615478515625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.0771604938271604, "grad_norm": 1.3292785642013, "kl": 0.2664794921875, "learning_rate": 4.5662672338046513e-07, "loss": -0.0511, "num_tokens": 9706268.0, "reward": -4.190951585769653e-09, "reward_std": 0.06405126303434372, "rewards/format_reward_func/mean": -3.725290298461914e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 769.125, "completions/mean_terminated_length": 684.1666870117188, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.0802469135802468, "grad_norm": 1.4656789656259415, "kl": 0.263916015625, "learning_rate": 4.5634488361324386e-07, "loss": -0.057, "num_tokens": 9737428.0, "reward": 4.656612873077393e-10, "reward_std": 0.06749087572097778, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 682.5, "completions/mean_terminated_length": 633.7142944335938, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 1.0833333333333333, "grad_norm": 1.6015260587938807, "kl": 0.2655029296875, "learning_rate": 4.560622186807628e-07, "loss": -0.1084, "num_tokens": 9765428.0, "reward": -4.656612873077393e-10, "reward_std": 0.08606629073619843, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 730.4375, "completions/mean_terminated_length": 632.5833740234375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 1.0864197530864197, "grad_norm": 1.545253708649919, "kl": 0.247802734375, "learning_rate": 4.5577872971338826e-07, "loss": -0.0316, "num_tokens": 9795078.0, "reward": 0.02812499925494194, "reward_std": 0.1365205943584442, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 820.65625, "completions/mean_terminated_length": 714.1428833007812, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 1.0895061728395061, "grad_norm": 0.8123871552169197, "kl": 0.246337890625, "learning_rate": 4.554944178447816e-07, "loss": -0.0042, "num_tokens": 9827747.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 750.9375, "completions/mean_terminated_length": 711.9285888671875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.0925925925925926, "grad_norm": 1.3114040538237064, "kl": 0.2572021484375, "learning_rate": 4.552092842118952e-07, "loss": 0.0155, "num_tokens": 9858769.0, "reward": 0.0, "reward_std": 0.07440169155597687, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 758.8125, "completions/mean_terminated_length": 638.2727661132812, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.095679012345679, "grad_norm": 1.9687993176615746, "kl": 1.0830078125, "learning_rate": 4.549233299549674e-07, "loss": -0.0936, "num_tokens": 9889595.0, "reward": -7.450580596923828e-09, "reward_std": 0.21730080246925354, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 741.125, "completions/mean_terminated_length": 675.84619140625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 1.0987654320987654, "grad_norm": 1.4586965770015488, "kl": 0.2420654296875, "learning_rate": 4.546365562175184e-07, "loss": -0.0696, "num_tokens": 9919671.0, "reward": -2.7939677238464355e-09, "reward_std": 0.08254127204418182, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 779.53125, "completions/mean_terminated_length": 744.607177734375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 1.1018518518518519, "grad_norm": 1.1097968945572727, "kl": 0.264892578125, "learning_rate": 4.543489641463452e-07, "loss": -0.0138, "num_tokens": 9951212.0, "reward": -2.7939677238464355e-09, "reward_std": 0.059618234634399414, "rewards/format_reward_func/mean": -2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 708.125, "completions/mean_terminated_length": 675.4483032226562, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 1.1049382716049383, "grad_norm": 1.4865111899797392, "kl": 0.257568359375, "learning_rate": 4.540605548915175e-07, "loss": -0.0236, "num_tokens": 9980156.0, "reward": 0.028124993667006493, "reward_std": 0.1413259506225586, "rewards/format_reward_func/mean": -2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 772.03125, "completions/mean_terminated_length": 736.0357666015625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.1080246913580247, "grad_norm": 1.5675281959330434, "kl": 0.25, "learning_rate": 4.537713296063729e-07, "loss": -0.1305, "num_tokens": 10011309.0, "reward": 4.6566128730773926e-09, "reward_std": 0.08913275599479675, "rewards/format_reward_func/mean": 3.725290298461914e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 701.84375, "completions/mean_terminated_length": 642.1851806640625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.1111111111111112, "grad_norm": 1.516405493949358, "kl": 0.257080078125, "learning_rate": 4.534812894475122e-07, "loss": -0.0769, "num_tokens": 10040164.0, "reward": 2.7939677238464355e-09, "reward_std": 0.18401594460010529, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 736.40625, "completions/mean_terminated_length": 683.1481323242188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 1.1141975308641976, "grad_norm": 1.241367741678557, "kl": 0.259033203125, "learning_rate": 4.5319043557479474e-07, "loss": -0.0822, "num_tokens": 10070633.0, "reward": 0.028124995529651642, "reward_std": 0.094046451151371, "rewards/format_reward_func/mean": -2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 826.4375, "completions/mean_terminated_length": 722.952392578125, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 1.117283950617284, "grad_norm": 1.113618964831504, "kl": 0.2376708984375, "learning_rate": 4.5289876915133394e-07, "loss": -0.0292, "num_tokens": 10103715.0, "reward": 0.05624999478459358, "reward_std": 0.11115353554487228, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0625, "rewards/logprob_reward/std": 0.24593468010425568, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 718.375, "completions/mean_terminated_length": 674.7142944335938, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.1203703703703705, "grad_norm": 1.8652216196467917, "kl": 0.2601318359375, "learning_rate": 4.5260629134349284e-07, "loss": -0.1558, "num_tokens": 10132667.0, "reward": 0.0, "reward_std": 0.1673842817544937, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 670.84375, "completions/mean_terminated_length": 634.3103637695312, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.123456790123457, "grad_norm": 1.8852861657192153, "kl": 0.2645263671875, "learning_rate": 4.523130033208788e-07, "loss": -0.0096, "num_tokens": 10160562.0, "reward": 2.7939677238464355e-09, "reward_std": 0.21927371621131897, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 774.96875, "completions/mean_terminated_length": 644.5238037109375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.126543209876543, "grad_norm": 1.5917623773340683, "kl": 0.2445068359375, "learning_rate": 4.520189062563393e-07, "loss": 0.0687, "num_tokens": 10192437.0, "reward": 3.725290298461914e-09, "reward_std": 0.188092902302742, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 707.8125, "completions/mean_terminated_length": 649.25927734375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 1.1296296296296295, "grad_norm": 2.12568058540548, "kl": 0.2535400390625, "learning_rate": 4.5172400132595737e-07, "loss": -0.111, "num_tokens": 10221219.0, "reward": 0.02812499925494194, "reward_std": 0.1345728486776352, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 681.4375, "completions/mean_terminated_length": 618.0, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 1.132716049382716, "grad_norm": 1.4880863233115877, "kl": 0.2850341796875, "learning_rate": 4.514282897090464e-07, "loss": -0.0282, "num_tokens": 10249201.0, "reward": 0.02812499925494194, "reward_std": 0.13184288144111633, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 776.5625, "completions/mean_terminated_length": 646.952392578125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 1.1358024691358024, "grad_norm": 0.016900480733155743, "kl": 0.2630615234375, "learning_rate": 4.511317725881457e-07, "loss": 0.0003, "num_tokens": 10281067.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 702.125, "completions/mean_terminated_length": 642.5184936523438, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 1.1388888888888888, "grad_norm": 1.3593715604492063, "kl": 0.2454833984375, "learning_rate": 4.50834451149016e-07, "loss": -0.0019, "num_tokens": 10309879.0, "reward": 0.028124995529651642, "reward_std": 0.11906316876411438, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 682.5625, "completions/mean_terminated_length": 647.2413940429688, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.1419753086419753, "grad_norm": 1.354772613119782, "kl": 0.3011474609375, "learning_rate": 4.505363265806342e-07, "loss": -0.0109, "num_tokens": 10338141.0, "reward": -9.313225746154785e-10, "reward_std": 0.07151594012975693, "rewards/format_reward_func/mean": 2.9802322387695312e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 707.4375, "completions/mean_terminated_length": 634.3846435546875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 1.1450617283950617, "grad_norm": 1.4972720180071932, "kl": 0.2691650390625, "learning_rate": 4.502374000751891e-07, "loss": -0.0622, "num_tokens": 10367407.0, "reward": 3.725290298461914e-09, "reward_std": 0.17211824655532837, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 753.1875, "completions/mean_terminated_length": 647.2174072265625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.1481481481481481, "grad_norm": 1.8175021168347008, "kl": 0.275634765625, "learning_rate": 4.49937672828076e-07, "loss": -0.0613, "num_tokens": 10397509.0, "reward": -2.7939677238464355e-09, "reward_std": 0.23933574557304382, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 709.5, "completions/mean_terminated_length": 651.25927734375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 1.1512345679012346, "grad_norm": 1.3041032505629424, "kl": 0.2508544921875, "learning_rate": 4.4963714603789315e-07, "loss": -0.0946, "num_tokens": 10426677.0, "reward": -1.862645149230957e-09, "reward_std": 0.16686978936195374, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 675.625, "completions/mean_terminated_length": 664.3870849609375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.154320987654321, "grad_norm": 1.034446464436393, "kl": 0.2545166015625, "learning_rate": 4.4933582090643516e-07, "loss": -0.015, "num_tokens": 10454741.0, "reward": 0.028124995529651642, "reward_std": 0.07786068320274353, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 758.3125, "completions/mean_terminated_length": 697.0, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 1.1574074074074074, "grad_norm": 1.3057103198966025, "kl": 0.263671875, "learning_rate": 4.4903369863869e-07, "loss": -0.0673, "num_tokens": 10485355.0, "reward": 3.725290298461914e-09, "reward_std": 0.08985722810029984, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 719.1875, "completions/mean_terminated_length": 687.6551513671875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 1.1604938271604939, "grad_norm": 1.2734489858967948, "kl": 0.29296875, "learning_rate": 4.4873078044283273e-07, "loss": -0.0296, "num_tokens": 10514849.0, "reward": -3.725290298461914e-09, "reward_std": 0.06900564581155777, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 766.34375, "completions/mean_terminated_length": 729.5357666015625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.1635802469135803, "grad_norm": 1.6566510683607516, "kl": 0.2490234375, "learning_rate": 4.484270675302218e-07, "loss": 0.0077, "num_tokens": 10546096.0, "reward": 0.02812499925494194, "reward_std": 0.14731836318969727, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 767.09375, "completions/mean_terminated_length": 695.1599731445312, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 1.1666666666666667, "grad_norm": 1.3566675053102628, "kl": 0.292236328125, "learning_rate": 4.481225611153933e-07, "loss": -0.0718, "num_tokens": 10577267.0, "reward": 9.313225746154785e-10, "reward_std": 0.06749087572097778, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 872.78125, "completions/mean_terminated_length": 813.6087036132812, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 1.1697530864197532, "grad_norm": 1.2471242642242912, "kl": 0.2442626953125, "learning_rate": 4.4781726241605683e-07, "loss": 0.0262, "num_tokens": 10612108.0, "reward": 1.862645149230957e-09, "reward_std": 0.08027059584856033, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 765.84375, "completions/mean_terminated_length": 679.7916870117188, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 1.1728395061728394, "grad_norm": 1.30494481174423, "kl": 0.3017578125, "learning_rate": 4.4751117265309e-07, "loss": 0.0063, "num_tokens": 10642895.0, "reward": 0.0, "reward_std": 0.08027060329914093, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 765.59375, "completions/mean_terminated_length": 648.1363525390625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.175925925925926, "grad_norm": 0.9044069185541123, "kl": 0.275390625, "learning_rate": 4.472042930505342e-07, "loss": -0.0351, "num_tokens": 10674298.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 734.90625, "completions/mean_terminated_length": 668.1923217773438, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.1790123456790123, "grad_norm": 1.3501100187377286, "kl": 0.2606201171875, "learning_rate": 4.46896624835589e-07, "loss": -0.0829, "num_tokens": 10704623.0, "reward": -7.450580596923828e-09, "reward_std": 0.15223580598831177, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 799.0, "completions/mean_terminated_length": 724.0, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 1.1820987654320987, "grad_norm": 1.2447347049204807, "kl": 0.262939453125, "learning_rate": 4.465881692386078e-07, "loss": 0.0098, "num_tokens": 10736651.0, "reward": 0.02812499925494194, "reward_std": 0.11206148564815521, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 683.75, "completions/mean_terminated_length": 620.74072265625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.1851851851851851, "grad_norm": 1.5798290251530331, "kl": 0.2781982421875, "learning_rate": 4.4627892749309273e-07, "loss": -0.0459, "num_tokens": 10764883.0, "reward": -9.313225746154785e-10, "reward_std": 0.1861894279718399, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 746.28125, "completions/mean_terminated_length": 694.8518676757812, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.1882716049382716, "grad_norm": 1.2819675135830302, "kl": 0.2896728515625, "learning_rate": 4.459689008356896e-07, "loss": -0.0129, "num_tokens": 10794992.0, "reward": 0.0, "reward_std": 0.07886750996112823, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 759.125, "completions/mean_terminated_length": 698.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.191358024691358, "grad_norm": 1.2606431620360652, "kl": 0.2867431640625, "learning_rate": 4.4565809050618317e-07, "loss": -0.0015, "num_tokens": 10825748.0, "reward": 0.02812499925494194, "reward_std": 0.12525564432144165, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 799.0625, "completions/mean_terminated_length": 664.1000366210938, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.1944444444444444, "grad_norm": 0.9421573699780736, "kl": 0.2657470703125, "learning_rate": 4.45346497747492e-07, "loss": -0.0066, "num_tokens": 10857862.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 755.75, "completions/mean_terminated_length": 680.6400146484375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.1975308641975309, "grad_norm": 0.9977061885065656, "kl": 0.36279296875, "learning_rate": 4.450341238056634e-07, "loss": -0.0194, "num_tokens": 10888590.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 763.84375, "completions/mean_terminated_length": 677.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 1.2006172839506173, "grad_norm": 1.3470845527511202, "kl": 0.27880859375, "learning_rate": 4.4472096992986895e-07, "loss": 0.0067, "num_tokens": 10919857.0, "reward": 4.656612873077393e-10, "reward_std": 0.09305032342672348, "rewards/format_reward_func/mean": 1.1175870895385742e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 673.78125, "completions/mean_terminated_length": 637.5516967773438, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 1.2037037037037037, "grad_norm": 1.3016995988610571, "kl": 0.302490234375, "learning_rate": 4.444070373723989e-07, "loss": -0.0277, "num_tokens": 10947570.0, "reward": 0.02812499925494194, "reward_std": 0.13184288144111633, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 775.5625, "completions/mean_terminated_length": 692.75, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 1.2067901234567902, "grad_norm": 1.3185866373896116, "kl": 0.274658203125, "learning_rate": 4.4409232738865744e-07, "loss": 0.0107, "num_tokens": 10979024.0, "reward": 0.028124993667006493, "reward_std": 0.12057674676179886, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 755.28125, "completions/mean_terminated_length": 665.7083740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 1.2098765432098766, "grad_norm": 1.1114605142361125, "kl": 0.3101806640625, "learning_rate": 4.4377684123715763e-07, "loss": -0.0029, "num_tokens": 11009681.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 757.40625, "completions/mean_terminated_length": 682.760009765625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.212962962962963, "grad_norm": 1.2370891083488853, "kl": 0.2557373046875, "learning_rate": 4.434605801795167e-07, "loss": -0.0249, "num_tokens": 11040470.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 742.78125, "completions/mean_terminated_length": 632.7391357421875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.2160493827160495, "grad_norm": 0.9488392827947685, "kl": 0.296630859375, "learning_rate": 4.431435454804503e-07, "loss": 0.0133, "num_tokens": 11070483.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 735.75, "completions/mean_terminated_length": 682.370361328125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.2191358024691359, "grad_norm": 1.3788628642821232, "kl": 0.288330078125, "learning_rate": 4.42825738407768e-07, "loss": -0.0402, "num_tokens": 11101003.0, "reward": -3.725290298461914e-09, "reward_std": 0.19551047682762146, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 782.8125, "completions/mean_terminated_length": 702.4166870117188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.2222222222222223, "grad_norm": 0.8794678162422529, "kl": 0.29345703125, "learning_rate": 4.425071602323681e-07, "loss": 0.0209, "num_tokens": 11132509.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 793.25, "completions/mean_terminated_length": 672.3809814453125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 1.2253086419753085, "grad_norm": 1.113998526736297, "kl": 0.305908203125, "learning_rate": 4.421878122282325e-07, "loss": 0.0005, "num_tokens": 11164157.0, "reward": -4.656612873077393e-10, "reward_std": 0.07151593267917633, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 760.09375, "completions/mean_terminated_length": 672.125, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 1.228395061728395, "grad_norm": 0.7409330586619888, "kl": 0.2860107421875, "learning_rate": 4.4186769567242163e-07, "loss": 0.0028, "num_tokens": 11195028.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 710.21875, "completions/mean_terminated_length": 665.3928833007812, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.2314814814814814, "grad_norm": 1.034515189164874, "kl": 0.29443359375, "learning_rate": 4.4154681184506927e-07, "loss": 0.024, "num_tokens": 11224227.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 774.53125, "completions/mean_terminated_length": 661.1364135742188, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.2345679012345678, "grad_norm": 1.0577007948656316, "kl": 0.327392578125, "learning_rate": 4.4122516202937745e-07, "loss": 0.0348, "num_tokens": 11255848.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 764.71875, "completions/mean_terminated_length": 727.6785888671875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 1.2376543209876543, "grad_norm": 0.7348152848815923, "kl": 0.285888671875, "learning_rate": 4.4090274751161144e-07, "loss": 0.025, "num_tokens": 11287351.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 794.625, "completions/mean_terminated_length": 752.1481323242188, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 1.2407407407407407, "grad_norm": 1.1781215750943606, "kl": 0.31640625, "learning_rate": 4.4057956958109453e-07, "loss": 0.0225, "num_tokens": 11319351.0, "reward": 0.02812499925494194, "reward_std": 0.10609643161296844, "rewards/format_reward_func/mean": -7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 789.0, "completions/mean_terminated_length": 697.0435180664062, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 1.2438271604938271, "grad_norm": 0.014007581081706421, "kl": 0.288330078125, "learning_rate": 4.402556295302029e-07, "loss": 0.0003, "num_tokens": 11351103.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 751.21875, "completions/mean_terminated_length": 688.2692260742188, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 1.2469135802469136, "grad_norm": 1.232008619438482, "kl": 0.328125, "learning_rate": 4.3993092865436035e-07, "loss": -0.03, "num_tokens": 11381482.0, "reward": -4.656612873077393e-10, "reward_std": 0.07151593267917633, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 752.375, "completions/mean_terminated_length": 661.8333740234375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 1.25, "grad_norm": 0.47822770796013664, "kl": 0.327392578125, "learning_rate": 4.3960546825203304e-07, "loss": 0.0151, "num_tokens": 11411846.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 773.09375, "completions/mean_terminated_length": 659.0454711914062, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.2530864197530864, "grad_norm": 0.8680461203097858, "kl": 0.306884765625, "learning_rate": 4.392792496247248e-07, "loss": 0.0099, "num_tokens": 11442933.0, "reward": 0.0, "reward_std": 0.13912895321846008, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 714.53125, "completions/mean_terminated_length": 643.1154174804688, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 1.2561728395061729, "grad_norm": 1.5318159052688987, "kl": 0.319091796875, "learning_rate": 4.3895227407697135e-07, "loss": -0.022, "num_tokens": 11471782.0, "reward": -3.725290298461914e-09, "reward_std": 0.20109625160694122, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 788.1875, "completions/mean_terminated_length": 695.9130859375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 1.2592592592592593, "grad_norm": 1.8787070751135608, "kl": 0.2724609375, "learning_rate": 4.3862454291633523e-07, "loss": 0.1961, "num_tokens": 11503556.0, "reward": -5.587935447692871e-09, "reward_std": 0.18140104413032532, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 763.09375, "completions/mean_terminated_length": 690.0399780273438, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.2623456790123457, "grad_norm": 2.0333008654259195, "kl": 0.324951171875, "learning_rate": 4.382960574534009e-07, "loss": -0.2294, "num_tokens": 11534543.0, "reward": -3.259629011154175e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 734.0, "completions/mean_terminated_length": 680.2963256835938, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.2654320987654322, "grad_norm": 1.2146753972907172, "kl": 0.32568359375, "learning_rate": 4.3796681900176903e-07, "loss": 0.0007, "num_tokens": 11564487.0, "reward": 0.02812499925494194, "reward_std": 0.1158682331442833, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 727.875, "completions/mean_terminated_length": 708.1333618164062, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 1.2685185185185186, "grad_norm": 1.2463339846434414, "kl": 0.32421875, "learning_rate": 4.3763682887805153e-07, "loss": -0.0302, "num_tokens": 11594387.0, "reward": 0.02812499925494194, "reward_std": 0.11586824059486389, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 733.40625, "completions/mean_terminated_length": 679.5925903320312, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 1.2716049382716048, "grad_norm": 1.1210913026388611, "kl": 0.33154296875, "learning_rate": 4.3730608840186625e-07, "loss": 0.0047, "num_tokens": 11624296.0, "reward": 0.02812499739229679, "reward_std": 0.1024516224861145, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 732.09375, "completions/mean_terminated_length": 678.0370483398438, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 1.2746913580246915, "grad_norm": 0.9685326971132457, "kl": 0.3294677734375, "learning_rate": 4.3697459889583166e-07, "loss": 0.0148, "num_tokens": 11654427.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 715.90625, "completions/mean_terminated_length": 644.8077392578125, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.2777777777777777, "grad_norm": 1.048193580651815, "kl": 0.3486328125, "learning_rate": 4.366423616855615e-07, "loss": -0.068, "num_tokens": 11683920.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 733.15625, "completions/mean_terminated_length": 691.607177734375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 1.2808641975308643, "grad_norm": 1.2887510459006943, "kl": 0.307373046875, "learning_rate": 4.363093780996596e-07, "loss": -0.0282, "num_tokens": 11714213.0, "reward": 0.0, "reward_std": 0.14231424033641815, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 755.875, "completions/mean_terminated_length": 694.0, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.2839506172839505, "grad_norm": 0.008248914693801979, "kl": 0.29150390625, "learning_rate": 4.359756494697146e-07, "loss": 0.0003, "num_tokens": 11745765.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 802.6875, "completions/mean_terminated_length": 716.0869750976562, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 1.287037037037037, "grad_norm": 1.1157228360960278, "kl": 0.314453125, "learning_rate": 4.356411771302944e-07, "loss": -0.0366, "num_tokens": 11778179.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 764.21875, "completions/mean_terminated_length": 677.625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 1.2901234567901234, "grad_norm": 0.7525352377728132, "kl": 0.339111328125, "learning_rate": 4.353059624189411e-07, "loss": 0.0033, "num_tokens": 11808910.0, "reward": 9.313225746154785e-10, "reward_std": 0.10225499421358109, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 754.3125, "completions/mean_terminated_length": 704.370361328125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.2932098765432098, "grad_norm": 1.2550176381062912, "kl": 0.3251953125, "learning_rate": 4.3497000667616534e-07, "loss": 0.0052, "num_tokens": 11839792.0, "reward": 0.02812499925494194, "reward_std": 0.1158682331442833, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 763.53125, "completions/mean_terminated_length": 715.2963256835938, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 1.2962962962962963, "grad_norm": 0.5930065562985071, "kl": 0.3135986328125, "learning_rate": 4.346333112454413e-07, "loss": 0.0011, "num_tokens": 11870997.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 737.9375, "completions/mean_terminated_length": 697.0714721679688, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.2993827160493827, "grad_norm": 0.5148470707330806, "kl": 0.3297119140625, "learning_rate": 4.342958774732011e-07, "loss": -0.0102, "num_tokens": 11901319.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 762.3125, "completions/mean_terminated_length": 724.9285888671875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.3024691358024691, "grad_norm": 0.8095249053527938, "kl": 0.318359375, "learning_rate": 4.3395770670882935e-07, "loss": -0.0128, "num_tokens": 11932529.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 719.21875, "completions/mean_terminated_length": 675.6785888671875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.3055555555555556, "grad_norm": 1.0661536532752252, "kl": 0.328857421875, "learning_rate": 4.3361880030465803e-07, "loss": 0.0062, "num_tokens": 11962220.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 704.3125, "completions/mean_terminated_length": 671.2413940429688, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.308641975308642, "grad_norm": 1.2453204382631962, "kl": 0.35400390625, "learning_rate": 4.3327915961596066e-07, "loss": -0.0105, "num_tokens": 11991466.0, "reward": -4.656612873077393e-10, "reward_std": 0.07151594012975693, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 729.6875, "completions/mean_terminated_length": 675.1851806640625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 1.3117283950617284, "grad_norm": 0.5767029027540111, "kl": 0.3193359375, "learning_rate": 4.3293878600094746e-07, "loss": 0.0027, "num_tokens": 12021000.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 755.90625, "completions/mean_terminated_length": 694.0385131835938, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.3148148148148149, "grad_norm": 0.8638429458633945, "kl": 0.3359375, "learning_rate": 4.325976808207594e-07, "loss": -0.0058, "num_tokens": 12051457.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 737.53125, "completions/mean_terminated_length": 671.423095703125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.3179012345679013, "grad_norm": 1.0837469297609128, "kl": 0.32470703125, "learning_rate": 4.3225584543946303e-07, "loss": 0.0135, "num_tokens": 12081534.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 703.375, "completions/mean_terminated_length": 629.3846435546875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 1.3209876543209877, "grad_norm": 1.4149183005045936, "kl": 0.35205078125, "learning_rate": 4.319132812240448e-07, "loss": -0.0514, "num_tokens": 12110210.0, "reward": -7.450580596923828e-09, "reward_std": 0.15223580598831177, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 690.125, "completions/mean_terminated_length": 642.4285888671875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.324074074074074, "grad_norm": 1.3785281938220508, "kl": 0.320556640625, "learning_rate": 4.3156998954440587e-07, "loss": -0.0047, "num_tokens": 12138394.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 692.21875, "completions/mean_terminated_length": 644.8214721679688, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.3271604938271606, "grad_norm": 0.5041500125980993, "kl": 0.3072509765625, "learning_rate": 4.312259717733565e-07, "loss": 0.0066, "num_tokens": 12166569.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 698.71875, "completions/mean_terminated_length": 665.0689697265625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.3302469135802468, "grad_norm": 1.3994839331379305, "kl": 0.358154296875, "learning_rate": 4.308812292866105e-07, "loss": -0.0216, "num_tokens": 12195120.0, "reward": 0.028124993667006493, "reward_std": 0.12057674676179886, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 759.6875, "completions/mean_terminated_length": 698.6923217773438, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 1.3333333333333333, "grad_norm": 1.3113319724659547, "kl": 0.33251953125, "learning_rate": 4.3053576346277997e-07, "loss": -0.0501, "num_tokens": 12226218.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 723.3125, "completions/mean_terminated_length": 623.0833740234375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 1.3364197530864197, "grad_norm": 1.6940617261107462, "kl": 0.386474609375, "learning_rate": 4.301895756833692e-07, "loss": -0.0873, "num_tokens": 12255320.0, "reward": -3.725290298461914e-09, "reward_std": 0.16567710041999817, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 787.46875, "completions/mean_terminated_length": 679.95458984375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.3395061728395061, "grad_norm": 0.9854554475323402, "kl": 0.3048095703125, "learning_rate": 4.298426673327701e-07, "loss": 0.0116, "num_tokens": 12287151.0, "reward": 0.028124995529651642, "reward_std": 0.09598580002784729, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 692.5, "completions/mean_terminated_length": 692.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.3425925925925926, "grad_norm": 0.7925101044513421, "kl": 0.361328125, "learning_rate": 4.2949503979825563e-07, "loss": 0.0061, "num_tokens": 12315707.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 704.90625, "completions/mean_terminated_length": 631.2692260742188, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 1.345679012345679, "grad_norm": 1.1121111546655797, "kl": 0.32177734375, "learning_rate": 4.2914669446997504e-07, "loss": -0.013, "num_tokens": 12344912.0, "reward": -3.725290298461914e-09, "reward_std": 0.15908847749233246, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 717.65625, "completions/mean_terminated_length": 685.9655151367188, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 1.3487654320987654, "grad_norm": 0.8626115907205927, "kl": 0.3309326171875, "learning_rate": 4.287976327409478e-07, "loss": -0.0177, "num_tokens": 12374525.0, "reward": -3.725290298461914e-09, "reward_std": 0.13241708278656006, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 730.8125, "completions/mean_terminated_length": 676.5184936523438, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.3518518518518519, "grad_norm": 0.8214353649895705, "kl": 0.34130859375, "learning_rate": 4.284478560070585e-07, "loss": 0.0237, "num_tokens": 12404283.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 753.78125, "completions/mean_terminated_length": 715.1785888671875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.3549382716049383, "grad_norm": 1.416194781903389, "kl": 0.33544921875, "learning_rate": 4.280973656670508e-07, "loss": -0.066, "num_tokens": 12435016.0, "reward": 0.028124993667006493, "reward_std": 0.12057674676179886, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 755.71875, "completions/mean_terminated_length": 717.3928833007812, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 1.3580246913580247, "grad_norm": 1.1659610856616398, "kl": 0.3134765625, "learning_rate": 4.277461631225221e-07, "loss": -0.0346, "num_tokens": 12466079.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 632.258056640625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.3611111111111112, "grad_norm": 0.9054769529161981, "kl": 0.34130859375, "learning_rate": 4.2739424977791784e-07, "loss": -0.0177, "num_tokens": 12493131.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 717.3125, "completions/mean_terminated_length": 673.5, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 1.3641975308641976, "grad_norm": 0.589301737696435, "kl": 0.3555908203125, "learning_rate": 4.2704162704052594e-07, "loss": 0.0094, "num_tokens": 12522029.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 799.78125, "completions/mean_terminated_length": 725.0416870117188, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 1.367283950617284, "grad_norm": 0.009300813154565053, "kl": 0.32470703125, "learning_rate": 4.2668829632047124e-07, "loss": 0.0003, "num_tokens": 12554062.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 718.09375, "completions/mean_terminated_length": 686.4483032226562, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.3703703703703702, "grad_norm": 0.7173552794948027, "kl": 0.302978515625, "learning_rate": 4.2633425903070973e-07, "loss": -0.0124, "num_tokens": 12583541.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 772.78125, "completions/mean_terminated_length": 714.8077392578125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.373456790123457, "grad_norm": 0.49122238250860734, "kl": 0.288818359375, "learning_rate": 4.259795165870229e-07, "loss": -0.0027, "num_tokens": 12614670.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 753.78125, "completions/mean_terminated_length": 691.423095703125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 1.376543209876543, "grad_norm": 2.062589270834479, "kl": 0.353515625, "learning_rate": 4.256240704080121e-07, "loss": -0.1871, "num_tokens": 12645067.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 730.03125, "completions/mean_terminated_length": 675.5925903320312, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.3796296296296298, "grad_norm": 1.2127359756000133, "kl": 0.3388671875, "learning_rate": 4.2526792191509297e-07, "loss": 0.0012, "num_tokens": 12674968.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 687.15625, "completions/mean_terminated_length": 652.3103637695312, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.382716049382716, "grad_norm": 0.7313600857010765, "kl": 0.297607421875, "learning_rate": 4.249110725324897e-07, "loss": 0.0003, "num_tokens": 12703301.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 745.78125, "completions/mean_terminated_length": 694.25927734375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.3858024691358024, "grad_norm": 1.332798569391717, "kl": 0.3165283203125, "learning_rate": 4.2455352368722916e-07, "loss": -0.0528, "num_tokens": 12734442.0, "reward": 0.02812499739229679, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 816.125, "completions/mean_terminated_length": 721.6364135742188, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 1.3888888888888888, "grad_norm": 0.4313526530697991, "kl": 0.335693359375, "learning_rate": 4.2419527680913554e-07, "loss": 0.0069, "num_tokens": 12767014.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 735.375, "completions/mean_terminated_length": 681.9259033203125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 1.3919753086419753, "grad_norm": 1.4063448246433303, "kl": 0.348388671875, "learning_rate": 4.2383633333082423e-07, "loss": -0.0812, "num_tokens": 12797074.0, "reward": -4.6566128730773926e-09, "reward_std": 0.15007510781288147, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.026798367500305e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 742.15625, "completions/mean_terminated_length": 689.9629516601562, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.3950617283950617, "grad_norm": 0.8428837510838175, "kl": 0.3349609375, "learning_rate": 4.234766946876965e-07, "loss": 0.0141, "num_tokens": 12827079.0, "reward": 3.725290298461914e-09, "reward_std": 0.1254904866218567, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 717.125, "completions/mean_terminated_length": 673.2857666015625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.3981481481481481, "grad_norm": 0.009826459446768576, "kl": 0.371337890625, "learning_rate": 4.231163623179335e-07, "loss": 0.0004, "num_tokens": 12856795.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 794.125, "completions/mean_terminated_length": 689.6364135742188, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.4012345679012346, "grad_norm": 0.009539250603866232, "kl": 0.312744140625, "learning_rate": 4.227553376624904e-07, "loss": 0.0003, "num_tokens": 12889007.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 690.5454711914062, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 1.404320987654321, "grad_norm": 0.966587225456212, "kl": 0.328369140625, "learning_rate": 4.22393622165091e-07, "loss": 0.0279, "num_tokens": 12921271.0, "reward": -5.587935447692871e-09, "reward_std": 0.16195404529571533, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 782.09375, "completions/mean_terminated_length": 726.269287109375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.4074074074074074, "grad_norm": 0.6819926506904097, "kl": 0.335205078125, "learning_rate": 4.220312172722216e-07, "loss": 0.0361, "num_tokens": 12952590.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 818.90625, "completions/mean_terminated_length": 750.5416870117188, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.4104938271604939, "grad_norm": 0.008401392670728085, "kl": 0.34423828125, "learning_rate": 4.216681244331256e-07, "loss": 0.0003, "num_tokens": 12985303.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 784.34375, "completions/mean_terminated_length": 704.4583740234375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.4135802469135803, "grad_norm": 0.9587602391256548, "kl": 0.353515625, "learning_rate": 4.2130434509979714e-07, "loss": -0.0278, "num_tokens": 13017686.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 734.71875, "completions/mean_terminated_length": 704.7930908203125, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.4166666666666667, "grad_norm": 1.5716828641475282, "kl": 0.34423828125, "learning_rate": 4.209398807269758e-07, "loss": -0.154, "num_tokens": 13047677.0, "reward": -3.725290298461914e-09, "reward_std": 0.15870162844657898, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 778.8125, "completions/mean_terminated_length": 710.1599731445312, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.4197530864197532, "grad_norm": 0.7154739665915979, "kl": 0.3125, "learning_rate": 4.205747327721407e-07, "loss": -0.0229, "num_tokens": 13078863.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 781.4375, "completions/mean_terminated_length": 700.5833740234375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.4228395061728394, "grad_norm": 0.9896365183054785, "kl": 0.31884765625, "learning_rate": 4.2020890269550454e-07, "loss": 0.0131, "num_tokens": 13110765.0, "reward": 0.0, "reward_std": 0.16249045729637146, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 776.0, "completions/mean_terminated_length": 678.95654296875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.425925925925926, "grad_norm": 0.49623455304372194, "kl": 0.3243408203125, "learning_rate": 4.198423919600076e-07, "loss": 0.0053, "num_tokens": 13142181.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 724.625, "completions/mean_terminated_length": 724.625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.4290123456790123, "grad_norm": 1.229600254826551, "kl": 0.3414306640625, "learning_rate": 4.1947520203131217e-07, "loss": -0.0044, "num_tokens": 13171625.0, "reward": 2.3283064365386963e-09, "reward_std": 0.20035111904144287, "rewards/format_reward_func/mean": 2.2351741790771484e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 778.21875, "completions/mean_terminated_length": 721.5, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.4320987654320987, "grad_norm": 0.8382003160649317, "kl": 0.3203125, "learning_rate": 4.191073343777968e-07, "loss": 0.0069, "num_tokens": 13203664.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 464 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 744.3125, "completions/mean_terminated_length": 679.7692260742188, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 1.4351851851851851, "grad_norm": 0.7937742594266002, "kl": NaN, "learning_rate": 4.1873879047055005e-07, "loss": 0.0015, "num_tokens": 13233842.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 780.96875, "completions/mean_terminated_length": 685.8695678710938, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 1.4382716049382716, "grad_norm": 1.2777775438614578, "kl": 0.322021484375, "learning_rate": 4.183695717833649e-07, "loss": -0.0758, "num_tokens": 13265621.0, "reward": 0.0, "reward_std": 0.14349564909934998, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 805.375, "completions/mean_terminated_length": 744.1599731445312, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.441358024691358, "grad_norm": 0.7591786636402239, "kl": 0.341064453125, "learning_rate": 4.179996797927326e-07, "loss": 0.0326, "num_tokens": 13297765.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 777.4375, "completions/mean_terminated_length": 680.95654296875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 1.4444444444444444, "grad_norm": 0.5209501100963339, "kl": 0.3345947265625, "learning_rate": 4.17629115977837e-07, "loss": 0.0056, "num_tokens": 13329271.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 862.5, "completions/mean_terminated_length": 765.6000366210938, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 1.4475308641975309, "grad_norm": 0.5268008371662545, "kl": 0.33154296875, "learning_rate": 4.1725788182054867e-07, "loss": 0.0309, "num_tokens": 13363535.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 772.40625, "completions/mean_terminated_length": 640.6190795898438, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.4506172839506173, "grad_norm": 0.010361077728183815, "kl": 0.324462890625, "learning_rate": 4.1688597880541863e-07, "loss": 0.0003, "num_tokens": 13394608.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 737.59375, "completions/mean_terminated_length": 671.5, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 1.4537037037037037, "grad_norm": 1.4216610925062358, "kl": 0.31201171875, "learning_rate": 4.1651340841967284e-07, "loss": -0.0691, "num_tokens": 13424467.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 684.28125, "completions/mean_terminated_length": 635.75, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 1.4567901234567902, "grad_norm": 0.8916789301480571, "kl": 0.38720703125, "learning_rate": 4.161401721532059e-07, "loss": -0.0286, "num_tokens": 13452452.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 763.46875, "completions/mean_terminated_length": 726.2500610351562, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 1.4598765432098766, "grad_norm": 0.009345990794967311, "kl": 0.31298828125, "learning_rate": 4.1576627149857513e-07, "loss": 0.0003, "num_tokens": 13483403.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 704.15625, "completions/mean_terminated_length": 671.0689697265625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 1.462962962962963, "grad_norm": 1.335597575659992, "kl": 0.3358154296875, "learning_rate": 4.153917079509952e-07, "loss": -0.0718, "num_tokens": 13512116.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 786.5625, "completions/mean_terminated_length": 678.6364135742188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 1.4660493827160495, "grad_norm": 1.1981375403128411, "kl": 0.34423828125, "learning_rate": 4.150164830083311e-07, "loss": -0.0288, "num_tokens": 13543370.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 835.0, "completions/mean_terminated_length": 736.0, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.4691358024691357, "grad_norm": 0.9639150272725308, "kl": 0.357666015625, "learning_rate": 4.146405981710931e-07, "loss": 0.0124, "num_tokens": 13576846.0, "reward": 0.02812499739229679, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 773.25, "completions/mean_terminated_length": 703.0399780273438, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 1.4722222222222223, "grad_norm": 0.47758534044903295, "kl": 0.3179931640625, "learning_rate": 4.142640549424302e-07, "loss": 0.0216, "num_tokens": 13607994.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 767.03125, "completions/mean_terminated_length": 681.375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 1.4753086419753085, "grad_norm": 0.5999650568642271, "kl": 0.331787109375, "learning_rate": 4.1388685482812413e-07, "loss": 0.0102, "num_tokens": 13639259.0, "reward": 0.028124995529651642, "reward_std": 0.055743563920259476, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 763.5625, "completions/mean_terminated_length": 726.357177734375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 1.4783950617283952, "grad_norm": 0.8444555416217089, "kl": 0.3466796875, "learning_rate": 4.135089993365839e-07, "loss": 0.0116, "num_tokens": 13669713.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 770.125, "completions/mean_terminated_length": 699.0399780273438, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.4814814814814814, "grad_norm": 0.7185053938384726, "kl": 0.312255859375, "learning_rate": 4.131304899788389e-07, "loss": 0.0193, "num_tokens": 13700593.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 729.5625, "completions/mean_terminated_length": 675.0370483398438, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.4845679012345678, "grad_norm": 0.014659263925658098, "kl": 0.32861328125, "learning_rate": 4.127513282685336e-07, "loss": 0.0003, "num_tokens": 13730239.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 758.6875, "completions/mean_terminated_length": 684.3999633789062, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.4876543209876543, "grad_norm": 1.2735473500356849, "kl": 0.326416015625, "learning_rate": 4.123715157219211e-07, "loss": -0.0272, "num_tokens": 13760825.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 776.125, "completions/mean_terminated_length": 718.923095703125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.4907407407407407, "grad_norm": 1.0599098808411953, "kl": 0.336669921875, "learning_rate": 4.1199105385785727e-07, "loss": -0.038, "num_tokens": 13791961.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 767.34375, "completions/mean_terminated_length": 681.7916870117188, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 1.4938271604938271, "grad_norm": 1.0576775184964902, "kl": 0.342529296875, "learning_rate": 4.116099441977943e-07, "loss": -0.0091, "num_tokens": 13822512.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 795.9375, "completions/mean_terminated_length": 706.6956787109375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 1.4969135802469136, "grad_norm": 0.9045893641800498, "kl": 0.333251953125, "learning_rate": 4.112281882657751e-07, "loss": 0.0041, "num_tokens": 13854758.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 763.9375, "completions/mean_terminated_length": 691.1199951171875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 1.5, "grad_norm": 0.6837431452553143, "kl": 0.3494873046875, "learning_rate": 4.1084578758842714e-07, "loss": -0.0309, "num_tokens": 13885544.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 841.75, "completions/mean_terminated_length": 758.9091186523438, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.5030864197530864, "grad_norm": 0.5426373015151418, "kl": 0.3328857421875, "learning_rate": 4.104627436949559e-07, "loss": 0.0041, "num_tokens": 13919500.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 718.625, "completions/mean_terminated_length": 675.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.5061728395061729, "grad_norm": 0.03333683949432546, "kl": 0.3377685546875, "learning_rate": 4.1007905811713915e-07, "loss": 0.0003, "num_tokens": 13949272.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 731.40625, "completions/mean_terminated_length": 677.2222290039062, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 1.5092592592592593, "grad_norm": 0.8560193854362351, "kl": 0.319091796875, "learning_rate": 4.096947323893209e-07, "loss": 0.0182, "num_tokens": 13979333.0, "reward": 0.0, "reward_std": 0.15134452283382416, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 814.375, "completions/mean_terminated_length": 732.3478393554688, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 1.5123456790123457, "grad_norm": 0.7320888639123382, "kl": 0.338134765625, "learning_rate": 4.0930976804840487e-07, "loss": 0.013, "num_tokens": 14011745.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 751.90625, "completions/mean_terminated_length": 675.719970703125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.515432098765432, "grad_norm": 1.1634269362480991, "kl": 0.3232421875, "learning_rate": 4.0892416663384874e-07, "loss": -0.0086, "num_tokens": 14042430.0, "reward": 0.0, "reward_std": 0.18555021286010742, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 770.28125, "completions/mean_terminated_length": 671.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 1.5185185185185186, "grad_norm": 0.9283628572890374, "kl": 0.342041015625, "learning_rate": 4.0853792968765765e-07, "loss": -0.0245, "num_tokens": 14073871.0, "reward": -2.3283064365386963e-09, "reward_std": 0.04620163142681122, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 742.65625, "completions/mean_terminated_length": 677.7307739257812, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 1.5216049382716048, "grad_norm": 0.007932142220837816, "kl": 0.312255859375, "learning_rate": 4.081510587543784e-07, "loss": 0.0003, "num_tokens": 14104008.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 711.0, "completions/mean_terminated_length": 690.1333618164062, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 1.5246913580246915, "grad_norm": 0.48836777635870254, "kl": 0.34423828125, "learning_rate": 4.0776355538109285e-07, "loss": 0.0342, "num_tokens": 14132764.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 746.09375, "completions/mean_terminated_length": 681.9615478515625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.5277777777777777, "grad_norm": 0.7484600599280975, "kl": 0.343017578125, "learning_rate": 4.073754211174123e-07, "loss": -0.02, "num_tokens": 14162819.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 804.84375, "completions/mean_terminated_length": 705.227294921875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 1.5308641975308643, "grad_norm": 0.011511269890448983, "kl": 0.3104248046875, "learning_rate": 4.069866575154706e-07, "loss": 0.0003, "num_tokens": 14195318.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 801.3125, "completions/mean_terminated_length": 714.1739501953125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.5339506172839505, "grad_norm": 0.41349112183922404, "kl": 0.3045654296875, "learning_rate": 4.0659726612991853e-07, "loss": 0.03, "num_tokens": 14227284.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 746.71875, "completions/mean_terminated_length": 682.7307739257812, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.5370370370370372, "grad_norm": 1.0345874828431705, "kl": 0.3349609375, "learning_rate": 4.062072485179172e-07, "loss": -0.0447, "num_tokens": 14257171.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 724.40625, "completions/mean_terminated_length": 681.607177734375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 1.5401234567901234, "grad_norm": 0.9216301631012608, "kl": 0.306640625, "learning_rate": 4.0581660623913216e-07, "loss": -0.0125, "num_tokens": 14287008.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 718.78125, "completions/mean_terminated_length": 675.1785888671875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.5432098765432098, "grad_norm": 0.008509882765931261, "kl": 0.301513671875, "learning_rate": 4.0542534085572677e-07, "loss": 0.0003, "num_tokens": 14316409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 816.28125, "completions/mean_terminated_length": 758.1199951171875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 1.5462962962962963, "grad_norm": 0.8310320267661888, "kl": 0.3087158203125, "learning_rate": 4.050334539323563e-07, "loss": -0.0397, "num_tokens": 14348938.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 758.40625, "completions/mean_terminated_length": 709.2222290039062, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 1.5493827160493827, "grad_norm": 0.5701134440013239, "kl": 0.3228759765625, "learning_rate": 4.046409470361615e-07, "loss": 0.0109, "num_tokens": 14379459.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 829.0625, "completions/mean_terminated_length": 752.7825927734375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.5524691358024691, "grad_norm": 0.9379343393898524, "kl": 0.357177734375, "learning_rate": 4.0424782173676235e-07, "loss": 0.0006, "num_tokens": 14413025.0, "reward": -1.862645149230957e-09, "reward_std": 0.13805748522281647, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 776.5625, "completions/mean_terminated_length": 719.4615478515625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.5555555555555556, "grad_norm": 0.008022671684356462, "kl": 0.30859375, "learning_rate": 4.0385407960625185e-07, "loss": 0.0003, "num_tokens": 14444223.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 813.5, "completions/mean_terminated_length": 731.1304321289062, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.558641975308642, "grad_norm": 0.6535608066536647, "kl": 0.2850341796875, "learning_rate": 4.034597222191896e-07, "loss": 0.0175, "num_tokens": 14477047.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 732.5625, "completions/mean_terminated_length": 635.4166870117188, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 1.5617283950617284, "grad_norm": 1.3287593993080316, "kl": 0.30810546875, "learning_rate": 4.030647511525956e-07, "loss": -0.0387, "num_tokens": 14506753.0, "reward": -3.725290298461914e-09, "reward_std": 0.18145698308944702, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 751.8125, "completions/mean_terminated_length": 689.0, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.5648148148148149, "grad_norm": 1.3456393758009917, "kl": 0.359619140625, "learning_rate": 4.0266916798594417e-07, "loss": -0.0362, "num_tokens": 14537319.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 764.4375, "completions/mean_terminated_length": 691.760009765625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.567901234567901, "grad_norm": 1.1420903046945365, "kl": 0.344970703125, "learning_rate": 4.02272974301157e-07, "loss": -0.041, "num_tokens": 14568101.0, "reward": -1.862645149230957e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 684.59375, "completions/mean_terminated_length": 649.4827270507812, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.5709876543209877, "grad_norm": 1.424125050178182, "kl": 0.3287353515625, "learning_rate": 4.018761716825974e-07, "loss": -0.0406, "num_tokens": 14596052.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 739.375, "completions/mean_terminated_length": 659.6799926757812, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.574074074074074, "grad_norm": 0.5340550821616038, "kl": 0.3369140625, "learning_rate": 4.014787617170639e-07, "loss": -0.0007, "num_tokens": 14625908.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 789.0, "completions/mean_terminated_length": 665.90478515625, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.5771604938271606, "grad_norm": 1.642133768672505, "kl": 0.3233642578125, "learning_rate": 4.010807459937836e-07, "loss": -0.0763, "num_tokens": 14658096.0, "reward": -3.259629011154175e-09, "reward_std": 0.1841180920600891, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 737.8125, "completions/mean_terminated_length": 696.9285888671875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.5802469135802468, "grad_norm": 0.991364149264096, "kl": 0.3172607421875, "learning_rate": 4.006821261044061e-07, "loss": -0.0026, "num_tokens": 14687938.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 664.59375, "completions/mean_terminated_length": 640.6333618164062, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 1.5833333333333335, "grad_norm": 1.0790011938421236, "kl": 0.28271484375, "learning_rate": 4.002829036429971e-07, "loss": -0.0612, "num_tokens": 14715365.0, "reward": 0.0, "reward_std": 0.15626253187656403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 807.03125, "completions/mean_terminated_length": 722.1304321289062, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 1.5864197530864197, "grad_norm": 0.5004503459877179, "kl": 0.3203125, "learning_rate": 3.998830802060317e-07, "loss": 0.011, "num_tokens": 14748206.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 675.125, "completions/mean_terminated_length": 663.8709716796875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 1.5895061728395061, "grad_norm": 0.8693506099410626, "kl": 0.315673828125, "learning_rate": 3.994826573923886e-07, "loss": 0.0045, "num_tokens": 14776182.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 778.0, "completions/mean_terminated_length": 696.0, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.5925925925925926, "grad_norm": 1.365680137710999, "kl": 0.3133544921875, "learning_rate": 3.9908163680334326e-07, "loss": 0.0466, "num_tokens": 14807814.0, "reward": 0.0, "reward_std": 0.15241660177707672, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 791.0, "completions/mean_terminated_length": 685.0909423828125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.595679012345679, "grad_norm": 0.8125239703636703, "kl": 0.3349609375, "learning_rate": 3.9868002004256165e-07, "loss": -0.0124, "num_tokens": 14839902.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 678.15625, "completions/mean_terminated_length": 614.1111450195312, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 1.5987654320987654, "grad_norm": 1.1162676468006334, "kl": 0.34130859375, "learning_rate": 3.982778087160935e-07, "loss": -0.002, "num_tokens": 14867803.0, "reward": -1.862645149230957e-09, "reward_std": 0.17414312064647675, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 706.9375, "completions/mean_terminated_length": 661.6428833007812, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.6018518518518519, "grad_norm": 0.9426113162928746, "kl": 0.337890625, "learning_rate": 3.9787500443236664e-07, "loss": 0.0316, "num_tokens": 14897001.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 730.125, "completions/mean_terminated_length": 632.1666870117188, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.6049382716049383, "grad_norm": 1.2663004119632248, "kl": 0.3106689453125, "learning_rate": 3.9747160880217994e-07, "loss": 0.0168, "num_tokens": 14927069.0, "reward": 0.0, "reward_std": 0.1774374544620514, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 716.90625, "completions/mean_terminated_length": 660.0370483398438, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.6080246913580247, "grad_norm": 0.7914569497747325, "kl": 0.3240966796875, "learning_rate": 3.9706762343869705e-07, "loss": -0.0565, "num_tokens": 14956450.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 713.71875, "completions/mean_terminated_length": 693.0333862304688, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.6111111111111112, "grad_norm": 0.7631646064641017, "kl": 0.316162109375, "learning_rate": 3.966630499574397e-07, "loss": -0.0269, "num_tokens": 14985545.0, "reward": 9.313225746154785e-10, "reward_std": 0.02981424145400524, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 734.0, "completions/mean_terminated_length": 637.3333740234375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 1.6141975308641974, "grad_norm": 0.8161152833021108, "kl": 0.349365234375, "learning_rate": 3.9625788997628196e-07, "loss": -0.0286, "num_tokens": 15015301.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 711.125, "completions/mean_terminated_length": 638.923095703125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.617283950617284, "grad_norm": 0.028883129384840816, "kl": 0.370361328125, "learning_rate": 3.958521451154428e-07, "loss": 0.0004, "num_tokens": 15044109.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 700.5625, "completions/mean_terminated_length": 654.357177734375, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 1.6203703703703702, "grad_norm": 0.5909773514766553, "kl": 0.3387451171875, "learning_rate": 3.954458169974805e-07, "loss": 0.0013, "num_tokens": 15072655.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 769.40625, "completions/mean_terminated_length": 684.5416870117188, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.623456790123457, "grad_norm": 0.016783433466779622, "kl": 0.3369140625, "learning_rate": 3.950389072472855e-07, "loss": 0.0003, "num_tokens": 15104216.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 687.125, "completions/mean_terminated_length": 664.6666870117188, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 1.626543209876543, "grad_norm": 0.8239224947604685, "kl": 0.2880859375, "learning_rate": 3.9463141749207425e-07, "loss": -0.0072, "num_tokens": 15132948.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 774.96875, "completions/mean_terminated_length": 705.239990234375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.6296296296296298, "grad_norm": 1.238861288818675, "kl": 0.346923828125, "learning_rate": 3.9422334936138255e-07, "loss": -0.0617, "num_tokens": 15163967.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 783.40625, "completions/mean_terminated_length": 674.0454711914062, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.632716049382716, "grad_norm": 0.9515951609164494, "kl": 0.2908935546875, "learning_rate": 3.938147044870594e-07, "loss": -0.0016, "num_tokens": 15196000.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 771.65625, "completions/mean_terminated_length": 672.9130859375, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.6358024691358026, "grad_norm": 0.7980404553042689, "kl": 0.323974609375, "learning_rate": 3.934054845032598e-07, "loss": -0.0048, "num_tokens": 15227053.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 756.71875, "completions/mean_terminated_length": 667.625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 1.6388888888888888, "grad_norm": 0.007225599201778235, "kl": NaN, "learning_rate": 3.9299569104643876e-07, "loss": 0.0003, "num_tokens": 15258404.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 741.96875, "completions/mean_terminated_length": 701.6785888671875, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 1.6419753086419753, "grad_norm": 0.008925163796735084, "kl": 0.3076171875, "learning_rate": 3.925853257553445e-07, "loss": 0.0003, "num_tokens": 15288595.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 763.75, "completions/mean_terminated_length": 726.5714721679688, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 1.6450617283950617, "grad_norm": 1.3739270666570305, "kl": 0.3138427734375, "learning_rate": 3.921743902710122e-07, "loss": -0.0597, "num_tokens": 15319635.0, "reward": 0.0, "reward_std": 0.15000322461128235, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 729.25, "completions/mean_terminated_length": 646.719970703125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.6481481481481481, "grad_norm": 0.9423901024229169, "kl": 0.34130859375, "learning_rate": 3.917628862367569e-07, "loss": 0.0212, "num_tokens": 15349663.0, "reward": -3.725290298461914e-09, "reward_std": 0.11199356615543365, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 816.5, "completions/mean_terminated_length": 707.8095092773438, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 1.6512345679012346, "grad_norm": 1.0973894313986772, "kl": 0.2958984375, "learning_rate": 3.913508152981674e-07, "loss": -0.0598, "num_tokens": 15382031.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 740.53125, "completions/mean_terminated_length": 661.1599731445312, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.654320987654321, "grad_norm": 0.017496266324640054, "kl": 0.330810546875, "learning_rate": 3.909381791030998e-07, "loss": 0.0003, "num_tokens": 15411776.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 767.96875, "completions/mean_terminated_length": 682.625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 1.6574074074074074, "grad_norm": 1.0650048112777157, "kl": 0.3212890625, "learning_rate": 3.905249793016702e-07, "loss": -0.0223, "num_tokens": 15442927.0, "reward": 0.028124995529651642, "reward_std": 0.09598580002784729, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 788.25, "completions/mean_terminated_length": 722.239990234375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 1.6604938271604939, "grad_norm": 0.7107384859454141, "kl": 0.2861328125, "learning_rate": 3.9011121754624865e-07, "loss": 0.014, "num_tokens": 15474435.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 745.125, "completions/mean_terminated_length": 693.4815063476562, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 1.6635802469135803, "grad_norm": 0.008815093813455377, "kl": 0.3265380859375, "learning_rate": 3.8969689549145266e-07, "loss": 0.0003, "num_tokens": 15505515.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 673.625, "completions/mean_terminated_length": 623.5714721679688, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 1.6666666666666665, "grad_norm": 0.021289576005990775, "kl": 0.366943359375, "learning_rate": 3.8928201479414024e-07, "loss": 0.0004, "num_tokens": 15533119.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 683.21875, "completions/mean_terminated_length": 634.5357666015625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 1.6697530864197532, "grad_norm": 0.7322246144769231, "kl": 0.314697265625, "learning_rate": 3.888665771134032e-07, "loss": -0.0059, "num_tokens": 15561374.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 847.65625, "completions/mean_terminated_length": 710.5, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 1.6728395061728394, "grad_norm": 1.106478983479238, "kl": 0.27490234375, "learning_rate": 3.8845058411056095e-07, "loss": -0.0571, "num_tokens": 15595731.0, "reward": -3.725290298461914e-09, "reward_std": 0.11199356615543365, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 774.1875, "completions/mean_terminated_length": 676.434814453125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 1.675925925925926, "grad_norm": 1.1423664079438094, "kl": 0.320556640625, "learning_rate": 3.880340374491535e-07, "loss": -0.0267, "num_tokens": 15627549.0, "reward": -9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 742.875, "completions/mean_terminated_length": 664.1599731445312, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.6790123456790123, "grad_norm": 0.8397405428454164, "kl": 0.352783203125, "learning_rate": 3.8761693879493495e-07, "loss": -0.061, "num_tokens": 15657561.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 845.125, "completions/mean_terminated_length": 737.7999877929688, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 1.682098765432099, "grad_norm": 0.009647961507256286, "kl": 0.313720703125, "learning_rate": 3.871992898158667e-07, "loss": 0.0003, "num_tokens": 15691197.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 780.625, "completions/mean_terminated_length": 724.4615478515625, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 1.6851851851851851, "grad_norm": 0.9076764410303536, "kl": 0.2767333984375, "learning_rate": 3.867810921821112e-07, "loss": -0.0132, "num_tokens": 15722969.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 770.5625, "completions/mean_terminated_length": 712.0769653320312, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 1.6882716049382716, "grad_norm": 1.1350196498890335, "kl": 0.3204345703125, "learning_rate": 3.863623475660245e-07, "loss": 0.0176, "num_tokens": 15754063.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 793.4375, "completions/mean_terminated_length": 728.8800048828125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 1.691358024691358, "grad_norm": 0.7098311299876164, "kl": 0.3018798828125, "learning_rate": 3.859430576421503e-07, "loss": -0.0029, "num_tokens": 15785921.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 744.84375, "completions/mean_terminated_length": 715.9655151367188, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 1.6944444444444444, "grad_norm": 0.8675803468787185, "kl": 0.29931640625, "learning_rate": 3.855232240872128e-07, "loss": 0.03, "num_tokens": 15816460.0, "reward": 0.0, "reward_std": 0.13003921508789062, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 693.9375, "completions/mean_terminated_length": 646.7857666015625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 1.6975308641975309, "grad_norm": 1.3689599144314457, "kl": 0.35400390625, "learning_rate": 3.851028485801105e-07, "loss": -0.0666, "num_tokens": 15844722.0, "reward": 0.028124993667006493, "reward_std": 0.09598580002784729, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 801.03125, "completions/mean_terminated_length": 726.7083740234375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.7006172839506173, "grad_norm": 0.8281656840707402, "kl": 0.28076171875, "learning_rate": 3.8468193280190864e-07, "loss": -0.0107, "num_tokens": 15876527.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 701.84375, "completions/mean_terminated_length": 655.8214721679688, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 1.7037037037037037, "grad_norm": 0.00877201557433776, "kl": 0.335205078125, "learning_rate": 3.842604784358333e-07, "loss": 0.0003, "num_tokens": 15905018.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 796.9375, "completions/mean_terminated_length": 721.25, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 1.7067901234567902, "grad_norm": 0.5638693262466036, "kl": 0.327880859375, "learning_rate": 3.8383848716726444e-07, "loss": 0.0316, "num_tokens": 15936972.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 766.90625, "completions/mean_terminated_length": 719.2963256835938, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 1.7098765432098766, "grad_norm": 1.2138127334164506, "kl": 0.320556640625, "learning_rate": 3.8341596068372874e-07, "loss": 0.0101, "num_tokens": 15967969.0, "reward": 0.0, "reward_std": 0.15677303075790405, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 739.75, "completions/mean_terminated_length": 699.1428833007812, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 1.7129629629629628, "grad_norm": 0.7032138830186002, "kl": 0.32177734375, "learning_rate": 3.829929006748934e-07, "loss": -0.0115, "num_tokens": 15998525.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 840.09375, "completions/mean_terminated_length": 768.1304321289062, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 1.7160493827160495, "grad_norm": 0.008068159736726943, "kl": 0.2872314453125, "learning_rate": 3.8256930883255927e-07, "loss": 0.0003, "num_tokens": 16032160.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 736.59375, "completions/mean_terminated_length": 656.1199951171875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 1.7191358024691357, "grad_norm": 0.6726821694845868, "kl": 0.3104248046875, "learning_rate": 3.8214518685065377e-07, "loss": -0.0004, "num_tokens": 16062155.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 795.03125, "completions/mean_terminated_length": 742.1923217773438, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 1.7222222222222223, "grad_norm": 0.8514285385130014, "kl": 0.327392578125, "learning_rate": 3.817205364252244e-07, "loss": -0.0153, "num_tokens": 16093984.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 787.96875, "completions/mean_terminated_length": 721.8800048828125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.7253086419753085, "grad_norm": 0.9523489909182068, "kl": 0.33056640625, "learning_rate": 3.8129535925443187e-07, "loss": -0.0247, "num_tokens": 16125891.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 771.625, "completions/mean_terminated_length": 700.9599609375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 1.7283950617283952, "grad_norm": 1.0422373945268346, "kl": 0.34423828125, "learning_rate": 3.8086965703854336e-07, "loss": -0.0144, "num_tokens": 16157119.0, "reward": -1.862645149230957e-09, "reward_std": 0.18104533851146698, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 797.25, "completions/mean_terminated_length": 733.760009765625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.7314814814814814, "grad_norm": 0.007672943135312286, "kl": 0.335693359375, "learning_rate": 3.8044343147992563e-07, "loss": 0.0003, "num_tokens": 16189839.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 796.34375, "completions/mean_terminated_length": 720.4583740234375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.734567901234568, "grad_norm": 0.7619376986860051, "kl": 0.320556640625, "learning_rate": 3.8001668428303847e-07, "loss": 0.0218, "num_tokens": 16222026.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 769.59375, "completions/mean_terminated_length": 733.2500610351562, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.7376543209876543, "grad_norm": 0.014792220069454454, "kl": 0.342041015625, "learning_rate": 3.7958941715442726e-07, "loss": 0.0003, "num_tokens": 16253197.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 748.53125, "completions/mean_terminated_length": 671.3999633789062, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 1.7407407407407407, "grad_norm": 0.4933010943286016, "kl": 0.2974853515625, "learning_rate": 3.791616318027171e-07, "loss": 0.0099, "num_tokens": 16283482.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 842.96875, "completions/mean_terminated_length": 748.1428833007812, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 1.7438271604938271, "grad_norm": 0.00825346935935361, "kl": 0.2969970703125, "learning_rate": 3.78733329938605e-07, "loss": 0.0003, "num_tokens": 16317837.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 774.53125, "completions/mean_terminated_length": 676.9130859375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 1.7469135802469136, "grad_norm": 0.7834725540881555, "kl": 0.307861328125, "learning_rate": 3.7830451327485367e-07, "loss": -0.007, "num_tokens": 16349026.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 742.03125, "completions/mean_terminated_length": 676.9615478515625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 1.75, "grad_norm": 0.7896783411535454, "kl": 0.309326171875, "learning_rate": 3.778751835262847e-07, "loss": -0.0235, "num_tokens": 16378831.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 753.5, "completions/mean_terminated_length": 725.5172119140625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 1.7530864197530864, "grad_norm": 0.4569321087531079, "kl": 0.32763671875, "learning_rate": 3.7744534240977085e-07, "loss": -0.0215, "num_tokens": 16409591.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 775.28125, "completions/mean_terminated_length": 717.8846435546875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.7561728395061729, "grad_norm": 0.8922151507505256, "kl": 0.3408203125, "learning_rate": 3.7701499164423045e-07, "loss": 0.0143, "num_tokens": 16440616.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 761.5, "completions/mean_terminated_length": 658.7825927734375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.7592592592592593, "grad_norm": 1.6746867773094058, "kl": 0.3203125, "learning_rate": 3.7658413295061974e-07, "loss": -0.0416, "num_tokens": 16471664.0, "reward": 0.0, "reward_std": 0.15466603636741638, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 823.9375, "completions/mean_terminated_length": 745.6521606445312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.7623456790123457, "grad_norm": 1.330520862716196, "kl": 0.2933349609375, "learning_rate": 3.7615276805192595e-07, "loss": 0.0483, "num_tokens": 16504522.0, "reward": 3.725290298461914e-09, "reward_std": 0.15880176424980164, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 747.09375, "completions/mean_terminated_length": 669.5599975585938, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 1.765432098765432, "grad_norm": 0.7477047350570117, "kl": 0.3272705078125, "learning_rate": 3.7572089867316075e-07, "loss": 0.0122, "num_tokens": 16534861.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 781.40625, "completions/mean_terminated_length": 700.5416870117188, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.7685185185185186, "grad_norm": 1.0866824842532468, "kl": 0.3203125, "learning_rate": 3.7528852654135323e-07, "loss": 0.0174, "num_tokens": 16566714.0, "reward": 0.02812499739229679, "reward_std": 0.1024516224861145, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 715.1875, "completions/mean_terminated_length": 683.2413940429688, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 1.7716049382716048, "grad_norm": 1.0340351299850523, "kl": 0.345458984375, "learning_rate": 3.7485565338554294e-07, "loss": -0.023, "num_tokens": 16596320.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 816.0625, "completions/mean_terminated_length": 734.6956787109375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 1.7746913580246915, "grad_norm": 0.6909734053595269, "kl": 0.352294921875, "learning_rate": 3.7442228093677296e-07, "loss": 0.0024, "num_tokens": 16628850.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 768.625, "completions/mean_terminated_length": 652.5454711914062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.7777777777777777, "grad_norm": 0.7519408898271244, "kl": 0.372802734375, "learning_rate": 3.7398841092808307e-07, "loss": -0.0118, "num_tokens": 16660642.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 716.0, "completions/mean_terminated_length": 644.923095703125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 1.7808641975308643, "grad_norm": 1.0225990435274304, "kl": 0.3367919921875, "learning_rate": 3.735540450945028e-07, "loss": 0.0025, "num_tokens": 16689590.0, "reward": 0.02812499739229679, "reward_std": 0.07932206988334656, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 752.375, "completions/mean_terminated_length": 702.0740966796875, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.7839506172839505, "grad_norm": 0.49306929501114305, "kl": 0.333251953125, "learning_rate": 3.731191851730443e-07, "loss": -0.0014, "num_tokens": 16720414.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 805.4375, "completions/mean_terminated_length": 706.0909423828125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 1.7870370370370372, "grad_norm": 0.9803899573700692, "kl": 0.295166015625, "learning_rate": 3.7268383290269583e-07, "loss": 0.0148, "num_tokens": 16753040.0, "reward": 2.7939677238464355e-09, "reward_std": 0.17674319446086884, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096889972687, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 779.5625, "completions/mean_terminated_length": 683.9130859375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 1.7901234567901234, "grad_norm": 1.1735776018585131, "kl": 0.317138671875, "learning_rate": 3.7224799002441427e-07, "loss": -0.0431, "num_tokens": 16784862.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 698.40625, "completions/mean_terminated_length": 638.1111450195312, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 1.7932098765432098, "grad_norm": 0.010817240832333085, "kl": 0.326171875, "learning_rate": 3.718116582811186e-07, "loss": 0.0003, "num_tokens": 16813619.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 776.8125, "completions/mean_terminated_length": 707.5999755859375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.7962962962962963, "grad_norm": 0.5015604487434996, "kl": 0.3193359375, "learning_rate": 3.713748394176827e-07, "loss": 0.0167, "num_tokens": 16844805.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 810.40625, "completions/mean_terminated_length": 726.8261108398438, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.7993827160493827, "grad_norm": 0.00802844464316047, "kl": 0.3101806640625, "learning_rate": 3.7093753518092853e-07, "loss": 0.0003, "num_tokens": 16877326.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 747.90625, "completions/mean_terminated_length": 684.1923217773438, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.8024691358024691, "grad_norm": 1.0248104785700363, "kl": 0.3275146484375, "learning_rate": 3.704997473196187e-07, "loss": -0.0439, "num_tokens": 16907683.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 717.71875, "completions/mean_terminated_length": 647.0385131835938, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.8055555555555556, "grad_norm": 0.008714448696403034, "kl": 0.2947998046875, "learning_rate": 3.7006147758445017e-07, "loss": 0.0003, "num_tokens": 16936930.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 742.53125, "completions/mean_terminated_length": 632.3912963867188, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 1.808641975308642, "grad_norm": 2.231146106662793, "kl": 0.29833984375, "learning_rate": 3.696227277280467e-07, "loss": -0.0354, "num_tokens": 16967423.0, "reward": -3.259629011154175e-09, "reward_std": 0.17682674527168274, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 687.40625, "completions/mean_terminated_length": 676.54833984375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.8117283950617284, "grad_norm": 1.4562902534305326, "kl": 0.28857421875, "learning_rate": 3.691834995049522e-07, "loss": -0.0859, "num_tokens": 16995588.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 812.75, "completions/mean_terminated_length": 702.0952758789062, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 1.8148148148148149, "grad_norm": 0.9753903401211428, "kl": 0.314697265625, "learning_rate": 3.687437946716234e-07, "loss": -0.017, "num_tokens": 17027744.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 800.625, "completions/mean_terminated_length": 666.6000366210938, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.817901234567901, "grad_norm": 0.8793951481635643, "kl": 0.30029296875, "learning_rate": 3.68303614986423e-07, "loss": -0.0106, "num_tokens": 17059916.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 764.6875, "completions/mean_terminated_length": 716.6666870117188, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.8209876543209877, "grad_norm": 1.0182024225613862, "kl": 0.3121337890625, "learning_rate": 3.6786296220961277e-07, "loss": 0.0119, "num_tokens": 17090630.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 763.375, "completions/mean_terminated_length": 703.2307739257812, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.824074074074074, "grad_norm": 0.4918459088357609, "kl": 0.308837890625, "learning_rate": 3.6742183810334605e-07, "loss": 0.0205, "num_tokens": 17121198.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 782.09375, "completions/mean_terminated_length": 701.4583740234375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.8271604938271606, "grad_norm": 0.541075467064785, "kl": 0.303466796875, "learning_rate": 3.6698024443166134e-07, "loss": -0.0013, "num_tokens": 17152845.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 793.1875, "completions/mean_terminated_length": 688.2727661132812, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.8302469135802468, "grad_norm": 1.224660735592709, "kl": 0.2767333984375, "learning_rate": 3.6653818296047466e-07, "loss": -0.0056, "num_tokens": 17184691.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 720.0, "completions/mean_terminated_length": 663.7037353515625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.8333333333333335, "grad_norm": 0.9040897384601864, "kl": 0.322509765625, "learning_rate": 3.660956554575729e-07, "loss": -0.0367, "num_tokens": 17213943.0, "reward": -1.862645149230957e-09, "reward_std": 0.04620163142681122, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 795.78125, "completions/mean_terminated_length": 706.478271484375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.8364197530864197, "grad_norm": 0.830769741431794, "kl": 0.343505859375, "learning_rate": 3.656526636926065e-07, "loss": 0.0037, "num_tokens": 17246256.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 738.75, "completions/mean_terminated_length": 698.0000610351562, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.8395061728395061, "grad_norm": 1.057906036541086, "kl": 0.30712890625, "learning_rate": 3.652092094370826e-07, "loss": -0.0391, "num_tokens": 17276232.0, "reward": -1.862645149230957e-09, "reward_std": 0.15134452283382416, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 714.8125, "completions/mean_terminated_length": 682.8275756835938, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 1.8425925925925926, "grad_norm": 3.485929854350874, "kl": 0.3009033203125, "learning_rate": 3.647652944643577e-07, "loss": -0.2627, "num_tokens": 17305438.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 799.71875, "completions/mean_terminated_length": 697.7727661132812, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 1.845679012345679, "grad_norm": 0.8559240818386872, "kl": 0.3125, "learning_rate": 3.6432092054963055e-07, "loss": -0.0229, "num_tokens": 17337949.0, "reward": -3.725290298461914e-09, "reward_std": 0.13241708278656006, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 823.9375, "completions/mean_terminated_length": 719.1428833007812, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 1.8487654320987654, "grad_norm": 0.009840729619351467, "kl": 0.279052734375, "learning_rate": 3.638760894699355e-07, "loss": 0.0003, "num_tokens": 17370767.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 735.65625, "completions/mean_terminated_length": 669.1154174804688, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 1.8518518518518519, "grad_norm": 0.7490011171282679, "kl": 0.2989501953125, "learning_rate": 3.6343080300413497e-07, "loss": 0.0305, "num_tokens": 17400700.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 742.125, "completions/mean_terminated_length": 689.9259033203125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 1.8549382716049383, "grad_norm": 1.4493332077099517, "kl": 0.276611328125, "learning_rate": 3.629850629329124e-07, "loss": -0.0752, "num_tokens": 17431296.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 763.96875, "completions/mean_terminated_length": 703.9615478515625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.8580246913580247, "grad_norm": 0.7347059578264971, "kl": 0.3212890625, "learning_rate": 3.625388710387651e-07, "loss": -0.012, "num_tokens": 17462131.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 823.3125, "completions/mean_terminated_length": 718.1904907226562, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.8611111111111112, "grad_norm": 1.257451334629759, "kl": 0.2890625, "learning_rate": 3.6209222910599746e-07, "loss": -0.0073, "num_tokens": 17495037.0, "reward": 3.725290298461914e-09, "reward_std": 0.15673846006393433, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 743.46875, "completions/mean_terminated_length": 678.7307739257812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 1.8641975308641974, "grad_norm": 0.5034951058170785, "kl": 0.2926025390625, "learning_rate": 3.616451389207133e-07, "loss": 0.0217, "num_tokens": 17525484.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 773.3125, "completions/mean_terminated_length": 715.4615478515625, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 1.867283950617284, "grad_norm": 0.7588473545628874, "kl": 0.2994384765625, "learning_rate": 3.611976022708091e-07, "loss": 0.0124, "num_tokens": 17556462.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 701.53125, "completions/mean_terminated_length": 627.1154174804688, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 1.8703703703703702, "grad_norm": 0.767726931403257, "kl": 0.293212890625, "learning_rate": 3.6074962094596676e-07, "loss": 0.0028, "num_tokens": 17584959.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 771.34375, "completions/mean_terminated_length": 687.125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.873456790123457, "grad_norm": 0.9241261242502699, "kl": 0.2957763671875, "learning_rate": 3.603011967376464e-07, "loss": -0.0056, "num_tokens": 17616442.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 762.09375, "completions/mean_terminated_length": 674.7916870117188, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 1.876543209876543, "grad_norm": 0.622414233832134, "kl": 0.3138427734375, "learning_rate": 3.598523314390792e-07, "loss": 0.0083, "num_tokens": 17647905.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 697.84375, "completions/mean_terminated_length": 676.1000366210938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.8796296296296298, "grad_norm": 1.1736021499772744, "kl": 0.314453125, "learning_rate": 3.594030268452601e-07, "loss": 0.0237, "num_tokens": 17676520.0, "reward": 0.0, "reward_std": 0.15075348317623138, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 753.90625, "completions/mean_terminated_length": 703.888916015625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.882716049382716, "grad_norm": 0.45725316166440305, "kl": 0.290771484375, "learning_rate": 3.5895328475294106e-07, "loss": 0.0203, "num_tokens": 17707237.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 705.96875, "completions/mean_terminated_length": 673.0689697265625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.8858024691358026, "grad_norm": 0.8632045276371411, "kl": 0.30810546875, "learning_rate": 3.585031069606234e-07, "loss": -0.0005, "num_tokens": 17736384.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 694.03125, "completions/mean_terminated_length": 659.8965454101562, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.8888888888888888, "grad_norm": 0.6344068055819815, "kl": 0.3033447265625, "learning_rate": 3.5805249526855074e-07, "loss": 0.0159, "num_tokens": 17764777.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 651.09375, "completions/mean_terminated_length": 639.0645141601562, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 1.8919753086419753, "grad_norm": 0.8866730404607887, "kl": 0.3212890625, "learning_rate": 3.5760145147870204e-07, "loss": -0.0119, "num_tokens": 17791632.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 760.3125, "completions/mean_terminated_length": 686.47998046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 1.8950617283950617, "grad_norm": 0.7948213609530317, "kl": 0.28515625, "learning_rate": 3.571499773947839e-07, "loss": -0.0038, "num_tokens": 17822462.0, "reward": 0.0, "reward_std": 0.15908902883529663, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 789.25, "completions/mean_terminated_length": 682.5454711914062, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 1.8981481481481481, "grad_norm": 0.564664960370857, "kl": 0.289794921875, "learning_rate": 3.5669807482222395e-07, "loss": -0.012, "num_tokens": 17854338.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 763.125, "completions/mean_terminated_length": 702.923095703125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.9012345679012346, "grad_norm": 0.5785058641997257, "kl": 0.2777099609375, "learning_rate": 3.562457455681633e-07, "loss": 0.0045, "num_tokens": 17885690.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 659.125, "completions/mean_terminated_length": 634.800048828125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.904320987654321, "grad_norm": 0.529554309890252, "kl": 0.284912109375, "learning_rate": 3.557929914414491e-07, "loss": 0.0019, "num_tokens": 17912774.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 720.34375, "completions/mean_terminated_length": 676.9642944335938, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.9074074074074074, "grad_norm": 1.36796285909082, "kl": 0.330810546875, "learning_rate": 3.553398142526277e-07, "loss": -0.0637, "num_tokens": 17941565.0, "reward": -3.725290298461914e-09, "reward_std": 0.21052932739257812, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 776.40625, "completions/mean_terminated_length": 663.8636474609375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.9104938271604939, "grad_norm": 1.223456191622778, "kl": 0.2587890625, "learning_rate": 3.5488621581393736e-07, "loss": 0.019, "num_tokens": 17972954.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 738.90625, "completions/mean_terminated_length": 673.1154174804688, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 1.9135802469135803, "grad_norm": 0.9181830067181223, "kl": 0.4039306640625, "learning_rate": 3.5443219793930073e-07, "loss": 0.0137, "num_tokens": 18003263.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 859.6875, "completions/mean_terminated_length": 731.888916015625, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 1.9166666666666665, "grad_norm": 1.162305257769272, "kl": 0.2412109375, "learning_rate": 3.5397776244431794e-07, "loss": 0.033, "num_tokens": 18037797.0, "reward": -2.7939677238464355e-09, "reward_std": 0.2121918946504593, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -6.51925802230835e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 771.8125, "completions/mean_terminated_length": 713.6154174804688, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 1.9197530864197532, "grad_norm": 1.0754812877536444, "kl": 0.293212890625, "learning_rate": 3.535229111462589e-07, "loss": 0.0009, "num_tokens": 18069319.0, "reward": 0.0, "reward_std": 0.18565019965171814, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 795.53125, "completions/mean_terminated_length": 719.375, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.9228395061728394, "grad_norm": 1.418906680488901, "kl": 0.246337890625, "learning_rate": 3.530676458640567e-07, "loss": 0.079, "num_tokens": 18101908.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 764.96875, "completions/mean_terminated_length": 692.4400024414062, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.925925925925926, "grad_norm": 0.6694783362066277, "kl": 0.2960205078125, "learning_rate": 3.5261196841829957e-07, "loss": 0.0058, "num_tokens": 18132931.0, "reward": 0.0, "reward_std": 0.12547743320465088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 744.375, "completions/mean_terminated_length": 679.84619140625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 1.9290123456790123, "grad_norm": 1.067940047943812, "kl": 0.31640625, "learning_rate": 3.521558806312241e-07, "loss": -0.0168, "num_tokens": 18162823.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 756.375, "completions/mean_terminated_length": 706.8148193359375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.932098765432099, "grad_norm": 0.7395763172607944, "kl": 0.279052734375, "learning_rate": 3.5169938432670775e-07, "loss": 0.0014, "num_tokens": 18193815.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 657.1875, "completions/mean_terminated_length": 645.3547973632812, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 1.9351851851851851, "grad_norm": 0.7412616344854341, "kl": 0.2867431640625, "learning_rate": 3.5124248133026187e-07, "loss": 0.0426, "num_tokens": 18221013.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 736.21875, "completions/mean_terminated_length": 695.107177734375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 1.9382716049382716, "grad_norm": 1.3118440074292663, "kl": 0.2803955078125, "learning_rate": 3.5078517346902384e-07, "loss": -0.0234, "num_tokens": 18251540.0, "reward": -3.725290298461914e-09, "reward_std": 0.22114333510398865, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 803.125, "completions/mean_terminated_length": 687.4285888671875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.941358024691358, "grad_norm": 0.824049243789038, "kl": 0.2554931640625, "learning_rate": 3.503274625717504e-07, "loss": 0.0075, "num_tokens": 18283848.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 676.9375, "completions/mean_terminated_length": 627.357177734375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.9444444444444444, "grad_norm": 0.8005612327423927, "kl": 0.309814453125, "learning_rate": 3.498693504688097e-07, "loss": 0.04, "num_tokens": 18311882.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 723.8125, "completions/mean_terminated_length": 654.5385131835938, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 1.9475308641975309, "grad_norm": 0.968269644106719, "kl": 0.326416015625, "learning_rate": 3.494108389921744e-07, "loss": -0.0321, "num_tokens": 18341316.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 793.46875, "completions/mean_terminated_length": 703.2608642578125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 1.9506172839506173, "grad_norm": 1.227757361012959, "kl": 0.2994384765625, "learning_rate": 3.4895192997541436e-07, "loss": 0.0301, "num_tokens": 18373583.0, "reward": -1.862645149230957e-09, "reward_std": 0.19315354526042938, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 709.09375, "completions/mean_terminated_length": 688.1000366210938, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 1.9537037037037037, "grad_norm": 1.0844100134163452, "kl": 0.278564453125, "learning_rate": 3.484926252536891e-07, "loss": 0.0508, "num_tokens": 18402806.0, "reward": 0.0, "reward_std": 0.1586425006389618, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 732.71875, "completions/mean_terminated_length": 665.5, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.9567901234567902, "grad_norm": 0.011084886665632733, "kl": 0.2913818359375, "learning_rate": 3.4803292666374047e-07, "loss": 0.0003, "num_tokens": 18432697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 622.65625, "completions/mean_terminated_length": 595.9000244140625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.9598765432098766, "grad_norm": 1.6160733763497246, "kl": 0.3094482421875, "learning_rate": 3.4757283604388546e-07, "loss": -0.0094, "num_tokens": 18458722.0, "reward": -3.725290298461914e-09, "reward_std": 0.18118225038051605, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 734.40625, "completions/mean_terminated_length": 637.875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.9629629629629628, "grad_norm": 1.7914416516037617, "kl": 0.274658203125, "learning_rate": 3.47112355234009e-07, "loss": 0.0618, "num_tokens": 18488591.0, "reward": 9.313225746154785e-10, "reward_std": 0.2114925980567932, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096293926239, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 762.71875, "completions/mean_terminated_length": 714.3333129882812, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 1.9660493827160495, "grad_norm": 1.0571098256792673, "kl": 0.248291015625, "learning_rate": 3.466514860755559e-07, "loss": 0.0368, "num_tokens": 18519174.0, "reward": 1.3969838619232178e-09, "reward_std": 0.14951469004154205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 640.75, "completions/mean_terminated_length": 586.0, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.9691358024691357, "grad_norm": 1.0766277591693325, "kl": 0.28857421875, "learning_rate": 3.4619023041152433e-07, "loss": 0.028, "num_tokens": 18545930.0, "reward": -1.862645149230957e-09, "reward_std": 0.18108326196670532, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 743.375, "completions/mean_terminated_length": 649.8333740234375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 1.9722222222222223, "grad_norm": 0.6608561339989446, "kl": 0.2994384765625, "learning_rate": 3.4572859008645796e-07, "loss": 0.0008, "num_tokens": 18576034.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 770.6875, "completions/mean_terminated_length": 712.2307739257812, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.9753086419753085, "grad_norm": 1.3288545908857277, "kl": 0.257568359375, "learning_rate": 3.452665669464386e-07, "loss": 0.0173, "num_tokens": 18607152.0, "reward": 7.450580596923828e-09, "reward_std": 0.20140354335308075, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 749.34375, "completions/mean_terminated_length": 657.7916870117188, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 1.9783950617283952, "grad_norm": 0.9487511446974353, "kl": 0.3446044921875, "learning_rate": 3.448041628390791e-07, "loss": -0.0056, "num_tokens": 18638011.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 682.15625, "completions/mean_terminated_length": 633.3214721679688, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.9814814814814814, "grad_norm": 1.1936306191580994, "kl": 0.3072509765625, "learning_rate": 3.443413796135159e-07, "loss": 0.0044, "num_tokens": 18666196.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 721.9375, "completions/mean_terminated_length": 621.25, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.984567901234568, "grad_norm": 0.024297146526663496, "kl": 0.293212890625, "learning_rate": 3.4387821912040116e-07, "loss": 0.0003, "num_tokens": 18695778.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 801.34375, "completions/mean_terminated_length": 700.1364135742188, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 1.9876543209876543, "grad_norm": 0.8239805671743939, "kl": 0.2957763671875, "learning_rate": 3.4341468321189574e-07, "loss": -0.013, "num_tokens": 18728109.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 789.8125, "completions/mean_terminated_length": 667.1428833007812, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.9907407407407407, "grad_norm": 1.135707835543851, "kl": 0.32373046875, "learning_rate": 3.4295077374166214e-07, "loss": -0.0133, "num_tokens": 18760591.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 667.0625, "completions/mean_terminated_length": 630.137939453125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 1.9938271604938271, "grad_norm": 0.9644947395422476, "kl": 0.3138427734375, "learning_rate": 3.4248649256485655e-07, "loss": -0.0243, "num_tokens": 18788005.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 728.34375, "completions/mean_terminated_length": 660.1154174804688, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 1.9969135802469136, "grad_norm": 0.8541538616070171, "kl": 0.30078125, "learning_rate": 3.4202184153812135e-07, "loss": -0.0279, "num_tokens": 18817956.0, "reward": -1.862645149230957e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 752.0, "completions/mean_terminated_length": 675.8399658203125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 2.0, "grad_norm": 1.109871309325268, "kl": 0.3179931640625, "learning_rate": 3.415568225195783e-07, "loss": -0.0034, "num_tokens": 18848804.0, "reward": -2.7939677238464355e-09, "reward_std": 0.18632806837558746, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 739.25, "completions/mean_terminated_length": 609.8181762695312, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 2.003086419753086, "grad_norm": 1.4385743455681783, "kl": 0.2890625, "learning_rate": 3.410914373688205e-07, "loss": -0.0302, "num_tokens": 18879168.0, "reward": 0.0, "reward_std": 0.23120412230491638, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 697.21875, "completions/mean_terminated_length": 650.5357666015625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.006172839506173, "grad_norm": 1.5648401892985397, "kl": 0.280029296875, "learning_rate": 3.4062568794690536e-07, "loss": -0.0508, "num_tokens": 18907779.0, "reward": 3.725290298461914e-09, "reward_std": 0.2660294771194458, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 741.09375, "completions/mean_terminated_length": 675.8077392578125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 2.009259259259259, "grad_norm": 0.9815139820291894, "kl": 0.295654296875, "learning_rate": 3.401595761163468e-07, "loss": -0.0218, "num_tokens": 18938862.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 699.78125, "completions/mean_terminated_length": 639.74072265625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.0123456790123457, "grad_norm": 0.9115095196582362, "kl": 0.3330078125, "learning_rate": 3.3969310374110817e-07, "loss": -0.0196, "num_tokens": 18967635.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 739.46875, "completions/mean_terminated_length": 698.8214721679688, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 2.015432098765432, "grad_norm": 1.2778458791627896, "kl": 0.2794189453125, "learning_rate": 3.3922627268659467e-07, "loss": -0.0159, "num_tokens": 18997986.0, "reward": -2.3283064365386963e-09, "reward_std": 0.255505234003067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 725.65625, "completions/mean_terminated_length": 626.2083740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 2.0185185185185186, "grad_norm": 0.8902677301596652, "kl": 0.308837890625, "learning_rate": 3.387590848196456e-07, "loss": -0.0005, "num_tokens": 19027647.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 709.4375, "completions/mean_terminated_length": 676.8965454101562, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 2.021604938271605, "grad_norm": 1.1985951033383033, "kl": 0.29248046875, "learning_rate": 3.382915420085274e-07, "loss": 0.0065, "num_tokens": 19056717.0, "reward": 0.0, "reward_std": 0.12963305413722992, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 667.71875, "completions/mean_terminated_length": 630.862060546875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.0246913580246915, "grad_norm": 1.6815841099967368, "kl": 0.2943115234375, "learning_rate": 3.3782364612292574e-07, "loss": 0.0686, "num_tokens": 19084496.0, "reward": 0.0, "reward_std": 0.26032719016075134, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 697.21875, "completions/mean_terminated_length": 621.8077392578125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.0277777777777777, "grad_norm": 1.395411956711552, "kl": 0.3048095703125, "learning_rate": 3.3735539903393826e-07, "loss": -0.0058, "num_tokens": 19113567.0, "reward": 1.3969838619232178e-09, "reward_std": 0.14742480218410492, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 698.71875, "completions/mean_terminated_length": 638.4815063476562, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.0308641975308643, "grad_norm": 0.9317063020509377, "kl": 0.337646484375, "learning_rate": 3.368868026140672e-07, "loss": -0.0169, "num_tokens": 19142366.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 876.09375, "completions/mean_terminated_length": 728.1875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 2.0339506172839505, "grad_norm": 1.2876846021592718, "kl": 0.2957763671875, "learning_rate": 3.364178587372115e-07, "loss": 0.0163, "num_tokens": 19177329.0, "reward": 0.0, "reward_std": 0.13859109580516815, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 737.0, "completions/mean_terminated_length": 670.7692260742188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 2.037037037037037, "grad_norm": 1.061515415652727, "kl": 0.30517578125, "learning_rate": 3.359485692786597e-07, "loss": 0.0055, "num_tokens": 19207473.0, "reward": 9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 651.46875, "completions/mean_terminated_length": 612.9310302734375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 2.0401234567901234, "grad_norm": 1.2138284059011546, "kl": 0.2908935546875, "learning_rate": 3.354789361150824e-07, "loss": -0.0062, "num_tokens": 19234564.0, "reward": -1.862645149230957e-09, "reward_std": 0.18632371723651886, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 781.28125, "completions/mean_terminated_length": 700.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.04320987654321, "grad_norm": 0.963881958120758, "kl": 0.2991943359375, "learning_rate": 3.350089611245246e-07, "loss": -0.012, "num_tokens": 19266345.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 664.53125, "completions/mean_terminated_length": 597.9629516601562, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.0462962962962963, "grad_norm": 1.5609560581092616, "kl": 0.31982421875, "learning_rate": 3.345386461863981e-07, "loss": 0.0188, "num_tokens": 19293898.0, "reward": 0.0, "reward_std": 0.19320368766784668, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 728.53125, "completions/mean_terminated_length": 645.7999877929688, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 2.049382716049383, "grad_norm": 1.3705062779467612, "kl": 0.2974853515625, "learning_rate": 3.340679931814743e-07, "loss": -0.0309, "num_tokens": 19324011.0, "reward": 0.0, "reward_std": 0.19272641837596893, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 758.53125, "completions/mean_terminated_length": 637.8636474609375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 2.052469135802469, "grad_norm": 1.3954602989287956, "kl": 0.3125, "learning_rate": 3.3359700399187654e-07, "loss": -0.0351, "num_tokens": 19354904.0, "reward": 3.725290298461914e-09, "reward_std": 0.1973274052143097, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 719.0, "completions/mean_terminated_length": 633.5999755859375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.0555555555555554, "grad_norm": 1.1553022962099326, "kl": 0.3074951171875, "learning_rate": 3.331256805010724e-07, "loss": 0.0275, "num_tokens": 19385080.0, "reward": 1.862645149230957e-09, "reward_std": 0.1587591916322708, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 693.59375, "completions/mean_terminated_length": 632.4074096679688, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 2.058641975308642, "grad_norm": 1.1050126577074906, "kl": 0.321044921875, "learning_rate": 3.326540245938666e-07, "loss": -0.009, "num_tokens": 19414171.0, "reward": -7.450580596923828e-09, "reward_std": 0.16144341230392456, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 690.34375, "completions/mean_terminated_length": 628.5555419921875, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.0617283950617282, "grad_norm": 1.041452430840546, "kl": 0.322998046875, "learning_rate": 3.3218203815639265e-07, "loss": -0.0023, "num_tokens": 19442558.0, "reward": -3.725290298461914e-09, "reward_std": 0.17738276720046997, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 635.28125, "completions/mean_terminated_length": 635.28125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 2.064814814814815, "grad_norm": 0.9320130061694388, "kl": 0.33154296875, "learning_rate": 3.3170972307610654e-07, "loss": 0.0198, "num_tokens": 19470015.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 711.59375, "completions/mean_terminated_length": 666.9642944335938, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 2.067901234567901, "grad_norm": 1.0588532282410164, "kl": 0.3084716796875, "learning_rate": 3.312370812417779e-07, "loss": 0.0039, "num_tokens": 19499458.0, "reward": 0.0, "reward_std": 0.1587507575750351, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 732.6875, "completions/mean_terminated_length": 635.5833740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.0709876543209877, "grad_norm": 0.6422280605695163, "kl": 0.32763671875, "learning_rate": 3.3076411454348336e-07, "loss": 0.0081, "num_tokens": 19529576.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 813.03125, "completions/mean_terminated_length": 686.4500122070312, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.074074074074074, "grad_norm": 0.5963639837233681, "kl": 0.2818603515625, "learning_rate": 3.3029082487259847e-07, "loss": -0.0212, "num_tokens": 19562377.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 733.28125, "completions/mean_terminated_length": 636.375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 2.0771604938271606, "grad_norm": 0.9096796652560489, "kl": 0.335693359375, "learning_rate": 3.298172141217905e-07, "loss": -0.0103, "num_tokens": 19592410.0, "reward": -1.862645149230957e-09, "reward_std": 0.13241708278656006, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 703.21875, "completions/mean_terminated_length": 643.8148193359375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 2.080246913580247, "grad_norm": 0.9119271580757892, "kl": 0.30224609375, "learning_rate": 3.2934328418501064e-07, "loss": -0.0006, "num_tokens": 19621717.0, "reward": -1.862645149230957e-09, "reward_std": 0.13241708278656006, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 719.28125, "completions/mean_terminated_length": 648.9615478515625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.0833333333333335, "grad_norm": 0.9824268305954632, "kl": 0.322998046875, "learning_rate": 3.2886903695748647e-07, "loss": -0.0165, "num_tokens": 19651458.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 707.96875, "completions/mean_terminated_length": 662.8214721679688, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 2.0864197530864197, "grad_norm": 0.012430481591240248, "kl": 0.326904296875, "learning_rate": 3.2839447433571454e-07, "loss": 0.0003, "num_tokens": 19680221.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 680.46875, "completions/mean_terminated_length": 631.3928833007812, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 2.0895061728395063, "grad_norm": 1.000012060936064, "kl": 0.3388671875, "learning_rate": 3.279195982174524e-07, "loss": -0.0119, "num_tokens": 19708100.0, "reward": -9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 650.78125, "completions/mean_terminated_length": 597.4642944335938, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 2.0925925925925926, "grad_norm": 0.9945218864494757, "kl": 0.326416015625, "learning_rate": 3.2744441050171136e-07, "loss": 0.0249, "num_tokens": 19735305.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 778.6875, "completions/mean_terminated_length": 696.9166870117188, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.095679012345679, "grad_norm": 1.8222405721065627, "kl": 0.31787109375, "learning_rate": 3.26968913088749e-07, "loss": 0.0398, "num_tokens": 19766883.0, "reward": 1.862645149230957e-09, "reward_std": 0.14781978726387024, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 679.46875, "completions/mean_terminated_length": 599.9615478515625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 2.0987654320987654, "grad_norm": 0.00959192813711383, "kl": 0.32861328125, "learning_rate": 3.264931078800611e-07, "loss": 0.0003, "num_tokens": 19795778.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 699.90625, "completions/mean_terminated_length": 653.607177734375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 2.1018518518518516, "grad_norm": 0.9485800842585024, "kl": 0.30859375, "learning_rate": 3.260169967783744e-07, "loss": 0.0224, "num_tokens": 19824251.0, "reward": 0.0, "reward_std": 0.1271844357252121, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 653.59375, "completions/mean_terminated_length": 568.1154174804688, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 2.1049382716049383, "grad_norm": 0.009865302595620653, "kl": 0.337646484375, "learning_rate": 3.255405816876389e-07, "loss": 0.0003, "num_tokens": 19851382.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 725.65625, "completions/mean_terminated_length": 626.2083740234375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 2.1080246913580245, "grad_norm": 0.5532169517801894, "kl": 0.333740234375, "learning_rate": 3.250638645130204e-07, "loss": -0.0093, "num_tokens": 19880827.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 735.15625, "completions/mean_terminated_length": 654.2799682617188, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.111111111111111, "grad_norm": 1.0527534126942464, "kl": 0.2872314453125, "learning_rate": 3.2458684716089224e-07, "loss": 0.0084, "num_tokens": 19910764.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 729.875, "completions/mean_terminated_length": 662.0, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.1141975308641974, "grad_norm": 0.8607550987770649, "kl": 0.3092041015625, "learning_rate": 3.241095315388287e-07, "loss": 0.0034, "num_tokens": 19940504.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 626.75, "completions/mean_terminated_length": 570.0, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 2.117283950617284, "grad_norm": 0.8887261315001356, "kl": 0.372314453125, "learning_rate": 3.2363191955559656e-07, "loss": 0.0126, "num_tokens": 19966924.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 671.0625, "completions/mean_terminated_length": 634.5516967773438, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 2.1203703703703702, "grad_norm": 0.8792114198448667, "kl": 0.30126953125, "learning_rate": 3.231540131211478e-07, "loss": 0.0014, "num_tokens": 19994406.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 719.96875, "completions/mean_terminated_length": 618.625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 2.123456790123457, "grad_norm": 1.4858621699695647, "kl": 0.3321533203125, "learning_rate": 3.22675814146612e-07, "loss": 0.0211, "num_tokens": 20024065.0, "reward": -3.725290298461914e-09, "reward_std": 0.19692783057689667, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 655.5625, "completions/mean_terminated_length": 602.9285888671875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 2.126543209876543, "grad_norm": 1.157245962016017, "kl": 0.34033203125, "learning_rate": 3.221973245442883e-07, "loss": -0.0425, "num_tokens": 20051483.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 676.0, "completions/mean_terminated_length": 640.0, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.1296296296296298, "grad_norm": 0.9085926062898118, "kl": 0.35595703125, "learning_rate": 3.217185462276382e-07, "loss": 0.003, "num_tokens": 20079575.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 708.03125, "completions/mean_terminated_length": 662.8928833007812, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 2.132716049382716, "grad_norm": 1.0348106919231934, "kl": 0.29296875, "learning_rate": 3.2123948111127795e-07, "loss": 0.0375, "num_tokens": 20108488.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 647.9375, "completions/mean_terminated_length": 609.0344848632812, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 2.1358024691358026, "grad_norm": 1.051940699012955, "kl": 0.33154296875, "learning_rate": 3.2076013111097055e-07, "loss": -0.0279, "num_tokens": 20135570.0, "reward": -3.725290298461914e-09, "reward_std": 0.1258779913187027, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 703.1875, "completions/mean_terminated_length": 629.1538696289062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.138888888888889, "grad_norm": 1.194276905036908, "kl": 0.32373046875, "learning_rate": 3.20280498143618e-07, "loss": -0.0139, "num_tokens": 20164616.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 672.875, "completions/mean_terminated_length": 607.8518676757812, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 2.1419753086419755, "grad_norm": 1.57027050884244, "kl": 0.324951171875, "learning_rate": 3.1980058412725436e-07, "loss": -0.0408, "num_tokens": 20192964.0, "reward": 3.725290298461914e-09, "reward_std": 0.14020457863807678, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 688.40625, "completions/mean_terminated_length": 626.25927734375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 2.1450617283950617, "grad_norm": 0.9778800856256499, "kl": 0.33740234375, "learning_rate": 3.1932039098103723e-07, "loss": 0.0111, "num_tokens": 20220881.0, "reward": 0.0, "reward_std": 0.12601491808891296, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 648.71875, "completions/mean_terminated_length": 609.8965454101562, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.148148148148148, "grad_norm": 1.0504986019750875, "kl": 0.332275390625, "learning_rate": 3.188399206252406e-07, "loss": -0.0081, "num_tokens": 20247852.0, "reward": -3.259629011154175e-09, "reward_std": 0.13805748522281647, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 726.8125, "completions/mean_terminated_length": 658.2307739257812, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.1512345679012346, "grad_norm": 0.9309988346346298, "kl": 0.33056640625, "learning_rate": 3.183591749812468e-07, "loss": 0.0146, "num_tokens": 20277322.0, "reward": -1.862645149230957e-09, "reward_std": 0.16529497504234314, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 715.625, "completions/mean_terminated_length": 683.72412109375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.154320987654321, "grad_norm": 0.9657662092438589, "kl": 0.3033447265625, "learning_rate": 3.1787815597153934e-07, "loss": 0.0175, "num_tokens": 20306886.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 752.28125, "completions/mean_terminated_length": 628.7727661132812, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 2.1574074074074074, "grad_norm": 1.0651472313027317, "kl": 0.317138671875, "learning_rate": 3.173968655196947e-07, "loss": 0.02, "num_tokens": 20338207.0, "reward": 0.02812499739229679, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 729.0625, "completions/mean_terminated_length": 686.9285888671875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.1604938271604937, "grad_norm": 0.9675557243805978, "kl": 0.31494140625, "learning_rate": 3.1691530555037493e-07, "loss": 0.0208, "num_tokens": 20368389.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 737.5, "completions/mean_terminated_length": 607.2727661132812, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.1635802469135803, "grad_norm": 0.010486660684088683, "kl": 0.350830078125, "learning_rate": 3.164334779893198e-07, "loss": 0.0004, "num_tokens": 20398497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 647.9375, "completions/mean_terminated_length": 594.2142944335938, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 2.1666666666666665, "grad_norm": 1.0665304434407505, "kl": 0.33349609375, "learning_rate": 3.159513847633393e-07, "loss": 0.0416, "num_tokens": 20425915.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 697.875, "completions/mean_terminated_length": 651.2857666015625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.169753086419753, "grad_norm": 1.2849560268418567, "kl": 0.31201171875, "learning_rate": 3.1546902780030555e-07, "loss": -0.0599, "num_tokens": 20455459.0, "reward": 0.0, "reward_std": 0.14231424033641815, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 656.9375, "completions/mean_terminated_length": 604.5, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 2.1728395061728394, "grad_norm": 0.881842731943163, "kl": 0.343017578125, "learning_rate": 3.1498640902914565e-07, "loss": 0.0025, "num_tokens": 20482961.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 703.375, "completions/mean_terminated_length": 644.0, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.175925925925926, "grad_norm": 0.023534372819004366, "kl": 0.369140625, "learning_rate": 3.1450353037983346e-07, "loss": 0.0004, "num_tokens": 20511785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 682.9375, "completions/mean_terminated_length": 587.4400024414062, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 2.1790123456790123, "grad_norm": 1.4509484141446072, "kl": 0.32421875, "learning_rate": 3.140203937833821e-07, "loss": 0.033, "num_tokens": 20539951.0, "reward": -3.725290298461914e-09, "reward_std": 0.25439155101776123, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 697.03125, "completions/mean_terminated_length": 605.47998046875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 2.182098765432099, "grad_norm": 1.5252535390561108, "kl": 0.3302001953125, "learning_rate": 3.135370011718364e-07, "loss": -0.0077, "num_tokens": 20569004.0, "reward": 0.02812499925494194, "reward_std": 0.11586824059486389, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 633.375, "completions/mean_terminated_length": 592.9655151367188, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 2.185185185185185, "grad_norm": 1.5863713496813534, "kl": 0.347900390625, "learning_rate": 3.1305335447826477e-07, "loss": -0.0169, "num_tokens": 20595440.0, "reward": -3.725290298461914e-09, "reward_std": 0.2432374656200409, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 687.3125, "completions/mean_terminated_length": 593.0399780273438, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 2.1882716049382718, "grad_norm": 0.6675990384614131, "kl": 0.3150634765625, "learning_rate": 3.125694556367517e-07, "loss": 0.0123, "num_tokens": 20624118.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 663.71875, "completions/mean_terminated_length": 612.25, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 2.191358024691358, "grad_norm": 0.9065284425566196, "kl": 0.354736328125, "learning_rate": 3.1208530658239e-07, "loss": 0.0227, "num_tokens": 20651893.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 702.875, "completions/mean_terminated_length": 628.7692260742188, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 2.1944444444444446, "grad_norm": 0.5921535166475328, "kl": 0.3311767578125, "learning_rate": 3.1160090925127325e-07, "loss": 0.0264, "num_tokens": 20680765.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 721.9375, "completions/mean_terminated_length": 666.0, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 2.197530864197531, "grad_norm": 0.016570738190573833, "kl": 0.3167724609375, "learning_rate": 3.1111626558048777e-07, "loss": 0.0003, "num_tokens": 20710403.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 706.25, "completions/mean_terminated_length": 647.4074096679688, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 2.200617283950617, "grad_norm": 0.7071664428786564, "kl": 0.33056640625, "learning_rate": 3.1063137750810493e-07, "loss": -0.0117, "num_tokens": 20739203.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 761.625, "completions/mean_terminated_length": 674.1666870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 2.2037037037037037, "grad_norm": 0.7667457104932189, "kl": 0.3172607421875, "learning_rate": 3.101462469731735e-07, "loss": -0.0108, "num_tokens": 20770355.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 700.21875, "completions/mean_terminated_length": 666.72412109375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 2.20679012345679, "grad_norm": 0.9688229532526872, "kl": 0.2862548828125, "learning_rate": 3.0966087591571184e-07, "loss": -0.0277, "num_tokens": 20799314.0, "reward": 0.0, "reward_std": 0.1463371366262436, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 710.71875, "completions/mean_terminated_length": 623.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 2.2098765432098766, "grad_norm": 0.13863287573914862, "kl": 0.3748779296875, "learning_rate": 3.091752662767001e-07, "loss": 0.0004, "num_tokens": 20828441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 733.5625, "completions/mean_terminated_length": 652.239990234375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 2.212962962962963, "grad_norm": 1.1987272565380604, "kl": 0.299072265625, "learning_rate": 3.0868941999807274e-07, "loss": 0.0564, "num_tokens": 20858479.0, "reward": 0.0, "reward_std": 0.17122279107570648, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 726.8125, "completions/mean_terminated_length": 671.7777709960938, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 2.2160493827160495, "grad_norm": 0.008574960214126948, "kl": 0.3067626953125, "learning_rate": 3.082033390227102e-07, "loss": 0.0003, "num_tokens": 20888401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 741.09375, "completions/mean_terminated_length": 661.8800048828125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 2.2191358024691357, "grad_norm": 0.5220758773227693, "kl": 0.3082275390625, "learning_rate": 3.0771702529443163e-07, "loss": 0.0192, "num_tokens": 20919376.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 699.90625, "completions/mean_terminated_length": 653.607177734375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.2222222222222223, "grad_norm": 1.2888684625575575, "kl": 0.3294677734375, "learning_rate": 3.0723048075798694e-07, "loss": -0.0091, "num_tokens": 20948109.0, "reward": 0.0, "reward_std": 0.2004445195198059, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 758.25, "completions/mean_terminated_length": 669.6666870117188, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 2.2253086419753085, "grad_norm": 0.5431356344557754, "kl": 0.3170166015625, "learning_rate": 3.0674370735904917e-07, "loss": 0.0191, "num_tokens": 20978833.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 660.625, "completions/mean_terminated_length": 576.7692260742188, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 2.228395061728395, "grad_norm": 0.7805457357513953, "kl": 0.322265625, "learning_rate": 3.0625670704420634e-07, "loss": 0.0071, "num_tokens": 21006805.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 784.875, "completions/mean_terminated_length": 676.1818237304688, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 2.2314814814814814, "grad_norm": 0.8545715771313919, "kl": 0.300537109375, "learning_rate": 3.057694817609539e-07, "loss": -0.0007, "num_tokens": 21038533.0, "reward": -3.725290298461914e-09, "reward_std": 0.11199356615543365, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 682.34375, "completions/mean_terminated_length": 633.5357666015625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.234567901234568, "grad_norm": 0.6239766259790861, "kl": 0.359375, "learning_rate": 3.0528203345768717e-07, "loss": 0.0124, "num_tokens": 21066944.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 756.6875, "completions/mean_terminated_length": 681.8399658203125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.2376543209876543, "grad_norm": 1.1670403229338628, "kl": 0.294189453125, "learning_rate": 3.047943640836931e-07, "loss": -0.0422, "num_tokens": 21097602.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 697.03125, "completions/mean_terminated_length": 650.3214721679688, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 2.240740740740741, "grad_norm": 1.6386161867048643, "kl": 0.339111328125, "learning_rate": 3.0430647558914284e-07, "loss": 0.0218, "num_tokens": 21126391.0, "reward": -3.725290298461914e-09, "reward_std": 0.19169270992279053, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 675.0625, "completions/mean_terminated_length": 663.8064575195312, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.243827160493827, "grad_norm": 0.7084404263887658, "kl": 0.3253173828125, "learning_rate": 3.038183699250837e-07, "loss": -0.0072, "num_tokens": 21154069.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 741.1875, "completions/mean_terminated_length": 662.0, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 2.246913580246914, "grad_norm": 0.5896845115900525, "kl": 0.299072265625, "learning_rate": 3.0333004904343153e-07, "loss": 0.0241, "num_tokens": 21184135.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 773.3125, "completions/mean_terminated_length": 675.2174072265625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 2.25, "grad_norm": 0.10943753744940068, "kl": 0.39892578125, "learning_rate": 3.0284151489696264e-07, "loss": 0.0004, "num_tokens": 21215541.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 733.3125, "completions/mean_terminated_length": 601.1818237304688, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 2.253086419753086, "grad_norm": 1.2409572765841255, "kl": 0.3126220703125, "learning_rate": 3.023527694393064e-07, "loss": 0.0207, "num_tokens": 21245451.0, "reward": -5.587935447692871e-09, "reward_std": 0.1613985002040863, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 679.25, "completions/mean_terminated_length": 643.586181640625, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.256172839506173, "grad_norm": 0.7164065146154263, "kl": 0.287109375, "learning_rate": 3.0186381462493704e-07, "loss": 0.0127, "num_tokens": 21273583.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 720.8125, "completions/mean_terminated_length": 664.6666870117188, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.259259259259259, "grad_norm": 0.5526665202311036, "kl": 0.2779541015625, "learning_rate": 3.0137465240916614e-07, "loss": 0.0142, "num_tokens": 21303117.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 713.28125, "completions/mean_terminated_length": 641.5769653320312, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.2623456790123457, "grad_norm": 1.29598252946043, "kl": 0.297119140625, "learning_rate": 3.008852847481346e-07, "loss": -0.0244, "num_tokens": 21332378.0, "reward": -3.725290298461914e-09, "reward_std": 0.21016982197761536, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 697.1875, "completions/mean_terminated_length": 663.3793334960938, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 2.265432098765432, "grad_norm": 1.3865288813904986, "kl": 0.2939453125, "learning_rate": 3.003957135988049e-07, "loss": -0.0054, "num_tokens": 21361148.0, "reward": -7.450580596923828e-09, "reward_std": 0.21970567107200623, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 664.65625, "completions/mean_terminated_length": 613.3214721679688, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.2685185185185186, "grad_norm": 0.4971952411562597, "kl": 0.343505859375, "learning_rate": 2.999059409189533e-07, "loss": 0.0221, "num_tokens": 21388909.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 767.78125, "completions/mean_terminated_length": 651.3181762695312, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 2.271604938271605, "grad_norm": 0.011462006251291434, "kl": 0.3271484375, "learning_rate": 2.9941596866716174e-07, "loss": 0.0003, "num_tokens": 21420290.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 738.375, "completions/mean_terminated_length": 643.1666870117188, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 2.2746913580246915, "grad_norm": 0.7485629636877662, "kl": 0.32470703125, "learning_rate": 2.989257988028105e-07, "loss": 0.0215, "num_tokens": 21450518.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 750.59375, "completions/mean_terminated_length": 626.3181762695312, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.2777777777777777, "grad_norm": 1.1199405288147424, "kl": 0.32666015625, "learning_rate": 2.984354332860702e-07, "loss": 0.0284, "num_tokens": 21481077.0, "reward": -4.190951585769653e-09, "reward_std": 0.20220693945884705, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 690.09375, "completions/mean_terminated_length": 642.3928833007812, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 2.2808641975308643, "grad_norm": 1.2895971713547376, "kl": 0.298828125, "learning_rate": 2.979448740778935e-07, "loss": -0.0068, "num_tokens": 21509788.0, "reward": -1.862645149230957e-09, "reward_std": 0.1973792016506195, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 775.625, "completions/mean_terminated_length": 706.0799560546875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 2.2839506172839505, "grad_norm": 1.0120060608074388, "kl": 0.287353515625, "learning_rate": 2.9745412314000786e-07, "loss": 0.0098, "num_tokens": 21541356.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 816.0625, "completions/mean_terminated_length": 734.6956787109375, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 2.287037037037037, "grad_norm": 0.5461073719288805, "kl": 0.292724609375, "learning_rate": 2.9696318243490746e-07, "loss": -0.005, "num_tokens": 21574434.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 643.4375, "completions/mean_terminated_length": 604.0689697265625, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 2.2901234567901234, "grad_norm": 1.0759940179355871, "kl": 0.3154296875, "learning_rate": 2.9647205392584533e-07, "loss": -0.0425, "num_tokens": 21601020.0, "reward": -1.862645149230957e-09, "reward_std": 0.1582072526216507, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 692.53125, "completions/mean_terminated_length": 616.0385131835938, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 2.29320987654321, "grad_norm": 0.6491421542546149, "kl": 0.31689453125, "learning_rate": 2.959807395768255e-07, "loss": -0.0451, "num_tokens": 21629493.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 746.1875, "completions/mean_terminated_length": 653.5833740234375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.2962962962962963, "grad_norm": 0.6302045720515893, "kl": 0.275146484375, "learning_rate": 2.95489241352595e-07, "loss": 0.0297, "num_tokens": 21659619.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 778.0, "completions/mean_terminated_length": 696.0, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.299382716049383, "grad_norm": 1.1125022853901245, "kl": 0.294189453125, "learning_rate": 2.949975612186366e-07, "loss": -0.0008, "num_tokens": 21691223.0, "reward": 0.0, "reward_std": 0.12571673095226288, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 692.71875, "completions/mean_terminated_length": 616.2692260742188, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.302469135802469, "grad_norm": 0.8809908359461416, "kl": 0.3489990234375, "learning_rate": 2.9450570114116014e-07, "loss": 0.035, "num_tokens": 21719174.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 709.46875, "completions/mean_terminated_length": 636.8846435546875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 2.3055555555555554, "grad_norm": 0.013393503118300074, "kl": 0.31298828125, "learning_rate": 2.9401366308709513e-07, "loss": 0.0003, "num_tokens": 21748533.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 739.375, "completions/mean_terminated_length": 644.5, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 2.308641975308642, "grad_norm": 0.009935897726793609, "kl": 0.2918701171875, "learning_rate": 2.9352144902408296e-07, "loss": 0.0003, "num_tokens": 21778437.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 759.25, "completions/mean_terminated_length": 685.1199951171875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.3117283950617282, "grad_norm": 1.0034108345721173, "kl": 0.25439453125, "learning_rate": 2.930290609204686e-07, "loss": 0.0479, "num_tokens": 21809597.0, "reward": -1.862645149230957e-09, "reward_std": 0.15807422995567322, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 750.78125, "completions/mean_terminated_length": 626.5909423828125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.314814814814815, "grad_norm": 1.1636573404859127, "kl": 0.2874755859375, "learning_rate": 2.925365007452933e-07, "loss": 0.0123, "num_tokens": 21839998.0, "reward": -1.3969838619232178e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 686.5, "completions/mean_terminated_length": 664.0000610351562, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.317901234567901, "grad_norm": 1.4033954819531933, "kl": 0.307861328125, "learning_rate": 2.920437704682861e-07, "loss": 0.0044, "num_tokens": 21868306.0, "reward": -7.450580596923828e-09, "reward_std": 0.17682674527168274, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 720.4375, "completions/mean_terminated_length": 650.3846435546875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.3209876543209877, "grad_norm": 0.8705940481720116, "kl": 0.2899169921875, "learning_rate": 2.915508720598566e-07, "loss": -0.0247, "num_tokens": 21897964.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 715.3125, "completions/mean_terminated_length": 644.0769653320312, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 2.324074074074074, "grad_norm": 0.6377240184037994, "kl": 0.2913818359375, "learning_rate": 2.910578074910865e-07, "loss": 0.0214, "num_tokens": 21927366.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 733.96875, "completions/mean_terminated_length": 680.25927734375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 2.3271604938271606, "grad_norm": 0.8587335816847671, "kl": 0.2869873046875, "learning_rate": 2.9056457873372213e-07, "loss": -0.0115, "num_tokens": 21957433.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 751.40625, "completions/mean_terminated_length": 675.0799560546875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 2.330246913580247, "grad_norm": 0.976589983769839, "kl": 0.33203125, "learning_rate": 2.9007118776016635e-07, "loss": -0.0064, "num_tokens": 21987746.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 688.6875, "completions/mean_terminated_length": 640.7857666015625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 2.3333333333333335, "grad_norm": 0.8140323731714041, "kl": 0.30712890625, "learning_rate": 2.895776365434706e-07, "loss": 0.0236, "num_tokens": 22015872.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 716.875, "completions/mean_terminated_length": 646.0, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.3364197530864197, "grad_norm": 0.06551671188304388, "kl": 0.378662109375, "learning_rate": 2.8908392705732724e-07, "loss": 0.0004, "num_tokens": 22044812.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 715.1875, "completions/mean_terminated_length": 671.0714721679688, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.3395061728395063, "grad_norm": 1.1810752306099652, "kl": 0.31884765625, "learning_rate": 2.885900612760616e-07, "loss": -0.0153, "num_tokens": 22073846.0, "reward": -1.862645149230957e-09, "reward_std": 0.19472569227218628, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 687.25, "completions/mean_terminated_length": 652.413818359375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 2.3425925925925926, "grad_norm": 1.07281953205766, "kl": 0.27734375, "learning_rate": 2.8809604117462397e-07, "loss": -0.0141, "num_tokens": 22101970.0, "reward": -3.725290298461914e-09, "reward_std": 0.20704184472560883, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 737.90625, "completions/mean_terminated_length": 657.7999877929688, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 2.3456790123456788, "grad_norm": 1.3192607792629212, "kl": 0.3134765625, "learning_rate": 2.876018687285817e-07, "loss": 0.0074, "num_tokens": 22132591.0, "reward": 2.7939677238464355e-09, "reward_std": 0.15570716559886932, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 706.59375, "completions/mean_terminated_length": 647.8148193359375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 2.3487654320987654, "grad_norm": 0.9535517686962172, "kl": 0.30419921875, "learning_rate": 2.8710754591411147e-07, "loss": 0.0004, "num_tokens": 22161702.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 695.34375, "completions/mean_terminated_length": 661.3448486328125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.351851851851852, "grad_norm": 0.7668050290613273, "kl": 0.2891845703125, "learning_rate": 2.8661307470799114e-07, "loss": -0.002, "num_tokens": 22190205.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 695.84375, "completions/mean_terminated_length": 635.0740966796875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 2.3549382716049383, "grad_norm": 1.0176226128200823, "kl": 0.302978515625, "learning_rate": 2.861184570875921e-07, "loss": -0.041, "num_tokens": 22218836.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 702.65625, "completions/mean_terminated_length": 656.75, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 2.3580246913580245, "grad_norm": 0.6903004677162509, "kl": 0.30908203125, "learning_rate": 2.856236950308711e-07, "loss": 0.0013, "num_tokens": 22247517.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 762.71875, "completions/mean_terminated_length": 689.5599975585938, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.361111111111111, "grad_norm": 0.6495978029567093, "kl": 0.276123046875, "learning_rate": 2.851287905163628e-07, "loss": 0.0194, "num_tokens": 22278528.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 725.4375, "completions/mean_terminated_length": 694.5516967773438, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.3641975308641974, "grad_norm": 0.8074552020078114, "kl": 0.3009033203125, "learning_rate": 2.8463374552317123e-07, "loss": 0.0075, "num_tokens": 22308238.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 695.59375, "completions/mean_terminated_length": 661.6206665039062, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 2.367283950617284, "grad_norm": 0.543797783597461, "kl": 0.3115234375, "learning_rate": 2.8413856203096226e-07, "loss": 0.0127, "num_tokens": 22336429.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 739.15625, "completions/mean_terminated_length": 627.6956787109375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 2.3703703703703702, "grad_norm": 0.9030282705032256, "kl": 0.31591796875, "learning_rate": 2.836432420199557e-07, "loss": -0.01, "num_tokens": 22367050.0, "reward": -2.3283064365386963e-09, "reward_std": 0.04620163142681122, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 770.5, "completions/mean_terminated_length": 712.0, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 2.373456790123457, "grad_norm": 6.818904619799035, "kl": 2.0286865234375, "learning_rate": 2.831477874709172e-07, "loss": -0.0081, "num_tokens": 22399074.0, "reward": 0.0, "reward_std": 0.178672194480896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 796.5, "completions/mean_terminated_length": 677.3333740234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 2.376543209876543, "grad_norm": 0.623314486037102, "kl": 0.2818603515625, "learning_rate": 2.826522003651504e-07, "loss": 0.0174, "num_tokens": 22431374.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 751.625, "completions/mean_terminated_length": 660.8333740234375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 2.3796296296296298, "grad_norm": 0.4352854882504204, "kl": 0.28125, "learning_rate": 2.8215648268448926e-07, "loss": 0.0304, "num_tokens": 22461682.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 727.1875, "completions/mean_terminated_length": 644.0799560546875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 2.382716049382716, "grad_norm": 0.8684794622121298, "kl": 0.281005859375, "learning_rate": 2.8166063641128963e-07, "loss": 0.0193, "num_tokens": 22491364.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 702.8965454101562, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 2.3858024691358026, "grad_norm": 0.05524853583081608, "kl": 0.317138671875, "learning_rate": 2.8116466352842165e-07, "loss": 0.0003, "num_tokens": 22520964.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 655.71875, "completions/mean_terminated_length": 603.107177734375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 2.388888888888889, "grad_norm": 1.369594835323384, "kl": 0.273681640625, "learning_rate": 2.80668566019262e-07, "loss": 0.0083, "num_tokens": 22548351.0, "reward": -7.450580596923828e-09, "reward_std": 0.2107936292886734, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 700.1875, "completions/mean_terminated_length": 609.5199584960938, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.3919753086419755, "grad_norm": 0.8060769779208158, "kl": 0.294921875, "learning_rate": 2.8017234586768534e-07, "loss": -0.0052, "num_tokens": 22577225.0, "reward": -2.7939677238464355e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 691.78125, "completions/mean_terminated_length": 657.413818359375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 2.3950617283950617, "grad_norm": 1.0375051388367698, "kl": 0.296875, "learning_rate": 2.796760050580571e-07, "loss": -0.0175, "num_tokens": 22605962.0, "reward": 2.7939677238464355e-09, "reward_std": 0.15559504926204681, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 745.53125, "completions/mean_terminated_length": 693.9629516601562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.398148148148148, "grad_norm": 0.8340971492924649, "kl": 0.310546875, "learning_rate": 2.7917954557522503e-07, "loss": -0.0313, "num_tokens": 22636531.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 703.03125, "completions/mean_terminated_length": 596.0416870117188, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 2.4012345679012346, "grad_norm": 1.2557599420112906, "kl": 0.2816162109375, "learning_rate": 2.786829694045116e-07, "loss": -0.0246, "num_tokens": 22665720.0, "reward": 0.0, "reward_std": 0.18573541939258575, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 757.53125, "completions/mean_terminated_length": 653.2608642578125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 2.4043209876543212, "grad_norm": 1.159626860385025, "kl": 0.2725830078125, "learning_rate": 2.7818627853170585e-07, "loss": -0.0034, "num_tokens": 22696929.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 699.375, "completions/mean_terminated_length": 653.0, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 2.4074074074074074, "grad_norm": 1.1340259348539956, "kl": 0.27978515625, "learning_rate": 2.7768947494305545e-07, "loss": -0.0003, "num_tokens": 22725821.0, "reward": -9.313225746154785e-10, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 712.8125, "completions/mean_terminated_length": 668.357177734375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.4104938271604937, "grad_norm": 0.6779508979306605, "kl": 0.24578857421875, "learning_rate": 2.7719256062525884e-07, "loss": 0.006, "num_tokens": 22755187.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 793.0625, "completions/mean_terminated_length": 750.2963256835938, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 2.4135802469135803, "grad_norm": 0.00910277949727657, "kl": 0.2523193359375, "learning_rate": 2.766955375654573e-07, "loss": 0.0003, "num_tokens": 22787733.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 762.1875, "completions/mean_terminated_length": 688.8800048828125, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 2.4166666666666665, "grad_norm": 1.2121234705060395, "kl": 0.2923583984375, "learning_rate": 2.7619840775122695e-07, "loss": -0.0294, "num_tokens": 22818799.0, "reward": -1.862645149230957e-09, "reward_std": 0.1531248539686203, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 699.09375, "completions/mean_terminated_length": 665.4827270507812, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 2.419753086419753, "grad_norm": 1.3078284181942887, "kl": 0.28662109375, "learning_rate": 2.7570117317057087e-07, "loss": 0.003, "num_tokens": 22847198.0, "reward": 9.313225746154785e-10, "reward_std": 0.1515917330980301, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 669.0, "completions/mean_terminated_length": 632.27587890625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 2.4228395061728394, "grad_norm": 1.3444904972323186, "kl": 0.30908203125, "learning_rate": 2.7520383581191085e-07, "loss": -0.043, "num_tokens": 22874666.0, "reward": 0.0, "reward_std": 0.17122045159339905, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 683.8125, "completions/mean_terminated_length": 648.6206665039062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.425925925925926, "grad_norm": 1.4438116017026041, "kl": 0.2852783203125, "learning_rate": 2.7470639766408003e-07, "loss": -0.012, "num_tokens": 22903256.0, "reward": -3.725290298461914e-09, "reward_std": 0.21133752167224884, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 738.9375, "completions/mean_terminated_length": 659.1199951171875, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.4290123456790123, "grad_norm": 1.6836583122916187, "kl": 0.447021484375, "learning_rate": 2.7420886071631455e-07, "loss": 0.0786, "num_tokens": 22933430.0, "reward": 0.0, "reward_std": 0.1577567458152771, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 757.15625, "completions/mean_terminated_length": 635.8636474609375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.432098765432099, "grad_norm": 1.4462823123660733, "kl": 0.2528076171875, "learning_rate": 2.7371122695824534e-07, "loss": -0.0423, "num_tokens": 22964251.0, "reward": -5.587935447692871e-09, "reward_std": 0.2167874276638031, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 716.96875, "completions/mean_terminated_length": 673.107177734375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 2.435185185185185, "grad_norm": 0.8324573179981034, "kl": 0.2869873046875, "learning_rate": 2.732134983798907e-07, "loss": -0.0033, "num_tokens": 22993358.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 652.34375, "completions/mean_terminated_length": 613.8965454101562, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 2.4382716049382718, "grad_norm": 0.8389245706202125, "kl": 0.287109375, "learning_rate": 2.727156769716482e-07, "loss": -0.0081, "num_tokens": 23020409.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 721.96875, "completions/mean_terminated_length": 690.72412109375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.441358024691358, "grad_norm": 0.007722079367322754, "kl": 0.2696533203125, "learning_rate": 2.722177647242863e-07, "loss": 0.0003, "num_tokens": 23049816.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 757.84375, "completions/mean_terminated_length": 696.423095703125, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 2.4444444444444446, "grad_norm": 1.038484419275493, "kl": 0.271728515625, "learning_rate": 2.717197636289373e-07, "loss": -0.0391, "num_tokens": 23080615.0, "reward": -3.725290298461914e-09, "reward_std": 0.15867924690246582, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 726.84375, "completions/mean_terminated_length": 684.3928833007812, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 2.447530864197531, "grad_norm": 1.0483591323718822, "kl": 0.300048828125, "learning_rate": 2.712216756770881e-07, "loss": 0.0438, "num_tokens": 23110310.0, "reward": 0.0, "reward_std": 0.15284463763237, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 813.8125, "completions/mean_terminated_length": 718.2727661132812, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 2.450617283950617, "grad_norm": 1.4077606143842762, "kl": 0.2593994140625, "learning_rate": 2.7072350286057354e-07, "loss": -0.0301, "num_tokens": 23143416.0, "reward": -2.7939677238464355e-09, "reward_std": 0.1905532032251358, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 784.875, "completions/mean_terminated_length": 676.1818237304688, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 2.4537037037037037, "grad_norm": 0.009587927875737758, "kl": 0.2674560546875, "learning_rate": 2.7022524717156734e-07, "loss": 0.0003, "num_tokens": 23174632.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 734.09375, "completions/mean_terminated_length": 637.4583740234375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.45679012345679, "grad_norm": 1.0297913705744441, "kl": 0.2774658203125, "learning_rate": 2.6972691060257504e-07, "loss": 0.0184, "num_tokens": 23204627.0, "reward": -4.6566128730773926e-09, "reward_std": 0.11199356615543365, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 777.90625, "completions/mean_terminated_length": 630.25, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 2.4598765432098766, "grad_norm": 1.4406329964664708, "kl": 0.2735595703125, "learning_rate": 2.6922849514642524e-07, "loss": -0.0354, "num_tokens": 23236272.0, "reward": -1.6763806343078613e-08, "reward_std": 0.16224205493927002, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 755.3125, "completions/mean_terminated_length": 716.9285888671875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 2.462962962962963, "grad_norm": 0.9164887141421001, "kl": 0.2772216796875, "learning_rate": 2.687300027962624e-07, "loss": 0.0078, "num_tokens": 23267274.0, "reward": -2.3283064365386963e-09, "reward_std": 0.06432675570249557, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 718.78125, "completions/mean_terminated_length": 662.25927734375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 2.4660493827160495, "grad_norm": 1.534588977749662, "kl": 0.2891845703125, "learning_rate": 2.682314355455381e-07, "loss": 0.0485, "num_tokens": 23296879.0, "reward": -1.4901161193847656e-08, "reward_std": 0.21280820667743683, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 786.34375, "completions/mean_terminated_length": 707.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 2.4691358024691357, "grad_norm": 0.9315084172418175, "kl": 0.26025390625, "learning_rate": 2.677327953880038e-07, "loss": -0.0142, "num_tokens": 23328834.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 749.84375, "completions/mean_terminated_length": 699.0740966796875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.4722222222222223, "grad_norm": 0.7175811500507011, "kl": 0.2691650390625, "learning_rate": 2.6723408431770214e-07, "loss": -0.0131, "num_tokens": 23359261.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 781.03125, "completions/mean_terminated_length": 713.0, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.4753086419753085, "grad_norm": 0.9845812387614366, "kl": 0.2730712890625, "learning_rate": 2.6673530432895957e-07, "loss": 0.0095, "num_tokens": 23390878.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 686.65625, "completions/mean_terminated_length": 664.1666870117188, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.478395061728395, "grad_norm": 1.2556912955304715, "kl": 0.2806396484375, "learning_rate": 2.6623645741637815e-07, "loss": -0.0074, "num_tokens": 23418823.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 655.21875, "completions/mean_terminated_length": 602.5357666015625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 2.4814814814814814, "grad_norm": 0.7570621661561733, "kl": 0.2835693359375, "learning_rate": 2.6573754557482746e-07, "loss": -0.0092, "num_tokens": 23445842.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 651.3125, "completions/mean_terminated_length": 626.4666748046875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 2.484567901234568, "grad_norm": 0.8131351621463883, "kl": 0.303466796875, "learning_rate": 2.652385707994369e-07, "loss": -0.0304, "num_tokens": 23472936.0, "reward": 2.7939677238464355e-09, "reward_std": 0.1317192167043686, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 720.03125, "completions/mean_terminated_length": 699.7667236328125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.4876543209876543, "grad_norm": 1.0608940637272115, "kl": 0.280029296875, "learning_rate": 2.6473953508558726e-07, "loss": 0.0071, "num_tokens": 23502509.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 665.7777709960938, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 2.490740740740741, "grad_norm": 0.7478641121751027, "kl": 0.2586669921875, "learning_rate": 2.6424044042890334e-07, "loss": 0.0002, "num_tokens": 23531869.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 711.59375, "completions/mean_terminated_length": 639.5, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 2.493827160493827, "grad_norm": 0.9939590495143553, "kl": 0.27783203125, "learning_rate": 2.6374128882524527e-07, "loss": -0.0195, "num_tokens": 23561128.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 768.71875, "completions/mean_terminated_length": 709.8077392578125, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 2.496913580246914, "grad_norm": 0.9438428030549388, "kl": 0.2557373046875, "learning_rate": 2.6324208227070136e-07, "loss": -0.0343, "num_tokens": 23592107.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 689.5, "completions/mean_terminated_length": 612.3077392578125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 2.5, "grad_norm": 1.2342092287156805, "kl": 0.2899169921875, "learning_rate": 2.6274282276157934e-07, "loss": 0.0197, "num_tokens": 23620143.0, "reward": 3.725290298461914e-09, "reward_std": 0.15541993081569672, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 759.15625, "completions/mean_terminated_length": 685.0, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 2.503086419753086, "grad_norm": 1.0885787695527718, "kl": 0.2723388671875, "learning_rate": 2.622435122943987e-07, "loss": -0.0143, "num_tokens": 23651120.0, "reward": 0.0, "reward_std": 0.17995095252990723, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 705.59375, "completions/mean_terminated_length": 672.6551513671875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 2.506172839506173, "grad_norm": 2.2527079144224795, "kl": 0.273681640625, "learning_rate": 2.61744152865883e-07, "loss": 0.1885, "num_tokens": 23679915.0, "reward": -3.725290298461914e-09, "reward_std": 0.2709388732910156, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 695.78125, "completions/mean_terminated_length": 648.8928833007812, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.5092592592592595, "grad_norm": 0.5731926671606395, "kl": 0.269775390625, "learning_rate": 2.6124474647295137e-07, "loss": -0.0033, "num_tokens": 23708484.0, "reward": 9.313225746154785e-10, "reward_std": 0.02981424145400524, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 660.933349609375, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 2.5123456790123457, "grad_norm": 1.1484305020897618, "kl": 0.278076171875, "learning_rate": 2.607452951127107e-07, "loss": -0.0489, "num_tokens": 23737172.0, "reward": 0.0, "reward_std": 0.1328706294298172, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 796.5, "completions/mean_terminated_length": 693.0909423828125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 2.515432098765432, "grad_norm": 0.8305244512573621, "kl": 0.2659912109375, "learning_rate": 2.6024580078244777e-07, "loss": 0.0045, "num_tokens": 23769328.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 675.71875, "completions/mean_terminated_length": 625.9642944335938, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.5185185185185186, "grad_norm": 1.4278326343250904, "kl": 0.2794189453125, "learning_rate": 2.5974626547962127e-07, "loss": -0.0616, "num_tokens": 23797291.0, "reward": 0.0, "reward_std": 0.19526955485343933, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096889972687, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 742.8125, "completions/mean_terminated_length": 649.0833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 2.521604938271605, "grad_norm": 0.6616642904593187, "kl": 0.2615966796875, "learning_rate": 2.5924669120185373e-07, "loss": 0.0081, "num_tokens": 23827093.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 750.75, "completions/mean_terminated_length": 700.1481323242188, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 2.5246913580246915, "grad_norm": 0.7219404271245976, "kl": 0.2576904296875, "learning_rate": 2.5874707994692333e-07, "loss": -0.0178, "num_tokens": 23857377.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 706.65625, "completions/mean_terminated_length": 673.8275756835938, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.5277777777777777, "grad_norm": 0.8481421154306074, "kl": 0.281005859375, "learning_rate": 2.582474337127564e-07, "loss": -0.0261, "num_tokens": 23886670.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 688.15625, "completions/mean_terminated_length": 653.413818359375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.5308641975308643, "grad_norm": 1.0360589548354595, "kl": 0.2630615234375, "learning_rate": 2.5774775449741903e-07, "loss": -0.028, "num_tokens": 23915343.0, "reward": -1.862645149230957e-09, "reward_std": 0.15887999534606934, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 729.78125, "completions/mean_terminated_length": 687.7500610351562, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 2.5339506172839505, "grad_norm": 1.0788661192732718, "kl": 0.277099609375, "learning_rate": 2.572480442991092e-07, "loss": -0.0307, "num_tokens": 23945272.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 766.90625, "completions/mean_terminated_length": 681.2083740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.537037037037037, "grad_norm": 0.6432639315080375, "kl": 0.271484375, "learning_rate": 2.567483051161487e-07, "loss": 0.0367, "num_tokens": 23976409.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 702.53125, "completions/mean_terminated_length": 643.0, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 2.5401234567901234, "grad_norm": 1.0878890966955206, "kl": 0.287109375, "learning_rate": 2.562485389469754e-07, "loss": 0.0382, "num_tokens": 24004966.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 679.59375, "completions/mean_terminated_length": 643.9655151367188, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.5432098765432096, "grad_norm": 1.2688152577808474, "kl": 0.301513671875, "learning_rate": 2.5574874779013494e-07, "loss": 0.0335, "num_tokens": 24032845.0, "reward": 0.0, "reward_std": 0.23370715975761414, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 681.78125, "completions/mean_terminated_length": 618.4074096679688, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.5462962962962963, "grad_norm": 1.3680426474702223, "kl": 0.301025390625, "learning_rate": 2.5524893364427307e-07, "loss": -0.0296, "num_tokens": 24060926.0, "reward": 2.7939677238464355e-09, "reward_std": 0.13619740307331085, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 720.09375, "completions/mean_terminated_length": 635.0, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 2.549382716049383, "grad_norm": 1.3668319384193501, "kl": 0.2723388671875, "learning_rate": 2.547490985081272e-07, "loss": -0.0103, "num_tokens": 24090577.0, "reward": -3.725290298461914e-09, "reward_std": 0.19500115513801575, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 667.03125, "completions/mean_terminated_length": 630.1034545898438, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 2.552469135802469, "grad_norm": 1.5669038390289485, "kl": 0.2872314453125, "learning_rate": 2.5424924438051896e-07, "loss": 0.0276, "num_tokens": 24118426.0, "reward": 3.725290298461914e-09, "reward_std": 0.20564739406108856, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 643.78125, "completions/mean_terminated_length": 631.51611328125, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 2.5555555555555554, "grad_norm": 1.0590135894169592, "kl": 0.29736328125, "learning_rate": 2.5374937326034575e-07, "loss": -0.0271, "num_tokens": 24145019.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 809.375, "completions/mean_terminated_length": 725.3912963867188, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 2.558641975308642, "grad_norm": 1.1239317310925958, "kl": 0.307373046875, "learning_rate": 2.5324948714657287e-07, "loss": -0.0269, "num_tokens": 24177311.0, "reward": 0.0, "reward_std": 0.19456422328948975, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 764.40625, "completions/mean_terminated_length": 727.3214721679688, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.5617283950617287, "grad_norm": 1.0366865539180945, "kl": 0.288818359375, "learning_rate": 2.527495880382259e-07, "loss": -0.0707, "num_tokens": 24208148.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 724.71875, "completions/mean_terminated_length": 640.9199829101562, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 2.564814814814815, "grad_norm": 1.2986288395674328, "kl": 0.289306640625, "learning_rate": 2.522496779343819e-07, "loss": -0.0275, "num_tokens": 24237679.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 652.28125, "completions/mean_terminated_length": 627.5000610351562, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 2.567901234567901, "grad_norm": 0.8025661060787778, "kl": 0.3109130859375, "learning_rate": 2.5174975883416237e-07, "loss": 0.0056, "num_tokens": 24264684.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 729.59375, "completions/mean_terminated_length": 675.0740966796875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.5709876543209877, "grad_norm": 0.08401355068948127, "kl": 0.310546875, "learning_rate": 2.512498327367245e-07, "loss": 0.0003, "num_tokens": 24294691.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 734.125, "completions/mean_terminated_length": 704.137939453125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.574074074074074, "grad_norm": 1.0560475472697004, "kl": 0.286865234375, "learning_rate": 2.5074990164125355e-07, "loss": -0.0448, "num_tokens": 24324623.0, "reward": 0.0, "reward_std": 0.12547743320465088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 756.5625, "completions/mean_terminated_length": 707.0370483398438, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.5771604938271606, "grad_norm": 0.5883132573063482, "kl": 0.276611328125, "learning_rate": 2.502499675469547e-07, "loss": 0.0204, "num_tokens": 24355029.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 745.28125, "completions/mean_terminated_length": 705.4642944335938, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.580246913580247, "grad_norm": 0.7565401852315675, "kl": 0.262451171875, "learning_rate": 2.497500324530453e-07, "loss": -0.0087, "num_tokens": 24385450.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 733.59375, "completions/mean_terminated_length": 692.107177734375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 2.5833333333333335, "grad_norm": 0.8986246072885274, "kl": 0.2607421875, "learning_rate": 2.4925009835874643e-07, "loss": 0.0001, "num_tokens": 24415165.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 687.3125, "completions/mean_terminated_length": 664.86669921875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.5864197530864197, "grad_norm": 0.7745117491811915, "kl": 0.287353515625, "learning_rate": 2.4875016726327555e-07, "loss": 0.0201, "num_tokens": 24443231.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 737.46875, "completions/mean_terminated_length": 684.4074096679688, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 2.5895061728395063, "grad_norm": 1.27069380642628, "kl": 0.25390625, "learning_rate": 2.482502411658376e-07, "loss": 0.0084, "num_tokens": 24473338.0, "reward": 1.862645149230957e-09, "reward_std": 0.18805178999900818, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 5.587935447692871e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 743.1875, "completions/mean_terminated_length": 664.5599975585938, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 2.5925925925925926, "grad_norm": 1.0143959017732704, "kl": 0.28515625, "learning_rate": 2.477503220656181e-07, "loss": 0.0347, "num_tokens": 24503412.0, "reward": -5.587935447692871e-09, "reward_std": 0.16838626563549042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.7939677238464355e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 722.15625, "completions/mean_terminated_length": 666.25927734375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 2.5956790123456788, "grad_norm": 0.4954769982377692, "kl": 0.27880859375, "learning_rate": 2.472504119617742e-07, "loss": 0.0227, "num_tokens": 24532537.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 782.9375, "completions/mean_terminated_length": 748.5000610351562, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.5987654320987654, "grad_norm": 1.1298471444079328, "kl": 0.260009765625, "learning_rate": 2.4675051285342716e-07, "loss": -0.0419, "num_tokens": 24564011.0, "reward": -1.862645149230957e-09, "reward_std": 0.07559289038181305, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 745.0, "completions/mean_terminated_length": 693.3333129882812, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.601851851851852, "grad_norm": 0.6878738170747613, "kl": 0.3057861328125, "learning_rate": 2.462506267396543e-07, "loss": -0.0049, "num_tokens": 24594623.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 783.25, "completions/mean_terminated_length": 703.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 2.6049382716049383, "grad_norm": 0.5058747461535761, "kl": 0.29638671875, "learning_rate": 2.45750755619481e-07, "loss": 0.0012, "num_tokens": 24626243.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 668.65625, "completions/mean_terminated_length": 617.8928833007812, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 2.6080246913580245, "grad_norm": 1.3353346300638027, "kl": 0.2615966796875, "learning_rate": 2.452509014918728e-07, "loss": 0.0659, "num_tokens": 24653920.0, "reward": -3.725290298461914e-09, "reward_std": 0.1300242692232132, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 720.34375, "completions/mean_terminated_length": 635.3200073242188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 2.611111111111111, "grad_norm": 1.1611740377427653, "kl": 0.257080078125, "learning_rate": 2.4475106635572696e-07, "loss": 0.0056, "num_tokens": 24683467.0, "reward": 0.0, "reward_std": 0.16916194558143616, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 718.96875, "completions/mean_terminated_length": 675.3928833007812, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 2.6141975308641974, "grad_norm": 1.3708056657621666, "kl": 0.293212890625, "learning_rate": 2.4425125220986503e-07, "loss": -0.0198, "num_tokens": 24713138.0, "reward": -3.725290298461914e-09, "reward_std": 0.13394224643707275, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 687.9375, "completions/mean_terminated_length": 639.9285888671875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 2.617283950617284, "grad_norm": 0.953915107942522, "kl": 0.2908935546875, "learning_rate": 2.437514610530246e-07, "loss": -0.0103, "num_tokens": 24741080.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 729.40625, "completions/mean_terminated_length": 709.7667236328125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.6203703703703702, "grad_norm": 0.6262904933841149, "kl": 0.264892578125, "learning_rate": 2.4325169488385137e-07, "loss": -0.0066, "num_tokens": 24770901.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 719.34375, "completions/mean_terminated_length": 649.0385131835938, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 2.623456790123457, "grad_norm": 1.1377923496997484, "kl": 0.2659912109375, "learning_rate": 2.4275195570089083e-07, "loss": -0.0015, "num_tokens": 24800116.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 822.9375, "completions/mean_terminated_length": 702.2999877929688, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 2.626543209876543, "grad_norm": 1.588263612638342, "kl": 0.275634765625, "learning_rate": 2.42252245502581e-07, "loss": -0.0301, "num_tokens": 24833350.0, "reward": -1.862645149230957e-09, "reward_std": 0.2094171941280365, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.4016096591949463, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 808.625, "completions/mean_terminated_length": 736.8333740234375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 2.6296296296296298, "grad_norm": 1.2086816185668496, "kl": 0.263671875, "learning_rate": 2.417525662872436e-07, "loss": -0.0232, "num_tokens": 24866138.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 725.90625, "completions/mean_terminated_length": 683.3214721679688, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 2.632716049382716, "grad_norm": 1.4309719486748265, "kl": 0.2666015625, "learning_rate": 2.412529200530767e-07, "loss": -0.0447, "num_tokens": 24895843.0, "reward": -3.725290298461914e-09, "reward_std": 0.20958730578422546, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 751.28125, "completions/mean_terminated_length": 712.3214721679688, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.6358024691358026, "grad_norm": 0.8428307152060934, "kl": 0.2628173828125, "learning_rate": 2.407533087981463e-07, "loss": -0.0232, "num_tokens": 24926684.0, "reward": 0.0, "reward_std": 0.12123563885688782, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 653.0625, "completions/mean_terminated_length": 641.0967407226562, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.638888888888889, "grad_norm": 1.4396347735888524, "kl": 0.2926025390625, "learning_rate": 2.4025373452037865e-07, "loss": -0.0344, "num_tokens": 24953906.0, "reward": 1.862645149230957e-09, "reward_std": 0.20773616433143616, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 746.0625, "completions/mean_terminated_length": 681.923095703125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.6419753086419755, "grad_norm": 1.3373908056729427, "kl": 0.30126953125, "learning_rate": 2.3975419921755215e-07, "loss": 0.0316, "num_tokens": 24983972.0, "reward": 0.0, "reward_std": 0.15817604959011078, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 726.75, "completions/mean_terminated_length": 658.1538696289062, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 2.6450617283950617, "grad_norm": 1.386289547194358, "kl": 0.264892578125, "learning_rate": 2.3925470488728935e-07, "loss": -0.0423, "num_tokens": 25013972.0, "reward": 3.725290298461914e-09, "reward_std": 0.14240965247154236, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 661.8125, "completions/mean_terminated_length": 650.1290283203125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.648148148148148, "grad_norm": 0.9448071870602488, "kl": 0.2669677734375, "learning_rate": 2.3875525352704866e-07, "loss": -0.0172, "num_tokens": 25041362.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 676.0, "completions/mean_terminated_length": 652.800048828125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 2.6512345679012346, "grad_norm": 1.0594595146085641, "kl": 0.2581787109375, "learning_rate": 2.38255847134117e-07, "loss": 0.0087, "num_tokens": 25069366.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 793.6875, "completions/mean_terminated_length": 673.047607421875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.6543209876543212, "grad_norm": 1.750820591589366, "kl": 0.2794189453125, "learning_rate": 2.3775648770560126e-07, "loss": -0.0002, "num_tokens": 25101712.0, "reward": -3.725290298461914e-09, "reward_std": 0.2277967929840088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 778.875, "completions/mean_terminated_length": 710.239990234375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 2.6574074074074074, "grad_norm": 1.1716960640053096, "kl": 0.253662109375, "learning_rate": 2.3725717723842066e-07, "loss": -0.0033, "num_tokens": 25133112.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 773.78125, "completions/mean_terminated_length": 690.375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.6604938271604937, "grad_norm": 0.5074171873813702, "kl": 0.305419921875, "learning_rate": 2.3675791772929862e-07, "loss": 0.0246, "num_tokens": 25164653.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 698.65625, "completions/mean_terminated_length": 652.1785888671875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 2.6635802469135803, "grad_norm": 0.9363851628606226, "kl": 0.2802734375, "learning_rate": 2.3625871117475466e-07, "loss": 0.0002, "num_tokens": 25193398.0, "reward": 3.725290298461914e-09, "reward_std": 0.12879827618598938, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 710.21875, "completions/mean_terminated_length": 652.1111450195312, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.6666666666666665, "grad_norm": 0.8397991235587109, "kl": 0.268310546875, "learning_rate": 2.357595595710967e-07, "loss": -0.0081, "num_tokens": 25222269.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 718.28125, "completions/mean_terminated_length": 661.6666870117188, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 2.669753086419753, "grad_norm": 2.147673799202004, "kl": 0.27978515625, "learning_rate": 2.3526046491441277e-07, "loss": -0.0447, "num_tokens": 25251902.0, "reward": 4.656612873077393e-10, "reward_std": 0.17102104425430298, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 5.122274160385132e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 762.28125, "completions/mean_terminated_length": 675.0416870117188, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.6728395061728394, "grad_norm": 1.3506779003869993, "kl": 0.320068359375, "learning_rate": 2.3476142920056315e-07, "loss": -0.0104, "num_tokens": 25282779.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 760.34375, "completions/mean_terminated_length": 722.6785888671875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 2.675925925925926, "grad_norm": 0.9226034555074042, "kl": 0.28125, "learning_rate": 2.3426245442517254e-07, "loss": 0.0013, "num_tokens": 25313570.0, "reward": -3.725290298461914e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 613.21875, "completions/mean_terminated_length": 599.9677124023438, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 2.6790123456790123, "grad_norm": 1.2276290914825594, "kl": 0.29150390625, "learning_rate": 2.3376354258362185e-07, "loss": -0.0218, "num_tokens": 25339257.0, "reward": -3.725290298461914e-09, "reward_std": 0.19173535704612732, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 792.75, "completions/mean_terminated_length": 715.6666870117188, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 2.682098765432099, "grad_norm": 0.5015083810127813, "kl": 0.2744140625, "learning_rate": 2.3326469567104044e-07, "loss": 0.0187, "num_tokens": 25371121.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 715.1875, "completions/mean_terminated_length": 643.923095703125, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.685185185185185, "grad_norm": 0.011519412352654029, "kl": 0.294677734375, "learning_rate": 2.3276591568229787e-07, "loss": 0.0003, "num_tokens": 25400171.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 775.65625, "completions/mean_terminated_length": 706.1199951171875, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 2.6882716049382713, "grad_norm": 1.0218626235133945, "kl": 0.2607421875, "learning_rate": 2.3226720461199626e-07, "loss": -0.044, "num_tokens": 25432148.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 849.0, "completions/mean_terminated_length": 694.5882568359375, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 2.691358024691358, "grad_norm": 0.5338448545770911, "kl": 0.2791748046875, "learning_rate": 2.3176856445446187e-07, "loss": 0.0251, "num_tokens": 25466384.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 726.125, "completions/mean_terminated_length": 695.3103637695312, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.6944444444444446, "grad_norm": 1.1476966935091446, "kl": 0.258056640625, "learning_rate": 2.3126999720373757e-07, "loss": -0.0227, "num_tokens": 25496188.0, "reward": -3.725290298461914e-09, "reward_std": 0.17643702030181885, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 821.59375, "completions/mean_terminated_length": 742.3912963867188, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.697530864197531, "grad_norm": 0.9983732012570077, "kl": 0.2777099609375, "learning_rate": 2.3077150485357477e-07, "loss": 0.001, "num_tokens": 25528835.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 790.125, "completions/mean_terminated_length": 724.6399536132812, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 2.700617283950617, "grad_norm": 0.6854490421533415, "kl": 0.2733154296875, "learning_rate": 2.3027308939742502e-07, "loss": -0.0212, "num_tokens": 25561083.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 741.53125, "completions/mean_terminated_length": 662.4400024414062, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 2.7037037037037037, "grad_norm": 1.2926251434162352, "kl": 0.2996826171875, "learning_rate": 2.2977475282843266e-07, "loss": 0.0043, "num_tokens": 25591296.0, "reward": 1.862645149230957e-09, "reward_std": 0.15677526593208313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 688.09375, "completions/mean_terminated_length": 677.258056640625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 2.7067901234567904, "grad_norm": 0.609544326786698, "kl": 0.273681640625, "learning_rate": 2.292764971394265e-07, "loss": 0.0108, "num_tokens": 25619547.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 638.84619140625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 2.7098765432098766, "grad_norm": 0.8579878365838796, "kl": 0.284912109375, "learning_rate": 2.2877832432291188e-07, "loss": -0.001, "num_tokens": 25648465.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 731.71875, "completions/mean_terminated_length": 701.4827270507812, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.712962962962963, "grad_norm": 0.8799359219560791, "kl": 0.2803955078125, "learning_rate": 2.2828023637106273e-07, "loss": -0.0087, "num_tokens": 25678656.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 721.8125, "completions/mean_terminated_length": 678.6428833007812, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 2.7160493827160495, "grad_norm": 1.0607403211756514, "kl": 0.287109375, "learning_rate": 2.2778223527571362e-07, "loss": -0.0256, "num_tokens": 25708638.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 793.875, "completions/mean_terminated_length": 729.4400024414062, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.7191358024691357, "grad_norm": 1.1726143724672464, "kl": 0.274169921875, "learning_rate": 2.2728432302835183e-07, "loss": -0.0561, "num_tokens": 25740870.0, "reward": 1.862645149230957e-09, "reward_std": 0.21564045548439026, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 773.15625, "completions/mean_terminated_length": 702.9199829101562, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 2.7222222222222223, "grad_norm": 0.8321198331902351, "kl": 0.273681640625, "learning_rate": 2.2678650162010937e-07, "loss": 0.022, "num_tokens": 25771943.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 751.78125, "completions/mean_terminated_length": 661.0416870117188, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.7253086419753085, "grad_norm": 1.250253132035384, "kl": 0.2916259765625, "learning_rate": 2.2628877304175472e-07, "loss": -0.0401, "num_tokens": 25802876.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 696.4375, "completions/mean_terminated_length": 662.5516967773438, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.728395061728395, "grad_norm": 1.1679716124487187, "kl": 0.2510986328125, "learning_rate": 2.2579113928368548e-07, "loss": -0.025, "num_tokens": 25831410.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 760.3125, "completions/mean_terminated_length": 711.4815063476562, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 2.7314814814814814, "grad_norm": 3.346991532297941, "kl": 1.3779296875, "learning_rate": 2.2529360233591997e-07, "loss": 0.0036, "num_tokens": 25862852.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 712.3125, "completions/mean_terminated_length": 640.3846435546875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 2.734567901234568, "grad_norm": 1.5899803042298926, "kl": 0.533935546875, "learning_rate": 2.2479616418808915e-07, "loss": -0.0398, "num_tokens": 25891950.0, "reward": -1.862645149230957e-09, "reward_std": 0.18978948891162872, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 763.96875, "completions/mean_terminated_length": 691.1599731445312, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.7376543209876543, "grad_norm": 1.063542597921851, "kl": 0.297607421875, "learning_rate": 2.242988268294292e-07, "loss": 0.0266, "num_tokens": 25923333.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 710.4375, "completions/mean_terminated_length": 665.6428833007812, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.7407407407407405, "grad_norm": 0.5474607759704945, "kl": 0.2763671875, "learning_rate": 2.23801592248773e-07, "loss": 0.0149, "num_tokens": 25951927.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 787.90625, "completions/mean_terminated_length": 664.2380981445312, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 2.743827160493827, "grad_norm": 0.9670648589224451, "kl": 0.31005859375, "learning_rate": 2.2330446243454265e-07, "loss": -0.0183, "num_tokens": 25983920.0, "reward": 3.725290298461914e-09, "reward_std": 0.1265953779220581, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 664.59375, "completions/mean_terminated_length": 613.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 2.746913580246914, "grad_norm": 1.1402925599138931, "kl": 0.2781982421875, "learning_rate": 2.228074393747412e-07, "loss": -0.0077, "num_tokens": 26011571.0, "reward": 1.862645149230957e-09, "reward_std": 0.15863974392414093, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 767.375, "completions/mean_terminated_length": 681.8333740234375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 2.75, "grad_norm": 1.3522779112465206, "kl": 0.273681640625, "learning_rate": 2.2231052505694458e-07, "loss": -0.0317, "num_tokens": 26042987.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 766.8125, "completions/mean_terminated_length": 694.7999877929688, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.753086419753086, "grad_norm": 0.6430250085969144, "kl": 0.2457275390625, "learning_rate": 2.2181372146829418e-07, "loss": 0.0003, "num_tokens": 26074093.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 805.65625, "completions/mean_terminated_length": 732.875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 2.756172839506173, "grad_norm": 1.0707958292063462, "kl": 0.2540283203125, "learning_rate": 2.213170305954884e-07, "loss": -0.0176, "num_tokens": 26106926.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 727.0625, "completions/mean_terminated_length": 672.0740966796875, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.7592592592592595, "grad_norm": 1.0614821263774508, "kl": 0.307373046875, "learning_rate": 2.2082045442477497e-07, "loss": 0.0029, "num_tokens": 26137132.0, "reward": -3.725290298461914e-09, "reward_std": 0.1258947253227234, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 777.53125, "completions/mean_terminated_length": 681.0869750976562, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 2.7623456790123457, "grad_norm": 1.570547442403824, "kl": 0.2452392578125, "learning_rate": 2.2032399494194292e-07, "loss": 0.0891, "num_tokens": 26168925.0, "reward": 3.725290298461914e-09, "reward_std": 0.1507483720779419, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 734.78125, "completions/mean_terminated_length": 681.2222290039062, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 2.765432098765432, "grad_norm": 1.0004121014341636, "kl": 0.275634765625, "learning_rate": 2.1982765413231466e-07, "loss": -0.0032, "num_tokens": 26199166.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 771.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 2.7685185185185186, "grad_norm": 1.0749553660395452, "kl": 0.29248046875, "learning_rate": 2.1933143398073805e-07, "loss": 0.0248, "num_tokens": 26230666.0, "reward": 0.05624999850988388, "reward_std": 0.06495190411806107, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0625, "rewards/logprob_reward/std": 0.24593468010425568, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 747.75, "completions/mean_terminated_length": 684.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.771604938271605, "grad_norm": 0.8422264113509557, "kl": 0.2744140625, "learning_rate": 2.1883533647157828e-07, "loss": -0.0057, "num_tokens": 26261542.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 745.9375, "completions/mean_terminated_length": 717.1724243164062, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.7746913580246915, "grad_norm": 1.7566995325771781, "kl": 0.259521484375, "learning_rate": 2.1833936358871045e-07, "loss": 0.0919, "num_tokens": 26292444.0, "reward": 3.725290298461914e-09, "reward_std": 0.22395765781402588, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 734.46875, "completions/mean_terminated_length": 704.5172119140625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.7777777777777777, "grad_norm": 0.01226161702759194, "kl": 0.28515625, "learning_rate": 2.1784351731551077e-07, "loss": 0.0003, "num_tokens": 26322183.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 796.46875, "completions/mean_terminated_length": 707.434814453125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 2.7808641975308643, "grad_norm": 1.2244764487572017, "kl": 0.26123046875, "learning_rate": 2.1734779963484959e-07, "loss": 0.0212, "num_tokens": 26354178.0, "reward": -1.862645149230957e-09, "reward_std": 0.19498921930789948, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 813.0625, "completions/mean_terminated_length": 774.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 2.7839506172839505, "grad_norm": 1.287437015814438, "kl": 0.2586669921875, "learning_rate": 2.1685221252908282e-07, "loss": 0.0242, "num_tokens": 26387348.0, "reward": 0.0, "reward_std": 0.21456822752952576, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 749.96875, "completions/mean_terminated_length": 686.7307739257812, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 2.787037037037037, "grad_norm": 1.41860485491689, "kl": 0.3116455078125, "learning_rate": 2.163567579800443e-07, "loss": 0.0032, "num_tokens": 26418071.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 775.59375, "completions/mean_terminated_length": 678.3912963867188, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 2.7901234567901234, "grad_norm": 1.054656133279091, "kl": 0.2364501953125, "learning_rate": 2.1586143796903775e-07, "loss": -0.0097, "num_tokens": 26449738.0, "reward": -5.587935447692871e-09, "reward_std": 0.17700429260730743, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 717.15625, "completions/mean_terminated_length": 673.3214721679688, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 2.7932098765432096, "grad_norm": 1.1037996347432708, "kl": 0.265625, "learning_rate": 2.1536625447682877e-07, "loss": 0.012, "num_tokens": 26478983.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 767.375, "completions/mean_terminated_length": 650.727294921875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.7962962962962963, "grad_norm": 1.6015105918910906, "kl": 0.262451171875, "learning_rate": 2.1487120948363713e-07, "loss": -0.0399, "num_tokens": 26509807.0, "reward": 0.0, "reward_std": 0.2630038857460022, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 5.587935447692871e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 777.8125, "completions/mean_terminated_length": 665.9091186523438, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 2.799382716049383, "grad_norm": 0.7547204069363477, "kl": 0.2603759765625, "learning_rate": 2.1437630496912889e-07, "loss": 0.0161, "num_tokens": 26541777.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 842.5625, "completions/mean_terminated_length": 733.7000122070312, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 2.802469135802469, "grad_norm": 0.9650770767730723, "kl": 0.2747802734375, "learning_rate": 2.1388154291240794e-07, "loss": 0.021, "num_tokens": 26575815.0, "reward": 0.0, "reward_std": 0.15334317088127136, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 679.5, "completions/mean_terminated_length": 630.2857666015625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.8055555555555554, "grad_norm": 0.8565306768480461, "kl": 0.2716064453125, "learning_rate": 2.133869252920089e-07, "loss": -0.0256, "num_tokens": 26603619.0, "reward": 0.0, "reward_std": 0.126713365316391, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 719.1875, "completions/mean_terminated_length": 648.84619140625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 2.808641975308642, "grad_norm": 1.2245127423131754, "kl": 0.2950439453125, "learning_rate": 2.128924540858885e-07, "loss": -0.0015, "num_tokens": 26632957.0, "reward": 0.0, "reward_std": 0.14260268211364746, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 734.5625, "completions/mean_terminated_length": 693.2142944335938, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.8117283950617287, "grad_norm": 0.8344859032295361, "kl": 0.2655029296875, "learning_rate": 2.1239813127141828e-07, "loss": 0.0041, "num_tokens": 26662559.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 739.8125, "completions/mean_terminated_length": 674.2307739257812, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 2.814814814814815, "grad_norm": 0.7417314091719092, "kl": 0.2696533203125, "learning_rate": 2.1190395882537598e-07, "loss": 0.0198, "num_tokens": 26692685.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 727.78125, "completions/mean_terminated_length": 659.423095703125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.817901234567901, "grad_norm": 1.030812093252084, "kl": 0.2685546875, "learning_rate": 2.1140993872393833e-07, "loss": -0.0115, "num_tokens": 26722202.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 666.65625, "completions/mean_terminated_length": 615.607177734375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.8209876543209877, "grad_norm": 0.009137411678997759, "kl": 0.306884765625, "learning_rate": 2.1091607294267269e-07, "loss": 0.0003, "num_tokens": 26750283.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 731.6875, "completions/mean_terminated_length": 664.2307739257812, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 2.824074074074074, "grad_norm": 1.4408771794989097, "kl": 0.25830078125, "learning_rate": 2.1042236345652947e-07, "loss": -0.0312, "num_tokens": 26780637.0, "reward": -3.725290298461914e-09, "reward_std": 0.24957561492919922, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 796.34375, "completions/mean_terminated_length": 732.5999755859375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.8271604938271606, "grad_norm": 0.8706589470234878, "kl": 0.2572021484375, "learning_rate": 2.0992881223983368e-07, "loss": -0.0138, "num_tokens": 26812788.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 715.03125, "completions/mean_terminated_length": 643.7307739257812, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.830246913580247, "grad_norm": 0.9545348220695438, "kl": 0.294189453125, "learning_rate": 2.0943542126627784e-07, "loss": -0.0341, "num_tokens": 26842329.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 750.5625, "completions/mean_terminated_length": 699.9259033203125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.8333333333333335, "grad_norm": 1.076229224187121, "kl": 0.258056640625, "learning_rate": 2.0894219250891352e-07, "loss": 0.0413, "num_tokens": 26873135.0, "reward": -3.725290298461914e-09, "reward_std": 0.1746388077735901, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 777.25, "completions/mean_terminated_length": 680.6956787109375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 2.8364197530864197, "grad_norm": 0.810129740101571, "kl": 0.300537109375, "learning_rate": 2.0844912794014341e-07, "loss": 0.0126, "num_tokens": 26904691.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 718.5625, "completions/mean_terminated_length": 648.0769653320312, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.8395061728395063, "grad_norm": 1.028209853074001, "kl": 0.2764892578125, "learning_rate": 2.079562295317139e-07, "loss": 0.0078, "num_tokens": 26934061.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 717.25, "completions/mean_terminated_length": 660.4444580078125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 2.8425925925925926, "grad_norm": 0.6148676306692956, "kl": 0.2845458984375, "learning_rate": 2.0746349925470672e-07, "loss": 0.0091, "num_tokens": 26963113.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 723.75, "completions/mean_terminated_length": 654.4615478515625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 2.8456790123456788, "grad_norm": 1.5714184184051678, "kl": 0.2623291015625, "learning_rate": 2.0697093907953134e-07, "loss": 0.0209, "num_tokens": 26992537.0, "reward": -5.587935447692871e-09, "reward_std": 0.22554031014442444, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 766.34375, "completions/mean_terminated_length": 694.2000122070312, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 2.8487654320987654, "grad_norm": 0.9648933776455749, "kl": 0.2469482421875, "learning_rate": 2.0647855097591704e-07, "loss": -0.0177, "num_tokens": 27023660.0, "reward": 1.862645149230957e-09, "reward_std": 0.15884052217006683, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 745.5, "completions/mean_terminated_length": 693.9259033203125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.851851851851852, "grad_norm": 0.6825799829565506, "kl": 0.26904296875, "learning_rate": 2.0598633691290485e-07, "loss": 0.03, "num_tokens": 27054124.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 746.71875, "completions/mean_terminated_length": 654.2916870117188, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 2.8549382716049383, "grad_norm": 0.7756896185517373, "kl": 0.28369140625, "learning_rate": 2.054942988588399e-07, "loss": 0.005, "num_tokens": 27085243.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 689.09375, "completions/mean_terminated_length": 654.4483032226562, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.8580246913580245, "grad_norm": 1.2003046392689023, "kl": 0.2535400390625, "learning_rate": 2.050024387813634e-07, "loss": 0.0221, "num_tokens": 27113410.0, "reward": -3.725290298461914e-09, "reward_std": 0.15639330446720123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 665.53125, "completions/mean_terminated_length": 641.6333618164062, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 2.861111111111111, "grad_norm": 1.390477833783607, "kl": 0.26611328125, "learning_rate": 2.0451075864740496e-07, "loss": -0.0009, "num_tokens": 27140931.0, "reward": -3.725290298461914e-09, "reward_std": 0.17154675722122192, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 672.15625, "completions/mean_terminated_length": 607.0, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.8641975308641974, "grad_norm": 0.9431352061890065, "kl": 0.3013916015625, "learning_rate": 2.0401926042317455e-07, "loss": 0.0218, "num_tokens": 27168744.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 928 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 750.0, "completions/mean_terminated_length": 686.7692260742188, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 2.867283950617284, "grad_norm": 1.28669571470756, "kl": NaN, "learning_rate": 2.0352794607415465e-07, "loss": 0.0356, "num_tokens": 27199136.0, "reward": 2.7939677238464355e-09, "reward_std": 0.18770192563533783, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 708.53125, "completions/mean_terminated_length": 650.1111450195312, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 2.8703703703703702, "grad_norm": 1.0583589591588964, "kl": 0.270263671875, "learning_rate": 2.0303681756509254e-07, "loss": -0.0197, "num_tokens": 27228653.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 742.15625, "completions/mean_terminated_length": 677.1154174804688, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 2.873456790123457, "grad_norm": 0.8061770656390963, "kl": 0.2479248046875, "learning_rate": 2.0254587685999215e-07, "loss": -0.021, "num_tokens": 27258490.0, "reward": 0.0, "reward_std": 0.14086535573005676, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 730.5625, "completions/mean_terminated_length": 688.6428833007812, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 2.876543209876543, "grad_norm": 0.6225019361821078, "kl": 0.298828125, "learning_rate": 2.020551259221066e-07, "loss": 0.0032, "num_tokens": 27288048.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 720.84375, "completions/mean_terminated_length": 664.7037353515625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.8796296296296298, "grad_norm": 0.5814236458811808, "kl": 0.26953125, "learning_rate": 2.0156456671392988e-07, "loss": 0.012, "num_tokens": 27317987.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 768.96875, "completions/mean_terminated_length": 721.74072265625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.882716049382716, "grad_norm": 0.8938298014596324, "kl": 0.27099609375, "learning_rate": 2.010742011971895e-07, "loss": 0.0008, "num_tokens": 27349590.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 699.90625, "completions/mean_terminated_length": 653.607177734375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.8858024691358026, "grad_norm": 0.9615668383520513, "kl": 0.2698974609375, "learning_rate": 2.005840313328383e-07, "loss": -0.0073, "num_tokens": 27378359.0, "reward": 9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 770.1875, "completions/mean_terminated_length": 699.1199951171875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.888888888888889, "grad_norm": 1.093017923551296, "kl": 0.2379150390625, "learning_rate": 2.0009405908104673e-07, "loss": 0.0117, "num_tokens": 27410009.0, "reward": 4.656612873077393e-10, "reward_std": 0.14578182995319366, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.3110855221748352, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 731.9375, "completions/mean_terminated_length": 664.5385131835938, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 2.8919753086419755, "grad_norm": 1.5894945367007276, "kl": 0.28271484375, "learning_rate": 1.996042864011951e-07, "loss": -0.034, "num_tokens": 27439831.0, "reward": 0.0, "reward_std": 0.25309431552886963, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 709.03125, "completions/mean_terminated_length": 664.0357666015625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 2.8950617283950617, "grad_norm": 1.1271701128434652, "kl": 0.2523193359375, "learning_rate": 1.9911471525186534e-07, "loss": 0.0089, "num_tokens": 27469216.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 749.6875, "completions/mean_terminated_length": 710.5000610351562, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 2.898148148148148, "grad_norm": 1.4096152948567902, "kl": 0.2711181640625, "learning_rate": 1.9862534759083379e-07, "loss": 0.0582, "num_tokens": 27500262.0, "reward": 3.725290298461914e-09, "reward_std": 0.2868836522102356, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 637.5, "completions/mean_terminated_length": 611.7333374023438, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.9012345679012346, "grad_norm": 1.71495424482088, "kl": 0.28076171875, "learning_rate": 1.9813618537506302e-07, "loss": 0.0773, "num_tokens": 27526690.0, "reward": 1.862645149230957e-09, "reward_std": 0.2652709484100342, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 716.34375, "completions/mean_terminated_length": 630.2000122070312, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 2.9043209876543212, "grad_norm": 1.3088744357136437, "kl": 0.26611328125, "learning_rate": 1.9764723056069365e-07, "loss": -0.0102, "num_tokens": 27556405.0, "reward": 0.0, "reward_std": 0.1590607464313507, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 678.3125, "completions/mean_terminated_length": 581.5199584960938, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.9074074074074074, "grad_norm": 2.092974041961588, "kl": 0.2655029296875, "learning_rate": 1.9715848510303739e-07, "loss": -0.0471, "num_tokens": 27584243.0, "reward": -7.450580596923828e-09, "reward_std": 0.21604543924331665, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 731.25, "completions/mean_terminated_length": 677.0370483398438, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 2.9104938271604937, "grad_norm": 1.3904426704990516, "kl": 0.2652587890625, "learning_rate": 1.966699509565685e-07, "loss": 0.0217, "num_tokens": 27614263.0, "reward": -3.725290298461914e-09, "reward_std": 0.19446446001529694, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 706.75, "completions/mean_terminated_length": 661.4285888671875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 2.9135802469135803, "grad_norm": 0.9308651722253931, "kl": 0.26953125, "learning_rate": 1.961816300749163e-07, "loss": -0.0338, "num_tokens": 27643043.0, "reward": 1.862645149230957e-09, "reward_std": 0.1556938886642456, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 657.28125, "completions/mean_terminated_length": 632.8333740234375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.9166666666666665, "grad_norm": 0.7422821322314068, "kl": 0.2630615234375, "learning_rate": 1.9569352441085712e-07, "loss": 0.0029, "num_tokens": 27670192.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 637.1199951171875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 2.919753086419753, "grad_norm": 2.666645959207306, "kl": 0.272216796875, "learning_rate": 1.9520563591630686e-07, "loss": -0.0778, "num_tokens": 27700040.0, "reward": 0.0, "reward_std": 0.2386818826198578, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.4016096889972687, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 676.125, "completions/mean_terminated_length": 626.4285888671875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.9228395061728394, "grad_norm": 0.5929827461269377, "kl": 0.2696533203125, "learning_rate": 1.9471796654231278e-07, "loss": 0.0212, "num_tokens": 27728256.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 733.59375, "completions/mean_terminated_length": 666.5769653320312, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.925925925925926, "grad_norm": 0.8802088847347731, "kl": 0.25, "learning_rate": 1.9423051823904602e-07, "loss": 0.0108, "num_tokens": 27758147.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 755.03125, "completions/mean_terminated_length": 679.719970703125, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.9290123456790123, "grad_norm": 0.7138798730919996, "kl": 0.2115478515625, "learning_rate": 1.9374329295579372e-07, "loss": 0.0128, "num_tokens": 27788944.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 754.6875, "completions/mean_terminated_length": 664.9166870117188, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.932098765432099, "grad_norm": 0.9512255663617465, "kl": 0.260009765625, "learning_rate": 1.9325629264095083e-07, "loss": -0.0188, "num_tokens": 27819590.0, "reward": -9.313225746154785e-10, "reward_std": 0.1554727405309677, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 722.625, "completions/mean_terminated_length": 653.0769653320312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.935185185185185, "grad_norm": 1.409302550476725, "kl": 0.259765625, "learning_rate": 1.9276951924201304e-07, "loss": 0.063, "num_tokens": 27849070.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 718.40625, "completions/mean_terminated_length": 661.8148193359375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.9382716049382713, "grad_norm": 1.1825110300486403, "kl": 0.25634765625, "learning_rate": 1.922829747055684e-07, "loss": 0.0264, "num_tokens": 27878551.0, "reward": -9.313225746154785e-10, "reward_std": 0.15734902024269104, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 739.1875, "completions/mean_terminated_length": 673.4615478515625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 2.941358024691358, "grad_norm": 0.5487079637357012, "kl": 0.2608642578125, "learning_rate": 1.9179666097728982e-07, "loss": 0.0251, "num_tokens": 27908785.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 769.3125, "completions/mean_terminated_length": 710.5385131835938, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.9444444444444446, "grad_norm": 1.6200495429814423, "kl": 0.24365234375, "learning_rate": 1.9131058000192726e-07, "loss": -0.0238, "num_tokens": 27940015.0, "reward": 7.450580596923828e-09, "reward_std": 0.22852018475532532, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 774.6875, "completions/mean_terminated_length": 691.5833740234375, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 2.947530864197531, "grad_norm": 1.159852418700381, "kl": 0.2657470703125, "learning_rate": 1.9082473372329983e-07, "loss": -0.0226, "num_tokens": 27971413.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 797.34375, "completions/mean_terminated_length": 721.7916870117188, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 2.950617283950617, "grad_norm": 1.156321761903506, "kl": 0.261962890625, "learning_rate": 1.903391240842882e-07, "loss": 0.0187, "num_tokens": 28003748.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 720.78125, "completions/mean_terminated_length": 689.413818359375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.9537037037037037, "grad_norm": 1.1155823592840006, "kl": 0.2425537109375, "learning_rate": 1.8985375302682654e-07, "loss": 0.068, "num_tokens": 28033169.0, "reward": 0.0, "reward_std": 0.15829598903656006, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 723.34375, "completions/mean_terminated_length": 667.6666870117188, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.9567901234567904, "grad_norm": 0.9279735158564448, "kl": 0.3826904296875, "learning_rate": 1.8936862249189515e-07, "loss": -0.0023, "num_tokens": 28062844.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 692.71875, "completions/mean_terminated_length": 658.4483032226562, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.9598765432098766, "grad_norm": 0.05712688361471471, "kl": 0.302978515625, "learning_rate": 1.8888373441951228e-07, "loss": 0.0003, "num_tokens": 28091531.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 730.6875, "completions/mean_terminated_length": 648.5599975585938, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.962962962962963, "grad_norm": 0.9067233994247506, "kl": 0.3291015625, "learning_rate": 1.8839909074872675e-07, "loss": -0.0369, "num_tokens": 28121809.0, "reward": 1.862645149230957e-09, "reward_std": 0.156063511967659, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 728.25, "completions/mean_terminated_length": 660.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 2.9660493827160495, "grad_norm": 1.4283129287704093, "kl": 0.3004150390625, "learning_rate": 1.8791469341761e-07, "loss": 0.0217, "num_tokens": 28151341.0, "reward": 0.0, "reward_std": 0.18008019030094147, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 735.21875, "completions/mean_terminated_length": 681.74072265625, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 2.9691358024691357, "grad_norm": 1.8621004104764296, "kl": 0.268310546875, "learning_rate": 1.8743054436324835e-07, "loss": 0.0051, "num_tokens": 28181448.0, "reward": 9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 675.40625, "completions/mean_terminated_length": 652.1666870117188, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.9722222222222223, "grad_norm": 1.1859538802593061, "kl": 0.2818603515625, "learning_rate": 1.8694664552173529e-07, "loss": -0.0056, "num_tokens": 28209257.0, "reward": 0.0, "reward_std": 0.14223133027553558, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 661.15625, "completions/mean_terminated_length": 661.15625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.9753086419753085, "grad_norm": 1.361886747157153, "kl": 0.2763671875, "learning_rate": 1.8646299882816358e-07, "loss": 0.0308, "num_tokens": 28236594.0, "reward": 0.0, "reward_std": 0.18217067420482635, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 747.96875, "completions/mean_terminated_length": 696.8518676757812, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 2.978395061728395, "grad_norm": 0.008830294254685428, "kl": 0.25732421875, "learning_rate": 1.859796062166178e-07, "loss": 0.0003, "num_tokens": 28267045.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 740.84375, "completions/mean_terminated_length": 700.3928833007812, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 2.9814814814814814, "grad_norm": 0.009799257206895364, "kl": 0.2650146484375, "learning_rate": 1.854964696201666e-07, "loss": 0.0003, "num_tokens": 28297612.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 755.59375, "completions/mean_terminated_length": 666.125, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.984567901234568, "grad_norm": 1.1439797351882075, "kl": 0.2791748046875, "learning_rate": 1.850135909708544e-07, "loss": -0.0241, "num_tokens": 28328503.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 750.1875, "completions/mean_terminated_length": 643.0435180664062, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.9876543209876543, "grad_norm": 0.8828282768159587, "kl": 0.2965087890625, "learning_rate": 1.8453097219969448e-07, "loss": 0.0069, "num_tokens": 28359061.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 707.125, "completions/mean_terminated_length": 648.4444580078125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 2.9907407407407405, "grad_norm": 0.9626889615466286, "kl": 0.3009033203125, "learning_rate": 1.8404861523666073e-07, "loss": -0.0041, "num_tokens": 28387885.0, "reward": -3.725290298461914e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 715.71875, "completions/mean_terminated_length": 658.629638671875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 2.993827160493827, "grad_norm": 0.7579628972107867, "kl": 0.2510986328125, "learning_rate": 1.8356652201068024e-07, "loss": 0.0201, "num_tokens": 28417352.0, "reward": 0.02812499925494194, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 737.03125, "completions/mean_terminated_length": 683.888916015625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 2.996913580246914, "grad_norm": 0.9859395013297854, "kl": 0.266845703125, "learning_rate": 1.830846944496251e-07, "loss": -0.0003, "num_tokens": 28447729.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 684.53125, "completions/mean_terminated_length": 661.9000244140625, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 3.0, "grad_norm": 1.0103808656080384, "kl": 0.248291015625, "learning_rate": 1.826031344803053e-07, "loss": -0.0324, "num_tokens": 28476310.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 707.0625, "completions/mean_terminated_length": 685.933349609375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 3.003086419753086, "grad_norm": 1.0766089362078926, "kl": 0.254150390625, "learning_rate": 1.8212184402846064e-07, "loss": -0.0173, "num_tokens": 28505100.0, "reward": 0.0, "reward_std": 0.1465846300125122, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 804.34375, "completions/mean_terminated_length": 704.5, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 3.006172839506173, "grad_norm": 0.8076015127764825, "kl": 0.2384033203125, "learning_rate": 1.8164082501875326e-07, "loss": -0.0094, "num_tokens": 28538199.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 815.625, "completions/mean_terminated_length": 706.4761962890625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 3.009259259259259, "grad_norm": 0.7930158030735597, "kl": 0.2666015625, "learning_rate": 1.8116007937475947e-07, "loss": 0.0072, "num_tokens": 28570635.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 771.9375, "completions/mean_terminated_length": 725.25927734375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 3.0123456790123457, "grad_norm": 0.9088831310724863, "kl": 0.2623291015625, "learning_rate": 1.8067960901896278e-07, "loss": -0.0142, "num_tokens": 28602277.0, "reward": 0.0, "reward_std": 0.15838100016117096, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 763.6875, "completions/mean_terminated_length": 736.7586059570312, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 3.015432098765432, "grad_norm": 0.7898439743118666, "kl": 0.2899169921875, "learning_rate": 1.8019941587274565e-07, "loss": 0.002, "num_tokens": 28633331.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 685.71875, "completions/mean_terminated_length": 637.3928833007812, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.0185185185185186, "grad_norm": 1.1971246125556536, "kl": 0.3021240234375, "learning_rate": 1.7971950185638195e-07, "loss": -0.0207, "num_tokens": 28661834.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 767.90625, "completions/mean_terminated_length": 667.6956787109375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 3.021604938271605, "grad_norm": 0.87021317675508, "kl": 0.2828369140625, "learning_rate": 1.7923986888902948e-07, "loss": -0.0007, "num_tokens": 28693275.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 756.78125, "completions/mean_terminated_length": 681.9599609375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 3.0246913580246915, "grad_norm": 0.6666076971217348, "kl": 0.2564697265625, "learning_rate": 1.78760518888722e-07, "loss": 0.0, "num_tokens": 28724056.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 759.34375, "completions/mean_terminated_length": 671.125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 3.0277777777777777, "grad_norm": 0.922703690793308, "kl": 0.2677001953125, "learning_rate": 1.782814537723617e-07, "loss": -0.0329, "num_tokens": 28755035.0, "reward": -1.862645149230957e-09, "reward_std": 0.15909674763679504, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 751.21875, "completions/mean_terminated_length": 644.478271484375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.0308641975308643, "grad_norm": 0.90574835232403, "kl": 0.3001708984375, "learning_rate": 1.7780267545571175e-07, "loss": -0.0318, "num_tokens": 28785762.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 681.28125, "completions/mean_terminated_length": 632.3214721679688, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 3.0339506172839505, "grad_norm": 1.2363156639509054, "kl": 0.2816162109375, "learning_rate": 1.7732418585338804e-07, "loss": -0.0402, "num_tokens": 28814039.0, "reward": -4.656612873077393e-10, "reward_std": 0.2396731674671173, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 747.96875, "completions/mean_terminated_length": 696.8518676757812, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.037037037037037, "grad_norm": 1.0725872098852478, "kl": 0.249755859375, "learning_rate": 1.7684598687885216e-07, "loss": 0.0058, "num_tokens": 28844458.0, "reward": 0.0, "reward_std": 0.1590951681137085, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 760.75, "completions/mean_terminated_length": 712.0, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 3.0401234567901234, "grad_norm": 1.5867530014474474, "kl": 0.236328125, "learning_rate": 1.7636808044440344e-07, "loss": 0.0638, "num_tokens": 28875774.0, "reward": -8.847564458847046e-09, "reward_std": 0.1862604022026062, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -2.7939677238464355e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 739.5, "completions/mean_terminated_length": 673.84619140625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.04320987654321, "grad_norm": 0.9447105016801203, "kl": 0.291259765625, "learning_rate": 1.7589046846117132e-07, "loss": -0.0011, "num_tokens": 28906302.0, "reward": -3.725290298461914e-09, "reward_std": 0.15887358784675598, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 771.25, "completions/mean_terminated_length": 700.47998046875, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.0462962962962963, "grad_norm": 1.5922069554784521, "kl": 0.2728271484375, "learning_rate": 1.754131528391078e-07, "loss": 0.0127, "num_tokens": 28938262.0, "reward": -3.725290298461914e-09, "reward_std": 0.25036370754241943, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 745.53125, "completions/mean_terminated_length": 652.7083740234375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 3.049382716049383, "grad_norm": 0.8559104930401311, "kl": 0.2789306640625, "learning_rate": 1.7493613548697966e-07, "loss": -0.0055, "num_tokens": 28968691.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 770.84375, "completions/mean_terminated_length": 712.423095703125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 3.052469135802469, "grad_norm": 1.2016564202779059, "kl": 0.2301025390625, "learning_rate": 1.744594183123611e-07, "loss": -0.0666, "num_tokens": 29000138.0, "reward": 0.0, "reward_std": 0.25610116124153137, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 668.375, "completions/mean_terminated_length": 668.375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 3.0555555555555554, "grad_norm": 1.2034066140857234, "kl": 0.276611328125, "learning_rate": 1.7398300322162563e-07, "loss": 0.0164, "num_tokens": 29028274.0, "reward": 0.0, "reward_std": 0.1694464087486267, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 778.1875, "completions/mean_terminated_length": 666.45458984375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 3.058641975308642, "grad_norm": 1.3762028459638855, "kl": 0.263671875, "learning_rate": 1.7350689211993902e-07, "loss": 0.0073, "num_tokens": 29060096.0, "reward": 0.0, "reward_std": 0.18888147175312042, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 685.09375, "completions/mean_terminated_length": 650.0344848632812, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 3.0617283950617282, "grad_norm": 1.2450297203744591, "kl": 0.240234375, "learning_rate": 1.7303108691125107e-07, "loss": -0.0955, "num_tokens": 29088315.0, "reward": 1.862645149230957e-09, "reward_std": 0.18684130907058716, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 720.5, "completions/mean_terminated_length": 664.2963256835938, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 3.064814814814815, "grad_norm": 0.5146764469695151, "kl": 0.2337646484375, "learning_rate": 1.725555894982887e-07, "loss": 0.0184, "num_tokens": 29118059.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 738.625, "completions/mean_terminated_length": 685.7777709960938, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.067901234567901, "grad_norm": 4.2349552915993485, "kl": 0.2689208984375, "learning_rate": 1.7208040178254768e-07, "loss": -0.1757, "num_tokens": 29148259.0, "reward": -3.725290298461914e-09, "reward_std": 0.19500862061977386, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 663.65625, "completions/mean_terminated_length": 652.0322265625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 3.0709876543209877, "grad_norm": 0.6132065135896924, "kl": 0.271728515625, "learning_rate": 1.716055256642855e-07, "loss": -0.0186, "num_tokens": 29176424.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 676.09375, "completions/mean_terminated_length": 640.1034545898438, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 3.074074074074074, "grad_norm": 2.8841175768517346, "kl": 0.2662353515625, "learning_rate": 1.711309630425135e-07, "loss": -0.2248, "num_tokens": 29204171.0, "reward": -1.862645149230957e-09, "reward_std": 0.15646925568580627, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 700.375, "completions/mean_terminated_length": 666.8965454101562, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 3.0771604938271606, "grad_norm": 0.9452856128272616, "kl": 0.2845458984375, "learning_rate": 1.7065671581498936e-07, "loss": -0.0021, "num_tokens": 29232831.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 714.59375, "completions/mean_terminated_length": 657.2963256835938, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 3.080246913580247, "grad_norm": 0.8216890799478307, "kl": 0.251953125, "learning_rate": 1.701827858782095e-07, "loss": 0.0576, "num_tokens": 29261990.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 681.90625, "completions/mean_terminated_length": 659.1000366210938, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 3.0833333333333335, "grad_norm": 0.4945412077311694, "kl": 0.254638671875, "learning_rate": 1.697091751274016e-07, "loss": 0.0264, "num_tokens": 29290151.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 725.59375, "completions/mean_terminated_length": 682.9642944335938, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 3.0864197530864197, "grad_norm": 1.3606006021037704, "kl": 0.23291015625, "learning_rate": 1.6923588545651672e-07, "loss": 0.0241, "num_tokens": 29320146.0, "reward": 1.862645149230957e-09, "reward_std": 0.14860975742340088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 694.65625, "completions/mean_terminated_length": 647.607177734375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.0895061728395063, "grad_norm": 1.290694181029074, "kl": 0.2579345703125, "learning_rate": 1.687629187582221e-07, "loss": -0.043, "num_tokens": 29348679.0, "reward": 1.862645149230957e-09, "reward_std": 0.21570174396038055, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 704.625, "completions/mean_terminated_length": 645.4815063476562, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 3.0925925925925926, "grad_norm": 1.30463801788802, "kl": 0.2574462890625, "learning_rate": 1.6829027692389343e-07, "loss": 0.1353, "num_tokens": 29377927.0, "reward": 0.0, "reward_std": 0.13204941153526306, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 718.0625, "completions/mean_terminated_length": 674.357177734375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 3.095679012345679, "grad_norm": 1.762064383114544, "kl": 0.251708984375, "learning_rate": 1.678179618436073e-07, "loss": -0.1035, "num_tokens": 29407613.0, "reward": -1.862645149230957e-09, "reward_std": 0.2035190612077713, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 763.21875, "completions/mean_terminated_length": 676.2916870117188, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 3.0987654320987654, "grad_norm": 1.394911026973124, "kl": 0.256103515625, "learning_rate": 1.6734597540613344e-07, "loss": -0.005, "num_tokens": 29438844.0, "reward": -1.862645149230957e-09, "reward_std": 0.18761375546455383, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 723.34375, "completions/mean_terminated_length": 639.1599731445312, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 3.1018518518518516, "grad_norm": 1.8197181485250147, "kl": 0.3023681640625, "learning_rate": 1.6687431949892753e-07, "loss": 0.0472, "num_tokens": 29468407.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 667.96875, "completions/mean_terminated_length": 644.2333374023438, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 3.1049382716049383, "grad_norm": 1.3329960123541742, "kl": 0.281494140625, "learning_rate": 1.664029960081234e-07, "loss": -0.028, "num_tokens": 29496134.0, "reward": 0.0, "reward_std": 0.19164466857910156, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 699.125, "completions/mean_terminated_length": 638.9629516601562, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 3.1080246913580245, "grad_norm": 0.008444538809175298, "kl": 0.248779296875, "learning_rate": 1.6593200681852574e-07, "loss": 0.0002, "num_tokens": 29524710.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 734.9375, "completions/mean_terminated_length": 693.6428833007812, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 3.111111111111111, "grad_norm": 1.2221476308156387, "kl": 0.24755859375, "learning_rate": 1.6546135381360194e-07, "loss": -0.0128, "num_tokens": 29554292.0, "reward": -1.862645149230957e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 778.40625, "completions/mean_terminated_length": 709.6400146484375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.1141975308641974, "grad_norm": 1.0889000163622438, "kl": 0.2532958984375, "learning_rate": 1.6499103887547544e-07, "loss": 0.0024, "num_tokens": 29585645.0, "reward": 0.0, "reward_std": 0.1917317807674408, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 705.53125, "completions/mean_terminated_length": 672.586181640625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 3.117283950617284, "grad_norm": 0.5614509907699836, "kl": 0.2498779296875, "learning_rate": 1.6452106388491762e-07, "loss": 0.0144, "num_tokens": 29614442.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 751.28125, "completions/mean_terminated_length": 688.34619140625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 3.1203703703703702, "grad_norm": 0.6057075822312525, "kl": 0.2852783203125, "learning_rate": 1.6405143072134031e-07, "loss": 0.0124, "num_tokens": 29644767.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 764.0625, "completions/mean_terminated_length": 726.9285888671875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 3.123456790123457, "grad_norm": 0.46899477512584786, "kl": 0.271240234375, "learning_rate": 1.6358214126278855e-07, "loss": 0.0027, "num_tokens": 29675629.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 778.25, "completions/mean_terminated_length": 682.0869750976562, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 3.126543209876543, "grad_norm": 1.1254396274309257, "kl": 0.2421875, "learning_rate": 1.6311319738593281e-07, "loss": -0.035, "num_tokens": 29707209.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 779.9375, "completions/mean_terminated_length": 698.5833740234375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.1296296296296298, "grad_norm": 1.003660901667382, "kl": 0.24365234375, "learning_rate": 1.6264460096606169e-07, "loss": -0.0082, "num_tokens": 29738651.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 763.875, "completions/mean_terminated_length": 715.7037353515625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.132716049382716, "grad_norm": 1.328120572842891, "kl": 0.2237548828125, "learning_rate": 1.621763538770743e-07, "loss": -0.0924, "num_tokens": 29769803.0, "reward": 1.862645149230957e-09, "reward_std": 0.20375239849090576, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 770.71875, "completions/mean_terminated_length": 723.8148193359375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.1358024691358026, "grad_norm": 1.0646566319626596, "kl": 0.278564453125, "learning_rate": 1.6170845799147266e-07, "loss": 0.0081, "num_tokens": 29800950.0, "reward": 0.0, "reward_std": 0.13428640365600586, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 735.21875, "completions/mean_terminated_length": 693.9642944335938, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.138888888888889, "grad_norm": 0.9780147807978669, "kl": 0.2899169921875, "learning_rate": 1.6124091518035443e-07, "loss": -0.0277, "num_tokens": 29830617.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 688.4375, "completions/mean_terminated_length": 653.72412109375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 3.1419753086419755, "grad_norm": 0.023944795044643093, "kl": 0.2646484375, "learning_rate": 1.607737273134054e-07, "loss": 0.0003, "num_tokens": 29858863.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 757.75, "completions/mean_terminated_length": 696.3077392578125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.1450617283950617, "grad_norm": 1.3792267988657705, "kl": 0.251708984375, "learning_rate": 1.603068962588918e-07, "loss": -0.0015, "num_tokens": 29889463.0, "reward": -1.862645149230957e-09, "reward_std": 0.22755002975463867, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 799.0, "completions/mean_terminated_length": 724.0, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.148148148148148, "grad_norm": 0.8595002612418882, "kl": 0.28173828125, "learning_rate": 1.598404238836532e-07, "loss": -0.0384, "num_tokens": 29921719.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 788.25, "completions/mean_terminated_length": 733.84619140625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.1512345679012346, "grad_norm": 1.1107760097980046, "kl": 0.221923828125, "learning_rate": 1.5937431205309465e-07, "loss": -0.045, "num_tokens": 29953707.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 742.46875, "completions/mean_terminated_length": 702.2500610351562, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 3.154320987654321, "grad_norm": 0.793439979172092, "kl": 0.2550048828125, "learning_rate": 1.589085626311795e-07, "loss": -0.0012, "num_tokens": 29983438.0, "reward": 3.725290298461914e-09, "reward_std": 0.1385486125946045, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 690.375, "completions/mean_terminated_length": 628.5925903320312, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 3.1574074074074074, "grad_norm": 1.3473054361293963, "kl": 0.27783203125, "learning_rate": 1.5844317748042167e-07, "loss": -0.0377, "num_tokens": 30012114.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 745.78125, "completions/mean_terminated_length": 717.0, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.1604938271604937, "grad_norm": 1.5521047207589356, "kl": 0.236572265625, "learning_rate": 1.5797815846187868e-07, "loss": -0.0563, "num_tokens": 30042235.0, "reward": -1.862645149230957e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 757.125, "completions/mean_terminated_length": 707.7037353515625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.1635802469135803, "grad_norm": 0.5039402374992891, "kl": 0.2437744140625, "learning_rate": 1.575135074351435e-07, "loss": 0.0066, "num_tokens": 30073135.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 744.78125, "completions/mean_terminated_length": 693.0740966796875, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.1666666666666665, "grad_norm": 1.268203203867581, "kl": 0.281005859375, "learning_rate": 1.5704922625833784e-07, "loss": -0.0174, "num_tokens": 30103584.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 668.625, "completions/mean_terminated_length": 644.933349609375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.169753086419753, "grad_norm": 1.0867712934423577, "kl": 0.2960205078125, "learning_rate": 1.565853167881042e-07, "loss": 0.0045, "num_tokens": 30131120.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 759.59375, "completions/mean_terminated_length": 698.5769653320312, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 3.1728395061728394, "grad_norm": 1.0793686524716646, "kl": 0.2525634765625, "learning_rate": 1.5612178087959887e-07, "loss": 0.0003, "num_tokens": 30161587.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 743.65625, "completions/mean_terminated_length": 703.607177734375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 3.175925925925926, "grad_norm": 1.1451486257740306, "kl": 0.2615966796875, "learning_rate": 1.556586203864841e-07, "loss": 0.0223, "num_tokens": 30191852.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 689.46875, "completions/mean_terminated_length": 627.5184936523438, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 3.1790123456790123, "grad_norm": 0.009643798735085194, "kl": 0.255615234375, "learning_rate": 1.5519583716092077e-07, "loss": 0.0003, "num_tokens": 30220263.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 764.40625, "completions/mean_terminated_length": 677.875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 3.182098765432099, "grad_norm": 1.4809011453828196, "kl": 0.2730712890625, "learning_rate": 1.5473343305356136e-07, "loss": 0.0179, "num_tokens": 30251648.0, "reward": 0.0, "reward_std": 0.19228190183639526, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 726.90625, "completions/mean_terminated_length": 643.719970703125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 3.185185185185185, "grad_norm": 2.3859976342186706, "kl": 0.2730712890625, "learning_rate": 1.5427140991354215e-07, "loss": 0.0577, "num_tokens": 30281337.0, "reward": -9.313225746154785e-10, "reward_std": 0.23513615131378174, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 707.78125, "completions/mean_terminated_length": 686.7000122070312, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 3.1882716049382718, "grad_norm": 2.2456635856547305, "kl": 0.28466796875, "learning_rate": 1.5380976958847572e-07, "loss": -0.2009, "num_tokens": 30310466.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 730.6875, "completions/mean_terminated_length": 688.7857666015625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.191358024691358, "grad_norm": 0.9617662099963058, "kl": 0.2615966796875, "learning_rate": 1.5334851392444412e-07, "loss": 0.0187, "num_tokens": 30340316.0, "reward": 0.0, "reward_std": 0.15636713802814484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 761.5625, "completions/mean_terminated_length": 674.0833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 3.1944444444444446, "grad_norm": 1.057851202950758, "kl": 0.2554931640625, "learning_rate": 1.5288764476599102e-07, "loss": 0.0037, "num_tokens": 30370982.0, "reward": 0.0, "reward_std": 0.15223759412765503, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 646.375, "completions/mean_terminated_length": 646.375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 3.197530864197531, "grad_norm": 1.311554180537854, "kl": 0.250244140625, "learning_rate": 1.524271639561145e-07, "loss": 0.0346, "num_tokens": 30397526.0, "reward": -2.7939677238464355e-09, "reward_std": 0.13557207584381104, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 691.1875, "completions/mean_terminated_length": 656.7586059570312, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 3.200617283950617, "grad_norm": 1.0566047366456837, "kl": 0.2537841796875, "learning_rate": 1.5196707333625959e-07, "loss": 0.0182, "num_tokens": 30425804.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 734.28125, "completions/mean_terminated_length": 653.1599731445312, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 3.2037037037037037, "grad_norm": 2.6196208044284663, "kl": 0.2647705078125, "learning_rate": 1.5150737474631092e-07, "loss": -0.2351, "num_tokens": 30455721.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 767.53125, "completions/mean_terminated_length": 708.34619140625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 3.20679012345679, "grad_norm": 0.664531621539753, "kl": 0.25732421875, "learning_rate": 1.5104807002458564e-07, "loss": -0.008, "num_tokens": 30487166.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 793.78125, "completions/mean_terminated_length": 729.3200073242188, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 3.2098765432098766, "grad_norm": 0.0431546602367175, "kl": 0.2581787109375, "learning_rate": 1.5058916100782555e-07, "loss": 0.0003, "num_tokens": 30519015.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 789.6875, "completions/mean_terminated_length": 735.6154174804688, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 3.212962962962963, "grad_norm": 0.8377817886949486, "kl": 0.25732421875, "learning_rate": 1.5013064953119036e-07, "loss": -0.0042, "num_tokens": 30550509.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 747.375, "completions/mean_terminated_length": 655.1666870117188, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 3.2160493827160495, "grad_norm": 2.215482458994797, "kl": 0.254638671875, "learning_rate": 1.4967253742824962e-07, "loss": -0.1136, "num_tokens": 30581609.0, "reward": 1.862645149230957e-09, "reward_std": 0.18394683301448822, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 747.625, "completions/mean_terminated_length": 683.84619140625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 3.2191358024691357, "grad_norm": 0.9499102220498177, "kl": 0.2445068359375, "learning_rate": 1.4921482653097614e-07, "loss": -0.0531, "num_tokens": 30612329.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 768.375, "completions/mean_terminated_length": 709.3846435546875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 3.2222222222222223, "grad_norm": 0.5653222476164992, "kl": 0.260498046875, "learning_rate": 1.487575186697381e-07, "loss": -0.0054, "num_tokens": 30643305.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 732.28125, "completions/mean_terminated_length": 678.25927734375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 3.2253086419753085, "grad_norm": 1.9291025282518113, "kl": 0.2587890625, "learning_rate": 1.4830061567329223e-07, "loss": 0.0277, "num_tokens": 30673390.0, "reward": 0.0, "reward_std": 0.25847262144088745, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 776.1875, "completions/mean_terminated_length": 646.3809814453125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 3.228395061728395, "grad_norm": 0.027935652357127107, "kl": 0.271484375, "learning_rate": 1.4784411936877596e-07, "loss": 0.0003, "num_tokens": 30704996.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 695.0, "completions/mean_terminated_length": 634.0740966796875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 3.2314814814814814, "grad_norm": 1.0983382773162031, "kl": 0.272705078125, "learning_rate": 1.4738803158170043e-07, "loss": 0.0155, "num_tokens": 30733664.0, "reward": 9.313225746154785e-10, "reward_std": 0.1455036699771881, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 732.1875, "completions/mean_terminated_length": 712.7333984375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 3.234567901234568, "grad_norm": 1.0642464302132313, "kl": 0.24951171875, "learning_rate": 1.469323541359433e-07, "loss": 0.0133, "num_tokens": 30763654.0, "reward": 3.725290298461914e-09, "reward_std": 0.15508639812469482, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 744.46875, "completions/mean_terminated_length": 704.5357666015625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.2376543209876543, "grad_norm": 0.879023680963078, "kl": 0.2646484375, "learning_rate": 1.4647708885374105e-07, "loss": 0.0084, "num_tokens": 30793989.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 722.90625, "completions/mean_terminated_length": 691.7586059570312, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 3.240740740740741, "grad_norm": 0.8895204420052543, "kl": 0.2540283203125, "learning_rate": 1.4602223755568212e-07, "loss": 0.0001, "num_tokens": 30823990.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 753.15625, "completions/mean_terminated_length": 690.6538696289062, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 3.243827160493827, "grad_norm": 0.9153946029916924, "kl": 0.228515625, "learning_rate": 1.4556780206069925e-07, "loss": 0.0087, "num_tokens": 30855143.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 729.4375, "completions/mean_terminated_length": 661.4615478515625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 3.246913580246914, "grad_norm": 1.1455629101525655, "kl": 0.2523193359375, "learning_rate": 1.4511378418606272e-07, "loss": -0.0103, "num_tokens": 30884541.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 757.21875, "completions/mean_terminated_length": 695.6538696289062, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 3.25, "grad_norm": 0.7353234991734671, "kl": 0.251220703125, "learning_rate": 1.4466018574737236e-07, "loss": 0.0218, "num_tokens": 30915308.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 787.3125, "completions/mean_terminated_length": 753.5000610351562, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 3.253086419753086, "grad_norm": 0.4477869406085615, "kl": 0.245849609375, "learning_rate": 1.4420700855855093e-07, "loss": -0.0192, "num_tokens": 30947126.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 757.90625, "completions/mean_terminated_length": 696.5, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 3.256172839506173, "grad_norm": 0.8646740435234543, "kl": 0.23583984375, "learning_rate": 1.4375425443183675e-07, "loss": 0.0188, "num_tokens": 30977967.0, "reward": 0.0, "reward_std": 0.13030678033828735, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 782.125, "completions/mean_terminated_length": 726.3077392578125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 3.259259259259259, "grad_norm": 1.1082173590712923, "kl": 0.3033447265625, "learning_rate": 1.43301925177776e-07, "loss": -0.0765, "num_tokens": 31009255.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 733.75, "completions/mean_terminated_length": 637.0, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 3.2623456790123457, "grad_norm": 0.010185563589904368, "kl": 0.2779541015625, "learning_rate": 1.4285002260521617e-07, "loss": 0.0003, "num_tokens": 31039171.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 753.09375, "completions/mean_terminated_length": 647.0869750976562, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.265432098765432, "grad_norm": 0.6807906915146793, "kl": 0.336669921875, "learning_rate": 1.4239854852129807e-07, "loss": -0.0032, "num_tokens": 31069702.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 767.71875, "completions/mean_terminated_length": 682.2916870117188, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.2685185185185186, "grad_norm": 1.1954869638403682, "kl": 0.26123046875, "learning_rate": 1.419475047314493e-07, "loss": -0.0103, "num_tokens": 31100769.0, "reward": 0.0, "reward_std": 0.18885569274425507, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 728.09375, "completions/mean_terminated_length": 708.36669921875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.271604938271605, "grad_norm": 2.2188825714435043, "kl": 0.266357421875, "learning_rate": 1.4149689303937662e-07, "loss": -0.2275, "num_tokens": 31130304.0, "reward": 0.0, "reward_std": 0.15134452283382416, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 722.875, "completions/mean_terminated_length": 679.857177734375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 3.2746913580246915, "grad_norm": 0.981641484247448, "kl": 0.22900390625, "learning_rate": 1.4104671524705892e-07, "loss": -0.0627, "num_tokens": 31159812.0, "reward": -1.862645149230957e-09, "reward_std": 0.14842106401920319, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 750.34375, "completions/mean_terminated_length": 687.1923217773438, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 3.2777777777777777, "grad_norm": 0.7865097763068806, "kl": 0.2552490234375, "learning_rate": 1.4059697315473988e-07, "loss": 0.0411, "num_tokens": 31190075.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 725.53125, "completions/mean_terminated_length": 670.25927734375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 3.2808641975308643, "grad_norm": 1.5871852156633488, "kl": 0.2781982421875, "learning_rate": 1.4014766856092081e-07, "loss": -0.0573, "num_tokens": 31219832.0, "reward": -2.561137080192566e-09, "reward_std": 0.18082474172115326, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 773.90625, "completions/mean_terminated_length": 703.8800048828125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 3.2839506172839505, "grad_norm": 0.7425942490096938, "kl": 0.24951171875, "learning_rate": 1.3969880326235362e-07, "loss": -0.0105, "num_tokens": 31251409.0, "reward": 0.0, "reward_std": 0.12664207816123962, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 763.59375, "completions/mean_terminated_length": 726.3928833007812, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 3.287037037037037, "grad_norm": 1.2067555235365977, "kl": 0.2589111328125, "learning_rate": 1.3925037905403324e-07, "loss": -0.0674, "num_tokens": 31282240.0, "reward": 0.0, "reward_std": 0.1882191002368927, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 795.65625, "completions/mean_terminated_length": 706.3043823242188, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 3.2901234567901234, "grad_norm": 1.299680625758312, "kl": 0.2777099609375, "learning_rate": 1.38802397729191e-07, "loss": 0.0594, "num_tokens": 31314637.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 750.5625, "completions/mean_terminated_length": 711.5000610351562, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 3.29320987654321, "grad_norm": 1.2129723717473224, "kl": 0.2320556640625, "learning_rate": 1.3835486107928678e-07, "loss": -0.0404, "num_tokens": 31345275.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 726.25, "completions/mean_terminated_length": 695.4483032226562, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 3.2962962962962963, "grad_norm": 0.6936382173321262, "kl": 0.268798828125, "learning_rate": 1.3790777089400262e-07, "loss": -0.0224, "num_tokens": 31374659.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 754.40625, "completions/mean_terminated_length": 678.9199829101562, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 3.299382716049383, "grad_norm": 1.4164979976340009, "kl": 0.2735595703125, "learning_rate": 1.3746112896123494e-07, "loss": -0.0427, "num_tokens": 31405896.0, "reward": -3.725290298461914e-09, "reward_std": 0.24048790335655212, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 811.53125, "completions/mean_terminated_length": 781.1785888671875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 3.302469135802469, "grad_norm": 0.011120476702207317, "kl": 0.247314453125, "learning_rate": 1.3701493706708768e-07, "loss": 0.0002, "num_tokens": 31439049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 767.59375, "completions/mean_terminated_length": 741.0689697265625, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 3.3055555555555554, "grad_norm": 0.483959515195949, "kl": 0.2911376953125, "learning_rate": 1.3656919699586503e-07, "loss": 0.0124, "num_tokens": 31469976.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 748.65625, "completions/mean_terminated_length": 685.1154174804688, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 3.308641975308642, "grad_norm": 0.7348895873578779, "kl": 0.248779296875, "learning_rate": 1.3612391053006446e-07, "loss": -0.042, "num_tokens": 31500493.0, "reward": 0.05624999850988388, "reward_std": 0.06495190411806107, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0625, "rewards/logprob_reward/std": 0.24593468010425568, "step": 1072 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 722.125, "completions/mean_terminated_length": 679.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 3.3117283950617282, "grad_norm": 1.1887001755579771, "kl": NaN, "learning_rate": 1.356790794503694e-07, "loss": -0.0204, "num_tokens": 31530165.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 825.3125, "completions/mean_terminated_length": 779.4615478515625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 3.314814814814815, "grad_norm": 0.012406080713913019, "kl": 0.2333984375, "learning_rate": 1.3523470553564238e-07, "loss": 0.0002, "num_tokens": 31563507.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 716.78125, "completions/mean_terminated_length": 672.8928833007812, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.317901234567901, "grad_norm": 0.5743044774276269, "kl": 0.2496337890625, "learning_rate": 1.3479079056291738e-07, "loss": 0.0087, "num_tokens": 31592920.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 788.0625, "completions/mean_terminated_length": 709.4166870117188, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 3.3209876543209877, "grad_norm": 3.6920513030189674, "kl": 0.2532958984375, "learning_rate": 1.3434733630739345e-07, "loss": -0.143, "num_tokens": 31625150.0, "reward": 0.0, "reward_std": 0.20147258043289185, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 729.25, "completions/mean_terminated_length": 687.1428833007812, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 3.324074074074074, "grad_norm": 0.7065504635632736, "kl": 0.25, "learning_rate": 1.3390434454242704e-07, "loss": 0.0049, "num_tokens": 31654834.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 746.3125, "completions/mean_terminated_length": 717.586181640625, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 3.3271604938271606, "grad_norm": 0.8346510215181216, "kl": 0.2330322265625, "learning_rate": 1.334618170395254e-07, "loss": -0.0133, "num_tokens": 31684944.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 707.4375, "completions/mean_terminated_length": 697.2257690429688, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 3.330246913580247, "grad_norm": 1.4942222054853591, "kl": 0.248291015625, "learning_rate": 1.3301975556833872e-07, "loss": -0.0873, "num_tokens": 31713582.0, "reward": -3.725290298461914e-09, "reward_std": 0.19293318688869476, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 822.84375, "completions/mean_terminated_length": 731.4091186523438, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 3.3333333333333335, "grad_norm": 1.0428453264332782, "kl": 0.2427978515625, "learning_rate": 1.3257816189665398e-07, "loss": -0.0245, "num_tokens": 31746273.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 821.84375, "completions/mean_terminated_length": 754.4583740234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.3364197530864197, "grad_norm": 1.3184160835714214, "kl": 0.22412109375, "learning_rate": 1.3213703779038726e-07, "loss": 0.0191, "num_tokens": 31779844.0, "reward": 0.0, "reward_std": 0.15904246270656586, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 742.28125, "completions/mean_terminated_length": 702.0357666015625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 3.3395061728395063, "grad_norm": 0.7748820346284045, "kl": 0.269287109375, "learning_rate": 1.3169638501357697e-07, "loss": 0.0289, "num_tokens": 31809781.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 763.96875, "completions/mean_terminated_length": 703.9615478515625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.3425925925925926, "grad_norm": 1.555543362600614, "kl": 0.2437744140625, "learning_rate": 1.3125620532837667e-07, "loss": 0.0216, "num_tokens": 31840768.0, "reward": -3.725290298461914e-09, "reward_std": 0.2136804610490799, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 729.65625, "completions/mean_terminated_length": 675.1481323242188, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 3.3456790123456788, "grad_norm": 0.9813131603858801, "kl": 0.2388916015625, "learning_rate": 1.3081650049504784e-07, "loss": -0.0027, "num_tokens": 31869985.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 729.71875, "completions/mean_terminated_length": 647.3200073242188, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 3.3487654320987654, "grad_norm": 1.2093380139341139, "kl": 0.2509765625, "learning_rate": 1.3037727227195333e-07, "loss": -0.0696, "num_tokens": 31899804.0, "reward": 0.0, "reward_std": 0.15134452283382416, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 786.1875, "completions/mean_terminated_length": 752.2142944335938, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 3.351851851851852, "grad_norm": 0.6034368765472946, "kl": 0.259033203125, "learning_rate": 1.2993852241554986e-07, "loss": -0.0151, "num_tokens": 31931686.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 701.8125, "completions/mean_terminated_length": 655.7857666015625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.3549382716049383, "grad_norm": 1.227791919392234, "kl": 0.24609375, "learning_rate": 1.295002526803813e-07, "loss": -0.1123, "num_tokens": 31960116.0, "reward": -1.862645149230957e-09, "reward_std": 0.14231424033641815, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 793.90625, "completions/mean_terminated_length": 703.8695678710938, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 3.3580246913580245, "grad_norm": 1.1499498151806773, "kl": 0.2586669921875, "learning_rate": 1.2906246481907145e-07, "loss": -0.0692, "num_tokens": 31992133.0, "reward": -3.725290298461914e-09, "reward_std": 0.13885828852653503, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 754.0, "completions/mean_terminated_length": 678.3999633789062, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 3.361111111111111, "grad_norm": 0.4967767734834555, "kl": 0.25, "learning_rate": 1.2862516058231718e-07, "loss": 0.0055, "num_tokens": 32022453.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 709.625, "completions/mean_terminated_length": 664.7142944335938, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 3.3641975308641974, "grad_norm": 0.5488839834782302, "kl": 0.2513427734375, "learning_rate": 1.2818834171888136e-07, "loss": 0.0125, "num_tokens": 32051645.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 783.53125, "completions/mean_terminated_length": 728.0385131835938, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.367283950617284, "grad_norm": 1.1988321545343161, "kl": 0.283935546875, "learning_rate": 1.277520099755857e-07, "loss": 0.0258, "num_tokens": 32083502.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 753.46875, "completions/mean_terminated_length": 714.8214721679688, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 3.3703703703703702, "grad_norm": 0.7860866149746452, "kl": 0.241943359375, "learning_rate": 1.2731616709730428e-07, "loss": 0.0138, "num_tokens": 32114093.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 755.46875, "completions/mean_terminated_length": 693.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 3.373456790123457, "grad_norm": 1.9355067929149514, "kl": 0.2366943359375, "learning_rate": 1.2688081482695577e-07, "loss": 0.1288, "num_tokens": 32145040.0, "reward": 0.0, "reward_std": 0.15597835183143616, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 721.9375, "completions/mean_terminated_length": 701.800048828125, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.376543209876543, "grad_norm": 0.4790221526932161, "kl": 0.242431640625, "learning_rate": 1.264459549054973e-07, "loss": -0.018, "num_tokens": 32174690.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 801.25, "completions/mean_terminated_length": 727.0, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.3796296296296298, "grad_norm": 0.7093664604532016, "kl": 0.2525634765625, "learning_rate": 1.2601158907191696e-07, "loss": -0.0215, "num_tokens": 32207174.0, "reward": -4.656612873077393e-10, "reward_std": 0.12072179466485977, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 752.4375, "completions/mean_terminated_length": 702.1481323242188, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 3.382716049382716, "grad_norm": 0.16629034886974875, "kl": 0.26416015625, "learning_rate": 1.2557771906322704e-07, "loss": 0.0003, "num_tokens": 32237924.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 687.0, "completions/mean_terminated_length": 652.137939453125, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.3858024691358026, "grad_norm": 0.013630451196816433, "kl": 0.266845703125, "learning_rate": 1.2514434661445706e-07, "loss": 0.0003, "num_tokens": 32266056.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 788.3125, "completions/mean_terminated_length": 709.75, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.388888888888889, "grad_norm": 1.125990052255359, "kl": 0.2408447265625, "learning_rate": 1.2471147345864672e-07, "loss": -0.0432, "num_tokens": 32297386.0, "reward": -3.725290298461914e-09, "reward_std": 0.2222922444343567, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 763.09375, "completions/mean_terminated_length": 725.8214721679688, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 3.3919753086419755, "grad_norm": 0.9748479973086155, "kl": 0.253173828125, "learning_rate": 1.2427910132683928e-07, "loss": -0.069, "num_tokens": 32328577.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 705.9375, "completions/mean_terminated_length": 673.0344848632812, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 3.3950617283950617, "grad_norm": 1.1069650475836328, "kl": 0.2342529296875, "learning_rate": 1.2384723194807408e-07, "loss": 0.092, "num_tokens": 32357383.0, "reward": 3.725290298461914e-09, "reward_std": 0.1355731189250946, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 716.21875, "completions/mean_terminated_length": 659.2222290039062, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 3.398148148148148, "grad_norm": 1.0494479584682614, "kl": 0.273193359375, "learning_rate": 1.234158670493803e-07, "loss": 0.0144, "num_tokens": 32386702.0, "reward": 0.0, "reward_std": 0.188789963722229, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 757.96875, "completions/mean_terminated_length": 708.7037353515625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 3.4012345679012346, "grad_norm": 2.099566012880971, "kl": 0.2247314453125, "learning_rate": 1.229850083557695e-07, "loss": 0.0934, "num_tokens": 32418061.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 772.125, "completions/mean_terminated_length": 714.0, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 3.4043209876543212, "grad_norm": 1.4716119220150468, "kl": 0.2271728515625, "learning_rate": 1.2255465759022913e-07, "loss": -0.0753, "num_tokens": 32449585.0, "reward": 0.0, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 782.21875, "completions/mean_terminated_length": 714.5199584960938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.4074074074074074, "grad_norm": 0.5752351183998141, "kl": 0.271484375, "learning_rate": 1.2212481647371542e-07, "loss": -0.0129, "num_tokens": 32481344.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 767.3125, "completions/mean_terminated_length": 681.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 3.4104938271604937, "grad_norm": 0.017090939602922153, "kl": 0.2645263671875, "learning_rate": 1.2169548672514625e-07, "loss": 0.0003, "num_tokens": 32512458.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 788.15625, "completions/mean_terminated_length": 733.7307739257812, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 3.4135802469135803, "grad_norm": 0.9463338151541665, "kl": 0.24658203125, "learning_rate": 1.2126667006139495e-07, "loss": -0.007, "num_tokens": 32544003.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 787.59375, "completions/mean_terminated_length": 708.7916870117188, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 3.4166666666666665, "grad_norm": 0.7856020656526064, "kl": 0.2403564453125, "learning_rate": 1.208383681972829e-07, "loss": -0.0307, "num_tokens": 32576406.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 769.625, "completions/mean_terminated_length": 743.3103637695312, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 3.419753086419753, "grad_norm": 0.5092206644316002, "kl": 0.2269287109375, "learning_rate": 1.2041058284557277e-07, "loss": 0.0238, "num_tokens": 32607754.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 794.6875, "completions/mean_terminated_length": 730.47998046875, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 3.4228395061728394, "grad_norm": 1.3536454292536797, "kl": 0.2601318359375, "learning_rate": 1.1998331571696162e-07, "loss": -0.0313, "num_tokens": 32639904.0, "reward": -3.725290298461914e-09, "reward_std": 0.1885068714618683, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 736.65625, "completions/mean_terminated_length": 695.607177734375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 3.425925925925926, "grad_norm": 0.9080027433099632, "kl": 0.21630859375, "learning_rate": 1.1955656852007438e-07, "loss": 0.0082, "num_tokens": 32669957.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 740.71875, "completions/mean_terminated_length": 711.413818359375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 3.4290123456790123, "grad_norm": 1.20411101267003, "kl": 0.25634765625, "learning_rate": 1.1913034296145669e-07, "loss": -0.0067, "num_tokens": 32700268.0, "reward": 0.0, "reward_std": 0.14057862758636475, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 711.03125, "completions/mean_terminated_length": 653.0740966796875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 3.432098765432099, "grad_norm": 1.5756966519011866, "kl": 0.245361328125, "learning_rate": 1.1870464074556816e-07, "loss": -0.032, "num_tokens": 32729513.0, "reward": 0.0, "reward_std": 0.15422560274600983, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 729.375, "completions/mean_terminated_length": 698.8965454101562, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 3.435185185185185, "grad_norm": 1.3969578411745558, "kl": 0.2464599609375, "learning_rate": 1.1827946357477559e-07, "loss": -0.0242, "num_tokens": 32759557.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 743.25, "completions/mean_terminated_length": 691.25927734375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.4382716049382718, "grad_norm": 0.6396342570664947, "kl": 0.2535400390625, "learning_rate": 1.1785481314934618e-07, "loss": -0.027, "num_tokens": 32789885.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 775.25, "completions/mean_terminated_length": 692.3333740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.441358024691358, "grad_norm": 1.643829047253925, "kl": 0.2890625, "learning_rate": 1.1743069116744064e-07, "loss": 0.0054, "num_tokens": 32821453.0, "reward": 3.725290298461914e-09, "reward_std": 0.18929949402809143, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 775.375, "completions/mean_terminated_length": 678.0869750976562, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 3.4444444444444446, "grad_norm": 0.9569493190879544, "kl": 0.1943359375, "learning_rate": 1.1700709932510656e-07, "loss": -0.0106, "num_tokens": 32852733.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 760.75, "completions/mean_terminated_length": 700.0, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 3.447530864197531, "grad_norm": 2.4750548674904094, "kl": 0.412353515625, "learning_rate": 1.1658403931627125e-07, "loss": -0.0038, "num_tokens": 32884169.0, "reward": 0.0, "reward_std": 0.2040814757347107, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 713.65625, "completions/mean_terminated_length": 703.6451416015625, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 3.450617283950617, "grad_norm": 1.4756793797810057, "kl": 0.2459716796875, "learning_rate": 1.1616151283273565e-07, "loss": 0.031, "num_tokens": 32913474.0, "reward": -3.725290298461914e-09, "reward_std": 0.25264814496040344, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 817.4375, "completions/mean_terminated_length": 736.6087036132812, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 3.4537037037037037, "grad_norm": 1.2820093189019142, "kl": 0.240966796875, "learning_rate": 1.1573952156416672e-07, "loss": -0.0057, "num_tokens": 32946588.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 768.46875, "completions/mean_terminated_length": 683.2916870117188, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 3.45679012345679, "grad_norm": 1.1228896539965887, "kl": 0.2550048828125, "learning_rate": 1.1531806719809142e-07, "loss": 0.0296, "num_tokens": 32977639.0, "reward": -9.313225746154785e-10, "reward_std": 0.15779843926429749, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 769.625, "completions/mean_terminated_length": 743.3103637695312, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 3.4598765432098766, "grad_norm": 2.1765833178857745, "kl": 0.2271728515625, "learning_rate": 1.1489715141988954e-07, "loss": -0.1252, "num_tokens": 33008415.0, "reward": -2.3283064365386963e-09, "reward_std": 0.17488928139209747, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 745.46875, "completions/mean_terminated_length": 681.1923217773438, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 3.462962962962963, "grad_norm": 1.4780432846675047, "kl": 0.2430419921875, "learning_rate": 1.1447677591278715e-07, "loss": -0.0929, "num_tokens": 33038698.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 714.125, "completions/mean_terminated_length": 669.857177734375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 3.4660493827160495, "grad_norm": 0.9649833911831643, "kl": 0.26953125, "learning_rate": 1.1405694235784972e-07, "loss": -0.0136, "num_tokens": 33067386.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 784.9375, "completions/mean_terminated_length": 676.2727661132812, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.4691358024691357, "grad_norm": 1.0868646245244866, "kl": 0.250244140625, "learning_rate": 1.1363765243397555e-07, "loss": -0.0037, "num_tokens": 33099024.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 778.78125, "completions/mean_terminated_length": 697.0416870117188, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 3.4722222222222223, "grad_norm": 1.1089002532131813, "kl": 0.242919921875, "learning_rate": 1.1321890781788884e-07, "loss": -0.0167, "num_tokens": 33130281.0, "reward": -3.725290298461914e-09, "reward_std": 0.20304615795612335, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 736.125, "completions/mean_terminated_length": 682.8148193359375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 3.4753086419753085, "grad_norm": 1.5891742761815362, "kl": 0.2274169921875, "learning_rate": 1.1280071018413326e-07, "loss": 0.0249, "num_tokens": 33160353.0, "reward": 3.725290298461914e-09, "reward_std": 0.17188216745853424, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 733.40625, "completions/mean_terminated_length": 666.34619140625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 3.478395061728395, "grad_norm": 0.6745362013842914, "kl": 0.206787109375, "learning_rate": 1.1238306120506505e-07, "loss": -0.0042, "num_tokens": 33190170.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 806.8125, "completions/mean_terminated_length": 676.5, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.4814814814814814, "grad_norm": 1.0329730099114092, "kl": 0.2452392578125, "learning_rate": 1.1196596255084648e-07, "loss": 0.0906, "num_tokens": 33222672.0, "reward": 0.0, "reward_std": 0.1512327343225479, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 711.40625, "completions/mean_terminated_length": 623.8800048828125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.484567901234568, "grad_norm": 0.5740854501659524, "kl": 0.2684326171875, "learning_rate": 1.11549415889439e-07, "loss": 0.0049, "num_tokens": 33252173.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 737.84375, "completions/mean_terminated_length": 625.8695678710938, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 3.4876543209876543, "grad_norm": 1.4283067364278836, "kl": 0.2694091796875, "learning_rate": 1.1113342288659683e-07, "loss": 0.0238, "num_tokens": 33282448.0, "reward": 0.0, "reward_std": 0.22223833203315735, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 707.53125, "completions/mean_terminated_length": 674.7930908203125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 3.490740740740741, "grad_norm": 1.431448484274639, "kl": 0.2420654296875, "learning_rate": 1.1071798520585979e-07, "loss": -0.0524, "num_tokens": 33311209.0, "reward": -7.450580596923828e-09, "reward_std": 0.23882049322128296, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 770.71875, "completions/mean_terminated_length": 723.8148193359375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 3.493827160493827, "grad_norm": 0.6104158682632964, "kl": 0.2403564453125, "learning_rate": 1.1030310450854729e-07, "loss": 0.0239, "num_tokens": 33342608.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 753.46875, "completions/mean_terminated_length": 703.370361328125, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 3.496913580246914, "grad_norm": 1.4568945591861568, "kl": 0.2320556640625, "learning_rate": 1.0988878245375138e-07, "loss": -0.1062, "num_tokens": 33373091.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 753.25, "completions/mean_terminated_length": 703.1111450195312, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 3.5, "grad_norm": 0.012419470848946491, "kl": 0.2547607421875, "learning_rate": 1.094750206983299e-07, "loss": 0.0003, "num_tokens": 33403499.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 698.75, "completions/mean_terminated_length": 665.1034545898438, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.503086419753086, "grad_norm": 1.810029084612459, "kl": 0.2451171875, "learning_rate": 1.0906182089690025e-07, "loss": 0.0225, "num_tokens": 33431835.0, "reward": -3.725290298461914e-09, "reward_std": 0.20778873562812805, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 818.96875, "completions/mean_terminated_length": 771.6538696289062, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 3.506172839506173, "grad_norm": 0.732214077066824, "kl": 0.2076416015625, "learning_rate": 1.0864918470183258e-07, "loss": -0.0132, "num_tokens": 33464690.0, "reward": 0.0, "reward_std": 0.12621738016605377, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 732.65625, "completions/mean_terminated_length": 691.0357666015625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.5092592592592595, "grad_norm": 1.0566014922370506, "kl": 0.2276611328125, "learning_rate": 1.0823711376324313e-07, "loss": 0.0285, "num_tokens": 33494475.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 691.3125, "completions/mean_terminated_length": 656.8965454101562, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 3.5123456790123457, "grad_norm": 0.011678948078310923, "kl": 0.2423095703125, "learning_rate": 1.0782560972898783e-07, "loss": 0.0002, "num_tokens": 33523077.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 813.40625, "completions/mean_terminated_length": 764.8077392578125, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 3.515432098765432, "grad_norm": 1.4078196528096074, "kl": 0.2655029296875, "learning_rate": 1.0741467424465544e-07, "loss": -0.0697, "num_tokens": 33555682.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 779.8125, "completions/mean_terminated_length": 744.9285888671875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 3.5185185185185186, "grad_norm": 1.0780273497558357, "kl": 0.21923828125, "learning_rate": 1.0700430895356119e-07, "loss": 0.0089, "num_tokens": 33587388.0, "reward": -1.862645149230957e-09, "reward_std": 0.15889893472194672, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 770.8125, "completions/mean_terminated_length": 734.6428833007812, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.521604938271605, "grad_norm": 0.9076270681440747, "kl": 0.2550048828125, "learning_rate": 1.0659451549674018e-07, "loss": -0.0297, "num_tokens": 33618354.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1141 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 835.6875, "completions/mean_terminated_length": 750.0909423828125, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 3.5246913580246915, "grad_norm": 1.2267087063974567, "kl": NaN, "learning_rate": 1.0618529551294053e-07, "loss": 0.0709, "num_tokens": 33651484.0, "reward": 3.725290298461914e-09, "reward_std": 0.14866770803928375, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 757.9375, "completions/mean_terminated_length": 719.9285888671875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 3.5277777777777777, "grad_norm": 1.2997993879329892, "kl": 0.2279052734375, "learning_rate": 1.0577665063861735e-07, "loss": 0.0211, "num_tokens": 33681902.0, "reward": -3.725290298461914e-09, "reward_std": 0.17986111342906952, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 824.0625, "completions/mean_terminated_length": 745.8261108398438, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 3.5308641975308643, "grad_norm": 0.008235207322554906, "kl": 0.2332763671875, "learning_rate": 1.0536858250792582e-07, "loss": 0.0002, "num_tokens": 33715040.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 788.6875, "completions/mean_terminated_length": 745.1111450195312, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.5339506172839505, "grad_norm": 0.9345907340887593, "kl": 0.2178955078125, "learning_rate": 1.0496109275271456e-07, "loss": -0.0089, "num_tokens": 33746670.0, "reward": 0.0, "reward_std": 0.15422803163528442, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 799.125, "completions/mean_terminated_length": 711.1304321289062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 3.537037037037037, "grad_norm": 0.7607013714831201, "kl": 0.2451171875, "learning_rate": 1.0455418300251953e-07, "loss": -0.0066, "num_tokens": 33779046.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 764.53125, "completions/mean_terminated_length": 704.6538696289062, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 3.5401234567901234, "grad_norm": 1.7747619389516969, "kl": 0.226806640625, "learning_rate": 1.0414785488455718e-07, "loss": 0.0338, "num_tokens": 33810175.0, "reward": -1.1175870895385742e-08, "reward_std": 0.312080442905426, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -7.450580596923828e-09, "rewards/logprob_reward/std": 0.5080004930496216, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 687.46875, "completions/mean_terminated_length": 676.6128540039062, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 3.5432098765432096, "grad_norm": 0.792813190309275, "kl": 0.2557373046875, "learning_rate": 1.0374211002371808e-07, "loss": 0.0105, "num_tokens": 33838618.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 703.78125, "completions/mean_terminated_length": 658.0357666015625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 3.5462962962962963, "grad_norm": 1.266345128114868, "kl": 0.2593994140625, "learning_rate": 1.0333695004256035e-07, "loss": 0.0295, "num_tokens": 33867667.0, "reward": -4.132743924856186e-09, "reward_std": 0.13894928991794586, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -2.9103830456733704e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 765.25, "completions/mean_terminated_length": 717.3333129882812, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 3.549382716049383, "grad_norm": 1.3837416037283727, "kl": 0.2548828125, "learning_rate": 1.0293237656130304e-07, "loss": 0.0193, "num_tokens": 33899055.0, "reward": 0.0, "reward_std": 0.24888131022453308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 778.375, "completions/mean_terminated_length": 721.6923217773438, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 3.552469135802469, "grad_norm": 0.67763339123376, "kl": 0.2357177734375, "learning_rate": 1.0252839119782006e-07, "loss": -0.0031, "num_tokens": 33930415.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 780.0625, "completions/mean_terminated_length": 711.760009765625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 3.5555555555555554, "grad_norm": 1.2368525690198535, "kl": 0.2451171875, "learning_rate": 1.0212499556763335e-07, "loss": -0.0025, "num_tokens": 33962165.0, "reward": 0.0, "reward_std": 0.16599814593791962, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 737.84375, "completions/mean_terminated_length": 718.7667236328125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 3.558641975308642, "grad_norm": 1.1274774311721085, "kl": 0.258544921875, "learning_rate": 1.017221912839065e-07, "loss": -0.0458, "num_tokens": 33992228.0, "reward": 0.0, "reward_std": 0.16356289386749268, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 689.15625, "completions/mean_terminated_length": 654.5172119140625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 3.5617283950617287, "grad_norm": 1.6189419036848904, "kl": 0.2236328125, "learning_rate": 1.0131997995743838e-07, "loss": 0.0808, "num_tokens": 34020237.0, "reward": -3.725290298461914e-09, "reward_std": 0.21227142214775085, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 766.8125, "completions/mean_terminated_length": 719.1851806640625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.564814814814815, "grad_norm": 1.5469509316154286, "kl": 0.2310791015625, "learning_rate": 1.0091836319665664e-07, "loss": -0.0053, "num_tokens": 34051039.0, "reward": 0.0, "reward_std": 0.18308551609516144, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 790.6875, "completions/mean_terminated_length": 725.3599853515625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 3.567901234567901, "grad_norm": 1.6687135530647947, "kl": 0.2198486328125, "learning_rate": 1.0051734260761135e-07, "loss": 0.0836, "num_tokens": 34082773.0, "reward": 0.0, "reward_std": 0.21906724572181702, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 772.6875, "completions/mean_terminated_length": 726.1481323242188, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.5709876543209877, "grad_norm": 1.58301607500672, "kl": 0.2186279296875, "learning_rate": 1.0011691979396827e-07, "loss": -0.0523, "num_tokens": 34114351.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 795.5625, "completions/mean_terminated_length": 742.84619140625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.574074074074074, "grad_norm": 0.5842581224367376, "kl": 0.266357421875, "learning_rate": 9.971709635700301e-08, "loss": -0.0228, "num_tokens": 34146501.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 718.875, "completions/mean_terminated_length": 675.2857666015625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 3.5771604938271606, "grad_norm": 1.4512661129081734, "kl": 0.26611328125, "learning_rate": 9.931787389559393e-08, "loss": -0.042, "num_tokens": 34175725.0, "reward": 0.0, "reward_std": 0.15134452283382416, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 781.375, "completions/mean_terminated_length": 746.7142944335938, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 3.580246913580247, "grad_norm": 1.0641804795499572, "kl": 0.211181640625, "learning_rate": 9.891925400621642e-08, "loss": -0.0149, "num_tokens": 34207293.0, "reward": -9.313225746154785e-10, "reward_std": 0.15662670135498047, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 777.09375, "completions/mean_terminated_length": 720.1154174804688, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 3.5833333333333335, "grad_norm": 0.7428261939462992, "kl": 0.2603759765625, "learning_rate": 9.852123828293612e-08, "loss": -0.0141, "num_tokens": 34239224.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 781.03125, "completions/mean_terminated_length": 724.9615478515625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 3.5864197530864197, "grad_norm": 0.6274682781569767, "kl": 0.23388671875, "learning_rate": 9.812382831740259e-08, "loss": -0.0107, "num_tokens": 34271161.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 800.53125, "completions/mean_terminated_length": 759.1481323242188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.5895061728395063, "grad_norm": 2.40200813673398, "kl": 0.225341796875, "learning_rate": 9.772702569884301e-08, "loss": -0.1205, "num_tokens": 34303158.0, "reward": -3.725290298461914e-09, "reward_std": 0.19116097688674927, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 755.71875, "completions/mean_terminated_length": 650.7391357421875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 3.5925925925925926, "grad_norm": 30.56014032933971, "kl": 5.2987060546875, "learning_rate": 9.733083201405578e-08, "loss": -0.0124, "num_tokens": 34334141.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 807.5625, "completions/mean_terminated_length": 709.1818237304688, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.5956790123456788, "grad_norm": 1.4681208819627474, "kl": 0.254150390625, "learning_rate": 9.693524884740425e-08, "loss": -0.0296, "num_tokens": 34366283.0, "reward": 0.0, "reward_std": 0.2006455361843109, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 765.3125, "completions/mean_terminated_length": 692.8800048828125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.5987654320987654, "grad_norm": 1.6054973971152977, "kl": 0.2462158203125, "learning_rate": 9.654027778081042e-08, "loss": -0.0172, "num_tokens": 34397005.0, "reward": 0.0, "reward_std": 0.2863330543041229, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 702.59375, "completions/mean_terminated_length": 692.2257690429688, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 3.601851851851852, "grad_norm": 0.00985929988231579, "kl": 0.2557373046875, "learning_rate": 9.614592039374817e-08, "loss": 0.0003, "num_tokens": 34425944.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 724.9375, "completions/mean_terminated_length": 705.0000610351562, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 3.6049382716049383, "grad_norm": 0.5216442638898365, "kl": 0.2509765625, "learning_rate": 9.575217826323761e-08, "loss": 0.0341, "num_tokens": 34455470.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 743.6875, "completions/mean_terminated_length": 679.0, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.6080246913580245, "grad_norm": 0.7589034234898857, "kl": 0.2447509765625, "learning_rate": 9.535905296383848e-08, "loss": -0.0125, "num_tokens": 34485752.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 708.4375, "completions/mean_terminated_length": 687.4000244140625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 3.611111111111111, "grad_norm": 1.4331379030331497, "kl": 0.23388671875, "learning_rate": 9.496654606764373e-08, "loss": 0.0319, "num_tokens": 34514542.0, "reward": 0.0, "reward_std": 0.21092897653579712, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 751.125, "completions/mean_terminated_length": 700.5925903320312, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 3.6141975308641974, "grad_norm": 0.011370511822070762, "kl": 0.2593994140625, "learning_rate": 9.457465914427326e-08, "loss": 0.0003, "num_tokens": 34545034.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 744.25, "completions/mean_terminated_length": 725.6000366210938, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.617283950617284, "grad_norm": 1.3221539497629224, "kl": 0.234375, "learning_rate": 9.418339376086785e-08, "loss": -0.1029, "num_tokens": 34575358.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 842.5625, "completions/mean_terminated_length": 718.4210815429688, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.6203703703703702, "grad_norm": 0.9350894369204988, "kl": 0.23699951171875, "learning_rate": 9.379275148208276e-08, "loss": -0.0411, "num_tokens": 34609080.0, "reward": 0.0, "reward_std": 0.1437978297472, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 730.34375, "completions/mean_terminated_length": 688.3928833007812, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 3.623456790123457, "grad_norm": 1.396211505512646, "kl": 0.2403564453125, "learning_rate": 9.340273387008152e-08, "loss": 0.0083, "num_tokens": 34639011.0, "reward": 1.862645149230957e-09, "reward_std": 0.15331010520458221, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 755.6875, "completions/mean_terminated_length": 706.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 3.626543209876543, "grad_norm": 1.2901908717336628, "kl": 0.2454833984375, "learning_rate": 9.30133424845294e-08, "loss": -0.0054, "num_tokens": 34669657.0, "reward": 0.0, "reward_std": 0.17049743235111237, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 721.78125, "completions/mean_terminated_length": 690.5172119140625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 3.6296296296296298, "grad_norm": 1.2395269582660546, "kl": 0.2843017578125, "learning_rate": 9.26245788825877e-08, "loss": 0.0195, "num_tokens": 34698830.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 769.21875, "completions/mean_terminated_length": 722.0370483398438, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 3.632716049382716, "grad_norm": 2.0457587536858948, "kl": 0.2271728515625, "learning_rate": 9.223644461890711e-08, "loss": -0.0787, "num_tokens": 34729997.0, "reward": -3.725290298461914e-09, "reward_std": 0.21996617317199707, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 643.125, "completions/mean_terminated_length": 630.8386840820312, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 3.6358024691358026, "grad_norm": 1.0578775835202625, "kl": 0.265625, "learning_rate": 9.184894124562162e-08, "loss": 0.0145, "num_tokens": 34756577.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 696.03125, "completions/mean_terminated_length": 635.2963256835938, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 3.638888888888889, "grad_norm": 0.9553170792799089, "kl": 0.2421875, "learning_rate": 9.146207031234232e-08, "loss": 0.0113, "num_tokens": 34784818.0, "reward": 0.0, "reward_std": 0.15482844412326813, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 741.96875, "completions/mean_terminated_length": 701.6785888671875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 3.6419753086419755, "grad_norm": 1.3298136293147023, "kl": 0.283203125, "learning_rate": 9.107583336615124e-08, "loss": -0.0201, "num_tokens": 34814953.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 746.09375, "completions/mean_terminated_length": 681.9615478515625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 3.6450617283950617, "grad_norm": 0.01177057179996018, "kl": 0.22705078125, "learning_rate": 9.069023195159505e-08, "loss": 0.0002, "num_tokens": 34845648.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 689.7777709960938, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 3.648148148148148, "grad_norm": 0.008294800869948358, "kl": 0.2191162109375, "learning_rate": 9.030526761067911e-08, "loss": 0.0002, "num_tokens": 34875980.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 774.0, "completions/mean_terminated_length": 676.1739501953125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 3.6512345679012346, "grad_norm": 1.635013960586178, "kl": 0.2298583984375, "learning_rate": 8.992094188286081e-08, "loss": -0.0944, "num_tokens": 34907452.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 705.3125, "completions/mean_terminated_length": 672.3448486328125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 3.6543209876543212, "grad_norm": 0.007816248613254148, "kl": 0.2305908203125, "learning_rate": 8.953725630504419e-08, "loss": 0.0002, "num_tokens": 34935962.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 809.03125, "completions/mean_terminated_length": 711.3181762695312, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 3.6574074074074074, "grad_norm": 1.2313276526376022, "kl": 0.24072265625, "learning_rate": 8.915421241157292e-08, "loss": -0.053, "num_tokens": 34968731.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 807.1875, "completions/mean_terminated_length": 757.1538696289062, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 3.6604938271604937, "grad_norm": 1.1385373165629489, "kl": 0.25830078125, "learning_rate": 8.877181173422487e-08, "loss": -0.0018, "num_tokens": 35001105.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 774.34375, "completions/mean_terminated_length": 716.7307739257812, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 3.6635802469135803, "grad_norm": 1.30860574782242, "kl": 0.2275390625, "learning_rate": 8.839005580220574e-08, "loss": -0.0199, "num_tokens": 35032208.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 735.78125, "completions/mean_terminated_length": 682.4074096679688, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 3.6666666666666665, "grad_norm": 1.3666539395504609, "kl": 0.282470703125, "learning_rate": 8.800894614214274e-08, "loss": -0.0402, "num_tokens": 35062193.0, "reward": -9.313225746154785e-10, "reward_std": 0.14590159058570862, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.3969838619232178e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 720.40625, "completions/mean_terminated_length": 700.1666870117188, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 3.669753086419753, "grad_norm": 1.2104511750175704, "kl": 0.248779296875, "learning_rate": 8.762848427807882e-08, "loss": -0.0582, "num_tokens": 35091458.0, "reward": 4.656612873077393e-10, "reward_std": 0.17130933701992035, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 803.875, "completions/mean_terminated_length": 742.239990234375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 3.6728395061728394, "grad_norm": 0.7273489481754196, "kl": 0.2391357421875, "learning_rate": 8.724867173146633e-08, "loss": -0.0032, "num_tokens": 35123586.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 776.375, "completions/mean_terminated_length": 719.2307739257812, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 3.675925925925926, "grad_norm": 0.654432598917328, "kl": 0.27197265625, "learning_rate": 8.686951002116111e-08, "loss": 0.025, "num_tokens": 35154802.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 749.0625, "completions/mean_terminated_length": 709.7857666015625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.6790123456790123, "grad_norm": 0.5660220881502624, "kl": 0.25634765625, "learning_rate": 8.649100066341614e-08, "loss": 0.0234, "num_tokens": 35184896.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 778.84375, "completions/mean_terminated_length": 722.269287109375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 3.682098765432099, "grad_norm": 0.7583301748186921, "kl": 0.20263671875, "learning_rate": 8.611314517187584e-08, "loss": -0.0194, "num_tokens": 35216403.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 737.78125, "completions/mean_terminated_length": 671.7307739257812, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 3.685185185185185, "grad_norm": 1.406099040003787, "kl": 0.239013671875, "learning_rate": 8.573594505756982e-08, "loss": 0.0091, "num_tokens": 35246440.0, "reward": 0.0, "reward_std": 0.187990203499794, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 731.0625, "completions/mean_terminated_length": 700.7586059570312, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 3.6882716049382713, "grad_norm": 0.9753640422973019, "kl": 0.2176513671875, "learning_rate": 8.535940182890685e-08, "loss": 0.0288, "num_tokens": 35276530.0, "reward": 1.862645149230957e-09, "reward_std": 0.1585690975189209, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 842.25, "completions/mean_terminated_length": 733.2000122070312, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 3.691358024691358, "grad_norm": 0.8454736311530938, "kl": 0.216064453125, "learning_rate": 8.498351699166889e-08, "loss": -0.0311, "num_tokens": 35310050.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 790.59375, "completions/mean_terminated_length": 736.7307739257812, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 3.6944444444444446, "grad_norm": 0.692100861738555, "kl": 0.267333984375, "learning_rate": 8.460829204900483e-08, "loss": 0.0209, "num_tokens": 35341577.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 770.6875, "completions/mean_terminated_length": 723.7777709960938, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.697530864197531, "grad_norm": 0.8807008328975766, "kl": 0.2099609375, "learning_rate": 8.423372850142482e-08, "loss": -0.0115, "num_tokens": 35372319.0, "reward": 0.05624999850988388, "reward_std": 0.06495190411806107, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0625, "rewards/logprob_reward/std": 0.24593468010425568, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 786.75, "completions/mean_terminated_length": 707.6666870117188, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 3.700617283950617, "grad_norm": 0.9863560532915724, "kl": 0.23974609375, "learning_rate": 8.385982784679416e-08, "loss": -0.018, "num_tokens": 35404119.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 735.4375, "completions/mean_terminated_length": 668.84619140625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 3.7037037037037037, "grad_norm": 1.3594854957902904, "kl": 0.2215576171875, "learning_rate": 8.348659158032723e-08, "loss": -0.0005, "num_tokens": 35433557.0, "reward": 0.0, "reward_std": 0.2354062795639038, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 825.0625, "completions/mean_terminated_length": 758.75, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 3.7067901234567904, "grad_norm": 1.0290489160469656, "kl": 0.235107421875, "learning_rate": 8.311402119458138e-08, "loss": -0.0175, "num_tokens": 35467111.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 809.625, "completions/mean_terminated_length": 712.1818237304688, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.7098765432098766, "grad_norm": 0.9492204499282125, "kl": 0.250244140625, "learning_rate": 8.274211817945135e-08, "loss": -0.0568, "num_tokens": 35499867.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 791.46875, "completions/mean_terminated_length": 700.478271484375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 3.712962962962963, "grad_norm": 1.0787016926730821, "kl": 0.228759765625, "learning_rate": 8.237088402216297e-08, "loss": -0.0023, "num_tokens": 35531610.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 705.0, "completions/mean_terminated_length": 672.0, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 3.7160493827160495, "grad_norm": 1.6027175561626164, "kl": 0.2474365234375, "learning_rate": 8.20003202072674e-08, "loss": -0.0895, "num_tokens": 35560542.0, "reward": -3.259629011154175e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 754.53125, "completions/mean_terminated_length": 679.0799560546875, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 3.7191358024691357, "grad_norm": 0.918710188396746, "kl": 0.2266845703125, "learning_rate": 8.163042821663507e-08, "loss": 0.0214, "num_tokens": 35590707.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 805.4375, "completions/mean_terminated_length": 732.5833740234375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.7222222222222223, "grad_norm": 0.770015704172395, "kl": 0.24853515625, "learning_rate": 8.126120952944987e-08, "loss": -0.0007, "num_tokens": 35623469.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 800.46875, "completions/mean_terminated_length": 759.0740966796875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 3.7253086419753085, "grad_norm": 1.2081184592114798, "kl": 0.213134765625, "learning_rate": 8.089266562220312e-08, "loss": 0.0188, "num_tokens": 35656264.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 690.65625, "completions/mean_terminated_length": 643.0357666015625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.728395061728395, "grad_norm": 0.8269706640905476, "kl": 0.260498046875, "learning_rate": 8.052479796868784e-08, "loss": -0.0257, "num_tokens": 35684765.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 691.0370483398438, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 3.7314814814814814, "grad_norm": 1.2367759816629091, "kl": 0.25732421875, "learning_rate": 8.015760803999244e-08, "loss": -0.0473, "num_tokens": 35714971.0, "reward": 0.0, "reward_std": 0.15867450833320618, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 815.0, "completions/mean_terminated_length": 756.47998046875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.734567901234568, "grad_norm": 0.010125124854750943, "kl": 0.243896484375, "learning_rate": 7.979109730449552e-08, "loss": 0.0002, "num_tokens": 35748047.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 752.34375, "completions/mean_terminated_length": 724.2413940429688, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 3.7376543209876543, "grad_norm": 0.8217062489450325, "kl": 0.240966796875, "learning_rate": 7.942526722785927e-08, "loss": -0.0307, "num_tokens": 35778626.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 811.125, "completions/mean_terminated_length": 714.3636474609375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 3.7407407407407405, "grad_norm": 1.5405233136020489, "kl": 0.2342529296875, "learning_rate": 7.906011927302417e-08, "loss": -0.078, "num_tokens": 35811414.0, "reward": -3.725290298461914e-09, "reward_std": 0.2160208821296692, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096889972687, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 706.6875, "completions/mean_terminated_length": 661.357177734375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.743827160493827, "grad_norm": 1.0796459999583565, "kl": 0.2510986328125, "learning_rate": 7.869565490020288e-08, "loss": -0.0336, "num_tokens": 35840152.0, "reward": 1.862645149230957e-09, "reward_std": 0.1774420291185379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 725.5, "completions/mean_terminated_length": 705.6000366210938, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 3.746913580246914, "grad_norm": 0.8397243257628163, "kl": 0.238525390625, "learning_rate": 7.833187556687443e-08, "loss": -0.0187, "num_tokens": 35869388.0, "reward": -3.725290298461914e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 800.875, "completions/mean_terminated_length": 713.5652465820312, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 3.75, "grad_norm": 2.3875383809072157, "kl": 0.247802734375, "learning_rate": 7.796878272777835e-08, "loss": -0.0655, "num_tokens": 35901652.0, "reward": 1.280568540096283e-09, "reward_std": 0.1393391489982605, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.5133991837501526e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 799.96875, "completions/mean_terminated_length": 737.239990234375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 3.753086419753086, "grad_norm": 0.9268055926606276, "kl": 0.2421875, "learning_rate": 7.760637783490906e-08, "loss": -0.009, "num_tokens": 35933607.0, "reward": 0.0, "reward_std": 0.15413051843643188, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 750.625, "completions/mean_terminated_length": 711.5714721679688, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.756172839506173, "grad_norm": 0.8706513221683194, "kl": 0.244384765625, "learning_rate": 7.724466233750961e-08, "loss": 0.0241, "num_tokens": 35963827.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 790.3125, "completions/mean_terminated_length": 736.3846435546875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 3.7592592592592595, "grad_norm": 1.1387784304241038, "kl": 0.2215576171875, "learning_rate": 7.688363768206651e-08, "loss": 0.0682, "num_tokens": 35996437.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 743.75, "completions/mean_terminated_length": 714.7586059570312, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 3.7623456790123457, "grad_norm": 2.5365342314904638, "kl": 0.2642822265625, "learning_rate": 7.652330531230344e-08, "loss": -0.1692, "num_tokens": 36026337.0, "reward": -3.725290298461914e-09, "reward_std": 0.1642131358385086, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 768.03125, "completions/mean_terminated_length": 731.4642944335938, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 3.765432098765432, "grad_norm": 1.5158845625867385, "kl": 0.232666015625, "learning_rate": 7.616366666917571e-08, "loss": 0.0122, "num_tokens": 36057790.0, "reward": 9.313225746154785e-10, "reward_std": 0.1783732920885086, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 750.75, "completions/mean_terminated_length": 687.6923217773438, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 3.7685185185185186, "grad_norm": 1.118494479169383, "kl": 0.2314453125, "learning_rate": 7.580472319086442e-08, "loss": 0.0502, "num_tokens": 36088270.0, "reward": 1.862645149230957e-09, "reward_std": 0.1539483666419983, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 732.25, "completions/mean_terminated_length": 722.8386840820312, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 3.771604938271605, "grad_norm": 0.8305719376898156, "kl": 0.229736328125, "learning_rate": 7.544647631277085e-08, "loss": -0.0166, "num_tokens": 36117882.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 738.4375, "completions/mean_terminated_length": 685.5555419921875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.7746913580246915, "grad_norm": 1.0240970318607905, "kl": 0.2247314453125, "learning_rate": 7.508892746751034e-08, "loss": -0.0219, "num_tokens": 36148088.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 755.34375, "completions/mean_terminated_length": 680.1199951171875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 3.7777777777777777, "grad_norm": 0.9011193535676861, "kl": 0.2557373046875, "learning_rate": 7.473207808490701e-08, "loss": -0.0197, "num_tokens": 36178907.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 778.0, "completions/mean_terminated_length": 721.2307739257812, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.7808641975308643, "grad_norm": 1.0150857603607386, "kl": 0.2393798828125, "learning_rate": 7.437592959198796e-08, "loss": 0.0188, "num_tokens": 36210039.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 739.9375, "completions/mean_terminated_length": 687.3333129882812, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 3.7839506172839505, "grad_norm": 0.88761027735542, "kl": 0.236572265625, "learning_rate": 7.402048341297718e-08, "loss": 0.0144, "num_tokens": 36239681.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 762.3125, "completions/mean_terminated_length": 713.8518676757812, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 3.787037037037037, "grad_norm": 2.6095830013259484, "kl": 0.263427734375, "learning_rate": 7.36657409692903e-08, "loss": -0.0196, "num_tokens": 36270887.0, "reward": 0.0, "reward_std": 0.21855804324150085, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 751.03125, "completions/mean_terminated_length": 722.7930908203125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.7901234567901234, "grad_norm": 0.7237038686542401, "kl": 0.2569580078125, "learning_rate": 7.331170367952874e-08, "loss": 0.0165, "num_tokens": 36301424.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 766.84375, "completions/mean_terminated_length": 694.8399658203125, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.7932098765432096, "grad_norm": 0.951997830767936, "kl": 0.2742919921875, "learning_rate": 7.295837295947404e-08, "loss": 0.0099, "num_tokens": 36332239.0, "reward": 3.725290298461914e-09, "reward_std": 0.1255132108926773, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 777.46875, "completions/mean_terminated_length": 731.8148193359375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 3.7962962962962963, "grad_norm": 1.3570407985352184, "kl": 0.2540283203125, "learning_rate": 7.260575022208218e-08, "loss": -0.0134, "num_tokens": 36363738.0, "reward": 0.0, "reward_std": 0.15896356105804443, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 773.40625, "completions/mean_terminated_length": 727.0, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.799382716049383, "grad_norm": 1.1077044651823722, "kl": 0.216552734375, "learning_rate": 7.225383687747789e-08, "loss": -0.0375, "num_tokens": 36395031.0, "reward": 0.0, "reward_std": 0.15849053859710693, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 777.96875, "completions/mean_terminated_length": 709.0799560546875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 3.802469135802469, "grad_norm": 1.939508985822048, "kl": 0.201904296875, "learning_rate": 7.190263433294913e-08, "loss": 0.0532, "num_tokens": 36426290.0, "reward": 1.862645149230957e-09, "reward_std": 0.2718256115913391, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 679.9375, "completions/mean_terminated_length": 668.8386840820312, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 3.8055555555555554, "grad_norm": 1.4813568857659922, "kl": 0.2130126953125, "learning_rate": 7.155214399294146e-08, "loss": 0.0022, "num_tokens": 36454252.0, "reward": 0.0, "reward_std": 0.18591031432151794, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 778.6875, "completions/mean_terminated_length": 722.0769653320312, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 3.808641975308642, "grad_norm": 1.4791837298848438, "kl": 0.19793701171875, "learning_rate": 7.120236725905215e-08, "loss": 0.0315, "num_tokens": 36485714.0, "reward": -3.725290298461914e-09, "reward_std": 0.14229324460029602, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 734.4375, "completions/mean_terminated_length": 725.0967407226562, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 3.8117283950617287, "grad_norm": 1.1991852092814017, "kl": 0.2501220703125, "learning_rate": 7.085330553002494e-08, "loss": -0.0109, "num_tokens": 36515200.0, "reward": 0.0, "reward_std": 0.14888522028923035, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 721.125, "completions/mean_terminated_length": 689.7930908203125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.814814814814815, "grad_norm": 1.6082348672108904, "kl": 0.2298583984375, "learning_rate": 7.05049602017444e-08, "loss": 0.0667, "num_tokens": 36544972.0, "reward": -1.862645149230957e-09, "reward_std": 0.15909549593925476, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 791.75, "completions/mean_terminated_length": 686.1818237304688, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 3.817901234567901, "grad_norm": 1.4981153782420504, "kl": 0.2171630859375, "learning_rate": 7.015733266722993e-08, "loss": -0.0476, "num_tokens": 36576864.0, "reward": -2.3283064365386963e-09, "reward_std": 0.2329539954662323, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 2.7939677238464355e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 738.9375, "completions/mean_terminated_length": 686.1481323242188, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 3.8209876543209877, "grad_norm": 1.256041190538623, "kl": 0.2286376953125, "learning_rate": 6.981042431663075e-08, "loss": 0.018, "num_tokens": 36607170.0, "reward": 1.862645149230957e-09, "reward_std": 0.15654632449150085, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 746.84375, "completions/mean_terminated_length": 718.1724243164062, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.824074074074074, "grad_norm": 1.0884165892836302, "kl": 0.2041015625, "learning_rate": 6.946423653722006e-08, "loss": -0.0175, "num_tokens": 36637745.0, "reward": 0.0, "reward_std": 0.15909849107265472, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 733.3125, "completions/mean_terminated_length": 691.7857666015625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.8271604938271606, "grad_norm": 1.0050481325166605, "kl": 0.243896484375, "learning_rate": 6.911877071338942e-08, "loss": -0.0467, "num_tokens": 36667271.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 762.28125, "completions/mean_terminated_length": 713.8148193359375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 3.830246913580247, "grad_norm": 1.1216072656358058, "kl": 0.2501220703125, "learning_rate": 6.877402822664352e-08, "loss": 0.0133, "num_tokens": 36697640.0, "reward": 0.0, "reward_std": 0.1578986495733261, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 758.5, "completions/mean_terminated_length": 709.3333129882812, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.8333333333333335, "grad_norm": 0.9022035573659892, "kl": 0.2386474609375, "learning_rate": 6.843001045559416e-08, "loss": 0.0079, "num_tokens": 36728568.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 703.75, "completions/mean_terminated_length": 644.4444580078125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 3.8364197530864197, "grad_norm": 1.2986014931213434, "kl": 0.247802734375, "learning_rate": 6.808671877595524e-08, "loss": 0.0265, "num_tokens": 36757320.0, "reward": 0.0, "reward_std": 0.15040497481822968, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 714.3125, "completions/mean_terminated_length": 682.27587890625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 3.8395061728395063, "grad_norm": 13.687712391070466, "kl": 1.268798828125, "learning_rate": 6.774415456053697e-08, "loss": -0.0369, "num_tokens": 36787386.0, "reward": -4.889443516731262e-09, "reward_std": 0.1524234265089035, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.259629011154175e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 714.9375, "completions/mean_terminated_length": 694.3333740234375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 3.8425925925925926, "grad_norm": 1.0589863280587208, "kl": 0.2066650390625, "learning_rate": 6.740231917924053e-08, "loss": -0.0221, "num_tokens": 36816516.0, "reward": 0.0, "reward_std": 0.13008607923984528, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 754.96875, "completions/mean_terminated_length": 716.5357666015625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 3.8456790123456788, "grad_norm": 1.4588364279218964, "kl": 0.241455078125, "learning_rate": 6.706121399905245e-08, "loss": 0.0053, "num_tokens": 36846823.0, "reward": 0.0, "reward_std": 0.15324005484580994, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 694.8125, "completions/mean_terminated_length": 672.86669921875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 3.8487654320987654, "grad_norm": 0.651811657610569, "kl": 0.2752685546875, "learning_rate": 6.672084038403927e-08, "loss": -0.0013, "num_tokens": 36874885.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 774.65625, "completions/mean_terminated_length": 728.4815063476562, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 3.851851851851852, "grad_norm": 1.3869213160714808, "kl": 0.2127685546875, "learning_rate": 6.638119969534201e-08, "loss": 0.0236, "num_tokens": 36906034.0, "reward": -2.7939677238464355e-09, "reward_std": 0.18139660358428955, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 730.6875, "completions/mean_terminated_length": 688.7857666015625, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 3.8549382716049383, "grad_norm": 1.2346828352439247, "kl": 0.230224609375, "learning_rate": 6.604229329117064e-08, "loss": -0.0026, "num_tokens": 36935792.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 802.125, "completions/mean_terminated_length": 740.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.8580246913580245, "grad_norm": 1.0676462685125008, "kl": 0.239990234375, "learning_rate": 6.570412252679894e-08, "loss": 0.0158, "num_tokens": 36968488.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 783.40625, "completions/mean_terminated_length": 703.2083740234375, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 3.861111111111111, "grad_norm": 1.3215043546684153, "kl": 0.2652587890625, "learning_rate": 6.536668875455869e-08, "loss": 0.0101, "num_tokens": 37000453.0, "reward": 0.0, "reward_std": 0.16467581689357758, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -2.7939677238464355e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 742.5, "completions/mean_terminated_length": 690.370361328125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 3.8641975308641974, "grad_norm": 1.528425807782001, "kl": 0.243896484375, "learning_rate": 6.502999332383465e-08, "loss": -0.0477, "num_tokens": 37030729.0, "reward": 0.0, "reward_std": 0.25724995136260986, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 773.8125, "completions/mean_terminated_length": 703.760009765625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 3.867283950617284, "grad_norm": 1.4882893888851654, "kl": 0.239013671875, "learning_rate": 6.469403758105894e-08, "loss": -0.0675, "num_tokens": 37062319.0, "reward": -3.725290298461914e-09, "reward_std": 0.15071991086006165, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 726.96875, "completions/mean_terminated_length": 671.9629516601562, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 3.8703703703703702, "grad_norm": 0.9228504447247637, "kl": 0.2623291015625, "learning_rate": 6.435882286970556e-08, "loss": 0.0083, "num_tokens": 37091754.0, "reward": 0.0, "reward_std": 0.1724267452955246, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 758.4375, "completions/mean_terminated_length": 669.9166870117188, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 3.873456790123457, "grad_norm": 1.6608069921641562, "kl": 0.2398681640625, "learning_rate": 6.402435053028538e-08, "loss": -0.0788, "num_tokens": 37122412.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 762.03125, "completions/mean_terminated_length": 734.9310302734375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 3.876543209876543, "grad_norm": 0.9179852850231466, "kl": 0.2252197265625, "learning_rate": 6.369062190034036e-08, "loss": -0.0393, "num_tokens": 37152981.0, "reward": 2.3283064365386963e-09, "reward_std": 0.1416376531124115, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 803.0, "completions/mean_terminated_length": 741.1199951171875, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 3.8796296296296298, "grad_norm": 2.185652774060378, "kl": 0.2210693359375, "learning_rate": 6.335763831443847e-08, "loss": -0.0881, "num_tokens": 37185369.0, "reward": 0.0, "reward_std": 0.1948545277118683, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 772.0, "completions/mean_terminated_length": 688.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.882716049382716, "grad_norm": 3.1329146865719437, "kl": 0.230712890625, "learning_rate": 6.302540110416837e-08, "loss": -0.2205, "num_tokens": 37216725.0, "reward": 0.0, "reward_std": 0.13944709300994873, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 834.125, "completions/mean_terminated_length": 734.6666870117188, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 3.8858024691358026, "grad_norm": 1.376970072779452, "kl": 0.2353515625, "learning_rate": 6.269391159813372e-08, "loss": -0.0172, "num_tokens": 37250437.0, "reward": 0.0, "reward_std": 0.15089517831802368, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 764.375, "completions/mean_terminated_length": 691.6799926757812, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.888888888888889, "grad_norm": 1.6380279375547084, "kl": 0.25048828125, "learning_rate": 6.236317112194844e-08, "loss": -0.0775, "num_tokens": 37282245.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 752.40625, "completions/mean_terminated_length": 702.1111450195312, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 3.8919753086419755, "grad_norm": 0.9837897169735943, "kl": 0.2252197265625, "learning_rate": 6.203318099823094e-08, "loss": -0.0268, "num_tokens": 37312658.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 723.78125, "completions/mean_terminated_length": 668.1851806640625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 3.8950617283950617, "grad_norm": 1.3922457182827639, "kl": 0.252197265625, "learning_rate": 6.17039425465991e-08, "loss": -0.0769, "num_tokens": 37342367.0, "reward": -1.862645149230957e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 766.8125, "completions/mean_terminated_length": 719.1851806640625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.898148148148148, "grad_norm": 1.5101869396166963, "kl": 0.23046875, "learning_rate": 6.137545708366476e-08, "loss": -0.0927, "num_tokens": 37373441.0, "reward": 1.862645149230957e-09, "reward_std": 0.15856757760047913, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 744.5, "completions/mean_terminated_length": 704.5714721679688, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 3.9012345679012346, "grad_norm": 0.49481321176650966, "kl": 0.2091064453125, "learning_rate": 6.104772592302868e-08, "loss": -0.009, "num_tokens": 37403501.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 698.0, "completions/mean_terminated_length": 664.27587890625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 3.9043209876543212, "grad_norm": 1.6360742195653148, "kl": 0.2374267578125, "learning_rate": 6.072075037527519e-08, "loss": 0.0097, "num_tokens": 37432249.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 738.84375, "completions/mean_terminated_length": 698.107177734375, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.9074074074074074, "grad_norm": 0.9290586479842646, "kl": 0.234130859375, "learning_rate": 6.039453174796699e-08, "loss": -0.0167, "num_tokens": 37462168.0, "reward": -1.862645149230957e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 717.40625, "completions/mean_terminated_length": 696.9666748046875, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 3.9104938271604937, "grad_norm": 1.6258232379225002, "kl": 0.2481689453125, "learning_rate": 6.006907134563973e-08, "loss": 0.0463, "num_tokens": 37491149.0, "reward": 0.0, "reward_std": 0.17654143273830414, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 774.09375, "completions/mean_terminated_length": 690.7916870117188, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 3.9135802469135803, "grad_norm": 0.8252770862492023, "kl": 0.256103515625, "learning_rate": 5.974437046979711e-08, "loss": 0.0221, "num_tokens": 37522332.0, "reward": 0.0, "reward_std": 0.12888267636299133, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 746.28125, "completions/mean_terminated_length": 706.607177734375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 3.9166666666666665, "grad_norm": 2.2402671095082014, "kl": 0.19781494140625, "learning_rate": 5.9420430418905435e-08, "loss": 0.2052, "num_tokens": 37552597.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 729.0625, "completions/mean_terminated_length": 674.4444580078125, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.919753086419753, "grad_norm": 1.0094757911550474, "kl": 0.2978515625, "learning_rate": 5.909725248838854e-08, "loss": -0.0091, "num_tokens": 37582503.0, "reward": -2.7939677238464355e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 760.15625, "completions/mean_terminated_length": 686.2799682617188, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 3.9228395061728394, "grad_norm": 1.0322754762656066, "kl": 0.2508544921875, "learning_rate": 5.877483797062255e-08, "loss": -0.0157, "num_tokens": 37613260.0, "reward": -4.656612873077393e-10, "reward_std": 0.15889404714107513, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -4.656612873077393e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 809.28125, "completions/mean_terminated_length": 696.8095092773438, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.925925925925926, "grad_norm": 1.1216442703296912, "kl": 0.2850341796875, "learning_rate": 5.845318815493069e-08, "loss": -0.0258, "num_tokens": 37645861.0, "reward": -3.725290298461914e-09, "reward_std": 0.21545448899269104, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 800.875, "completions/mean_terminated_length": 699.45458984375, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.9290123456790123, "grad_norm": 0.5340341897893215, "kl": 0.2393798828125, "learning_rate": 5.813230432757829e-08, "loss": 0.0022, "num_tokens": 37678037.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 770.8125, "completions/mean_terminated_length": 734.6428833007812, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 3.932098765432099, "grad_norm": 1.5345192291964382, "kl": 0.22509765625, "learning_rate": 5.781218777176744e-08, "loss": -0.0874, "num_tokens": 37709591.0, "reward": 0.028124995529651642, "reward_std": 0.12057675421237946, "rewards/format_reward_func/mean": 7.450580596923828e-09, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 687.8125, "completions/mean_terminated_length": 653.0344848632812, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 3.935185185185185, "grad_norm": 1.0116862834248213, "kl": 0.26220703125, "learning_rate": 5.749283976763186e-08, "loss": 0.001, "num_tokens": 37737737.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 763.84375, "completions/mean_terminated_length": 662.0435180664062, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.9382716049382713, "grad_norm": 1.5332987303545238, "kl": 0.282470703125, "learning_rate": 5.717426159223204e-08, "loss": -0.0185, "num_tokens": 37768828.0, "reward": 3.725290298461914e-09, "reward_std": 0.19998514652252197, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 828.40625, "completions/mean_terminated_length": 751.8695678710938, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.941358024691358, "grad_norm": 0.7900048691429388, "kl": 0.2333984375, "learning_rate": 5.685645451954976e-08, "loss": -0.0043, "num_tokens": 37802145.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 767.9375, "completions/mean_terminated_length": 696.239990234375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 3.9444444444444446, "grad_norm": 0.8397533828484572, "kl": 0.259033203125, "learning_rate": 5.653941982048333e-08, "loss": 0.017, "num_tokens": 37833763.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 827.21875, "completions/mean_terminated_length": 674.1666870117188, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 3.947530864197531, "grad_norm": 0.6516620994614122, "kl": 0.3026123046875, "learning_rate": 5.6223158762842336e-08, "loss": 0.0037, "num_tokens": 37867098.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 717.375, "completions/mean_terminated_length": 673.5714721679688, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.950617283950617, "grad_norm": 1.5999414612910103, "kl": 0.2640380859375, "learning_rate": 5.59076726113426e-08, "loss": -0.1217, "num_tokens": 37896302.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 760.8125, "completions/mean_terminated_length": 700.0769653320312, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 3.9537037037037037, "grad_norm": 0.7509265961988767, "kl": 0.2271728515625, "learning_rate": 5.55929626276011e-08, "loss": -0.0069, "num_tokens": 37926740.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 747.0, "completions/mean_terminated_length": 728.5333862304688, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 3.9567901234567904, "grad_norm": 1.3573837093169847, "kl": 0.25048828125, "learning_rate": 5.527903007013099e-08, "loss": -0.115, "num_tokens": 37957236.0, "reward": -1.862645149230957e-09, "reward_std": 0.17631277441978455, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 735.96875, "completions/mean_terminated_length": 694.8214721679688, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 3.9598765432098766, "grad_norm": 1.833986422518576, "kl": 0.2196044921875, "learning_rate": 5.4965876194336567e-08, "loss": 0.0678, "num_tokens": 37987575.0, "reward": -1.862645149230957e-09, "reward_std": 0.19079440832138062, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1283 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 759.21875, "completions/mean_terminated_length": 698.1154174804688, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 3.962962962962963, "grad_norm": 3.1254104994235856, "kl": NaN, "learning_rate": 5.465350225250801e-08, "loss": -0.2184, "num_tokens": 38018746.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 711.7037353515625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 3.9660493827160495, "grad_norm": 1.2529092562723927, "kl": 0.2352294921875, "learning_rate": 5.4341909493816786e-08, "loss": -0.0364, "num_tokens": 38049506.0, "reward": 0.02812499739229679, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 734.15625, "completions/mean_terminated_length": 692.7500610351562, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 3.9691358024691357, "grad_norm": 1.2701229684839075, "kl": 0.2274169921875, "learning_rate": 5.4031099164310314e-08, "loss": -0.0172, "num_tokens": 38079387.0, "reward": 0.0, "reward_std": 0.15204033255577087, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 815.375, "completions/mean_terminated_length": 776.74072265625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 3.9722222222222223, "grad_norm": 1.071806597415335, "kl": 0.2408447265625, "learning_rate": 5.372107250690719e-08, "loss": -0.044, "num_tokens": 38111955.0, "reward": 0.0, "reward_std": 0.12547743320465088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 731.15625, "completions/mean_terminated_length": 676.9259033203125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 3.9753086419753085, "grad_norm": 1.513232095260684, "kl": 0.2294921875, "learning_rate": 5.341183076139219e-08, "loss": -0.0682, "num_tokens": 38141760.0, "reward": 4.656612873077393e-10, "reward_std": 0.21574173867702484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.259629011154175e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 764.40625, "completions/mean_terminated_length": 704.5, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 3.978395061728395, "grad_norm": 1.1832248599741026, "kl": 0.2423095703125, "learning_rate": 5.310337516441102e-08, "loss": -0.0264, "num_tokens": 38172557.0, "reward": -3.725290298461914e-09, "reward_std": 0.1510920524597168, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 820.125, "completions/mean_terminated_length": 740.3478393554688, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 3.9814814814814814, "grad_norm": 2.5120349336451113, "kl": 0.265869140625, "learning_rate": 5.279570694946581e-08, "loss": 0.0129, "num_tokens": 38205573.0, "reward": 0.0, "reward_std": 0.23568767309188843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4751909673213959, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 803.625, "completions/mean_terminated_length": 717.3912963867188, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 3.984567901234568, "grad_norm": 0.038282925972768285, "kl": 0.242431640625, "learning_rate": 5.2488827346910015e-08, "loss": 0.0002, "num_tokens": 38238237.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 751.625, "completions/mean_terminated_length": 712.7142944335938, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 3.9876543209876543, "grad_norm": 0.8511483491169654, "kl": 0.2491455078125, "learning_rate": 5.21827375839432e-08, "loss": -0.0101, "num_tokens": 38268473.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 775.625, "completions/mean_terminated_length": 706.0799560546875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 3.9907407407407405, "grad_norm": 1.4681370946446408, "kl": 0.235107421875, "learning_rate": 5.187743888460669e-08, "loss": -0.1052, "num_tokens": 38299789.0, "reward": 0.0, "reward_std": 0.15859536826610565, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 804.75, "completions/mean_terminated_length": 731.6666870117188, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 3.993827160493827, "grad_norm": 1.5640264512679012, "kl": 0.2452392578125, "learning_rate": 5.15729324697782e-08, "loss": -0.1493, "num_tokens": 38332157.0, "reward": -3.725290298461914e-09, "reward_std": 0.20989200472831726, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 743.96875, "completions/mean_terminated_length": 703.9642944335938, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 3.996913580246914, "grad_norm": 0.5742256876448175, "kl": 0.220458984375, "learning_rate": 5.126921955716723e-08, "loss": 0.0024, "num_tokens": 38362624.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 793.0625, "completions/mean_terminated_length": 654.5, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.0, "grad_norm": 1.823258207693366, "kl": 0.24072265625, "learning_rate": 5.096630136131e-08, "loss": -0.0203, "num_tokens": 38395266.0, "reward": 0.0, "reward_std": 0.15346293151378632, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 803.78125, "completions/mean_terminated_length": 752.9615478515625, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 4.003086419753086, "grad_norm": 1.3754924643870987, "kl": 0.2269287109375, "learning_rate": 5.0664179093564765e-08, "loss": -0.0257, "num_tokens": 38427627.0, "reward": 0.0, "reward_std": 0.12547743320465088, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 718.0, "completions/mean_terminated_length": 674.2857666015625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.006172839506172, "grad_norm": 1.1025742548783262, "kl": 0.2431640625, "learning_rate": 5.036285396210685e-08, "loss": 0.014, "num_tokens": 38456931.0, "reward": 0.0, "reward_std": 0.12561336159706116, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 758.78125, "completions/mean_terminated_length": 684.5199584960938, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.0092592592592595, "grad_norm": 1.189512363267725, "kl": 0.2606201171875, "learning_rate": 5.0062327171923935e-08, "loss": 0.0187, "num_tokens": 38488392.0, "reward": -3.725290298461914e-09, "reward_std": 0.1584044247865677, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 726.40625, "completions/mean_terminated_length": 683.8928833007812, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 4.012345679012346, "grad_norm": 0.7728010763079243, "kl": 0.2274169921875, "learning_rate": 4.976259992481097e-08, "loss": -0.0322, "num_tokens": 38517773.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 738.46875, "completions/mean_terminated_length": 719.433349609375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 4.015432098765432, "grad_norm": 1.2090473868678877, "kl": 0.2313232421875, "learning_rate": 4.946367341936578e-08, "loss": 0.0106, "num_tokens": 38547764.0, "reward": 0.0, "reward_std": 0.1590990126132965, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 706.625, "completions/mean_terminated_length": 661.2857666015625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 4.018518518518518, "grad_norm": 1.1869989440924829, "kl": 0.23388671875, "learning_rate": 4.916554885098403e-08, "loss": 0.0575, "num_tokens": 38576708.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 711.9375, "completions/mean_terminated_length": 639.923095703125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.021604938271605, "grad_norm": 1.4872898806560737, "kl": 0.2587890625, "learning_rate": 4.8868227411854287e-08, "loss": -0.0341, "num_tokens": 38606062.0, "reward": 0.0, "reward_std": 0.14842107892036438, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 757.125, "completions/mean_terminated_length": 707.7037353515625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 4.0246913580246915, "grad_norm": 0.8220477138299612, "kl": 0.2186279296875, "learning_rate": 4.857171029095364e-08, "loss": 0.0149, "num_tokens": 38637174.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 764.8125, "completions/mean_terminated_length": 738.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 4.027777777777778, "grad_norm": 1.29623937090978, "kl": 0.2288818359375, "learning_rate": 4.827599867404261e-08, "loss": 0.0527, "num_tokens": 38668204.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 739.78125, "completions/mean_terminated_length": 699.1785888671875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 4.030864197530864, "grad_norm": 1.4667987300215786, "kl": 0.2337646484375, "learning_rate": 4.7981093743660634e-08, "loss": -0.0212, "num_tokens": 38698089.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 774.0, "completions/mean_terminated_length": 704.0, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 4.033950617283951, "grad_norm": 1.5253578068506375, "kl": 0.2095947265625, "learning_rate": 4.768699667912118e-08, "loss": -0.0004, "num_tokens": 38729637.0, "reward": 0.0, "reward_std": 0.22146491706371307, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 736.96875, "completions/mean_terminated_length": 656.5999755859375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 4.037037037037037, "grad_norm": 1.1419914492308705, "kl": 0.2244873046875, "learning_rate": 4.739370865650716e-08, "loss": 0.0326, "num_tokens": 38759980.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 707.6875, "completions/mean_terminated_length": 662.5, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 4.040123456790123, "grad_norm": 1.328147668615405, "kl": 0.2315673828125, "learning_rate": 4.710123084866602e-08, "loss": 0.0498, "num_tokens": 38788962.0, "reward": -3.725290298461914e-09, "reward_std": 0.1550375372171402, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 761.375, "completions/mean_terminated_length": 673.8333740234375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.04320987654321, "grad_norm": 0.5849292838590477, "kl": 0.30615234375, "learning_rate": 4.6809564425205286e-08, "loss": 0.0211, "num_tokens": 38819658.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 691.65625, "completions/mean_terminated_length": 657.27587890625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 4.046296296296297, "grad_norm": 0.9497969862630996, "kl": 0.24951171875, "learning_rate": 4.6518710552487796e-08, "loss": -0.0088, "num_tokens": 38847783.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 821.96875, "completions/mean_terminated_length": 742.9130859375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.049382716049383, "grad_norm": 1.1436165805403045, "kl": 0.2147216796875, "learning_rate": 4.6228670393627014e-08, "loss": 0.0453, "num_tokens": 38881246.0, "reward": 2.7939677238464355e-09, "reward_std": 0.14513085782527924, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 2.7939677238464355e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 768.25, "completions/mean_terminated_length": 731.7142944335938, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.052469135802469, "grad_norm": 1.2709758034932812, "kl": 0.220458984375, "learning_rate": 4.5939445108482466e-08, "loss": 0.0051, "num_tokens": 38912210.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 745.03125, "completions/mean_terminated_length": 666.9199829101562, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 4.055555555555555, "grad_norm": 1.5778343204658982, "kl": 0.238525390625, "learning_rate": 4.565103585365479e-08, "loss": -0.0128, "num_tokens": 38942531.0, "reward": 0.0, "reward_std": 0.1633128821849823, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 726.75, "completions/mean_terminated_length": 671.7037353515625, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.058641975308642, "grad_norm": 3.2831580240325646, "kl": 0.2197265625, "learning_rate": 4.536344378248161e-08, "loss": -0.0775, "num_tokens": 38972527.0, "reward": 0.0, "reward_std": 0.2469368875026703, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 818.4375, "completions/mean_terminated_length": 725.0, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.061728395061729, "grad_norm": 2.1271171477521342, "kl": 0.239013671875, "learning_rate": 4.50766700450326e-08, "loss": 0.0236, "num_tokens": 39005657.0, "reward": -1.862645149230957e-09, "reward_std": 0.24546441435813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 736.71875, "completions/mean_terminated_length": 640.9583740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 4.064814814814815, "grad_norm": 1.5065991977028075, "kl": 0.235595703125, "learning_rate": 4.479071578810481e-08, "loss": -0.1461, "num_tokens": 39035432.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 833.09375, "completions/mean_terminated_length": 779.6399536132812, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 4.067901234567901, "grad_norm": 0.5444820448218113, "kl": 0.2464599609375, "learning_rate": 4.450558215521838e-08, "loss": 0.0067, "num_tokens": 39069295.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 753.8125, "completions/mean_terminated_length": 703.7777709960938, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 4.070987654320987, "grad_norm": 0.043507775135647214, "kl": 0.244384765625, "learning_rate": 4.4221270286611765e-08, "loss": 0.0002, "num_tokens": 39099637.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 775.71875, "completions/mean_terminated_length": 706.2000122070312, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 4.074074074074074, "grad_norm": 1.903184764361162, "kl": 0.2518310546875, "learning_rate": 4.3937781319237175e-08, "loss": -0.113, "num_tokens": 39130872.0, "reward": -3.725290298461914e-09, "reward_std": 0.26005661487579346, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 2.3283064365386963e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 725.21875, "completions/mean_terminated_length": 669.888916015625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.077160493827161, "grad_norm": 0.7411509330381716, "kl": 0.2197265625, "learning_rate": 4.365511638675612e-08, "loss": 0.0307, "num_tokens": 39160591.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 775.5, "completions/mean_terminated_length": 705.9199829101562, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 4.080246913580247, "grad_norm": 0.647895910812952, "kl": 0.2244873046875, "learning_rate": 4.337327661953477e-08, "loss": -0.0036, "num_tokens": 39192275.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 694.90625, "completions/mean_terminated_length": 633.9629516601562, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 4.083333333333333, "grad_norm": 1.7765976153792373, "kl": 0.25439453125, "learning_rate": 4.3092263144639565e-08, "loss": -0.0145, "num_tokens": 39220976.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 753.59375, "completions/mean_terminated_length": 714.9642944335938, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 4.08641975308642, "grad_norm": 0.5632068946424565, "kl": 0.2376708984375, "learning_rate": 4.281207708583256e-08, "loss": 0.0197, "num_tokens": 39251451.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 757.59375, "completions/mean_terminated_length": 696.1154174804688, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.089506172839506, "grad_norm": 0.0074581047025977934, "kl": 0.2513427734375, "learning_rate": 4.253271956356713e-08, "loss": 0.0003, "num_tokens": 39282394.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 791.9375, "completions/mean_terminated_length": 714.5833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.092592592592593, "grad_norm": 0.48360248269634865, "kl": 0.2557373046875, "learning_rate": 4.2254191694983096e-08, "loss": -0.0138, "num_tokens": 39314036.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 775.5, "completions/mean_terminated_length": 692.6666870117188, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.095679012345679, "grad_norm": 0.8773153501536247, "kl": 0.2265625, "learning_rate": 4.197649459390287e-08, "loss": -0.0089, "num_tokens": 39345540.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 700.09375, "completions/mean_terminated_length": 653.8214721679688, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 4.098765432098766, "grad_norm": 1.3071656507023186, "kl": 0.2476806640625, "learning_rate": 4.169962937082635e-08, "loss": 0.0354, "num_tokens": 39374395.0, "reward": 0.0, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 755.8125, "completions/mean_terminated_length": 693.923095703125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 4.101851851851852, "grad_norm": 0.012014299408962602, "kl": 0.256591796875, "learning_rate": 4.142359713292698e-08, "loss": 0.0003, "num_tokens": 39405145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 759.5, "completions/mean_terminated_length": 710.5184936523438, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 4.104938271604938, "grad_norm": 1.1647885790172063, "kl": 0.1990966796875, "learning_rate": 4.11483989840471e-08, "loss": 0.0107, "num_tokens": 39435969.0, "reward": 0.0, "reward_std": 0.1588343232870102, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 732.5625, "completions/mean_terminated_length": 690.9285888671875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.1080246913580245, "grad_norm": 0.892157759396175, "kl": 0.2264404296875, "learning_rate": 4.087403602469347e-08, "loss": -0.0048, "num_tokens": 39465951.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 770.15625, "completions/mean_terminated_length": 753.2333984375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 4.111111111111111, "grad_norm": 1.6380842948757888, "kl": 0.2479248046875, "learning_rate": 4.060050935203307e-08, "loss": -0.0459, "num_tokens": 39497596.0, "reward": -1.862645149230957e-09, "reward_std": 0.14825105667114258, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 666.28125, "completions/mean_terminated_length": 642.433349609375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 4.114197530864198, "grad_norm": 1.6888844533435508, "kl": 0.2288818359375, "learning_rate": 4.032782005988861e-08, "loss": -0.0474, "num_tokens": 39524797.0, "reward": 0.0, "reward_std": 0.15770003199577332, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 804.53125, "completions/mean_terminated_length": 718.6521606445312, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 4.117283950617284, "grad_norm": 1.7148153578814338, "kl": 0.233642578125, "learning_rate": 4.0055969238733945e-08, "loss": -0.0327, "num_tokens": 39557446.0, "reward": -1.862645149230957e-09, "reward_std": 0.1082625538110733, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 727.28125, "completions/mean_terminated_length": 644.2000122070312, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.12037037037037, "grad_norm": 0.568912016435439, "kl": 0.2469482421875, "learning_rate": 3.978495797569012e-08, "loss": 0.0021, "num_tokens": 39587207.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 781.0625, "completions/mean_terminated_length": 653.8095092773438, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.1234567901234565, "grad_norm": 1.2062782284133675, "kl": 0.240966796875, "learning_rate": 3.95147873545208e-08, "loss": -0.0935, "num_tokens": 39619137.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 718.96875, "completions/mean_terminated_length": 698.6333618164062, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 4.1265432098765435, "grad_norm": 0.7993156248367748, "kl": 0.2086181640625, "learning_rate": 3.924545845562791e-08, "loss": 0.0179, "num_tokens": 39648652.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 792.96875, "completions/mean_terminated_length": 759.9642944335938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.12962962962963, "grad_norm": 1.1694157175137634, "kl": 0.244384765625, "learning_rate": 3.8976972356047325e-08, "loss": -0.0599, "num_tokens": 39680315.0, "reward": 0.0, "reward_std": 0.1475234180688858, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 711.75, "completions/mean_terminated_length": 653.9259033203125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 4.132716049382716, "grad_norm": 1.1859909730996183, "kl": 0.268798828125, "learning_rate": 3.870933012944472e-08, "loss": -0.0761, "num_tokens": 39709199.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 756.0, "completions/mean_terminated_length": 717.7142944335938, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 4.135802469135802, "grad_norm": 2.6917961162589608, "kl": 0.22900390625, "learning_rate": 3.844253284611096e-08, "loss": 0.2168, "num_tokens": 39740131.0, "reward": 3.725290298461914e-09, "reward_std": 0.16228695213794708, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 703.71875, "completions/mean_terminated_length": 670.586181640625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.138888888888889, "grad_norm": 1.1046492645136858, "kl": 0.2418212890625, "learning_rate": 3.817658157295819e-08, "loss": -0.0054, "num_tokens": 39768538.0, "reward": -3.725290298461914e-09, "reward_std": 0.1644470989704132, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 742.25, "completions/mean_terminated_length": 702.0000610351562, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 4.1419753086419755, "grad_norm": 1.5064693460733816, "kl": 0.22265625, "learning_rate": 3.791147737351541e-08, "loss": 0.0433, "num_tokens": 39798398.0, "reward": 0.0, "reward_std": 0.2021070271730423, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 829.9375, "completions/mean_terminated_length": 697.1578979492188, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.145061728395062, "grad_norm": 1.1798170534047878, "kl": 0.2657470703125, "learning_rate": 3.7647221307923946e-08, "loss": -0.0032, "num_tokens": 39831736.0, "reward": 0.02812499925494194, "reward_std": 0.10788977891206741, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1343 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 735.96875, "completions/mean_terminated_length": 682.629638671875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.148148148148148, "grad_norm": 2.1159050272528184, "kl": NaN, "learning_rate": 3.738381443293376e-08, "loss": -0.1796, "num_tokens": 39862139.0, "reward": -1.862645149230957e-09, "reward_std": 0.15907026827335358, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 743.65625, "completions/mean_terminated_length": 691.74072265625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 4.151234567901234, "grad_norm": 1.0283680016874122, "kl": 0.2330322265625, "learning_rate": 3.7121257801898814e-08, "loss": -0.0279, "num_tokens": 39891952.0, "reward": -3.725290298461914e-09, "reward_std": 0.1560322642326355, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 700.5, "completions/mean_terminated_length": 678.933349609375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 4.154320987654321, "grad_norm": 1.3339008610165208, "kl": 0.2259521484375, "learning_rate": 3.685955246477296e-08, "loss": 0.0952, "num_tokens": 39921040.0, "reward": -1.862645149230957e-09, "reward_std": 0.17479786276817322, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 769.4375, "completions/mean_terminated_length": 698.1599731445312, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.157407407407407, "grad_norm": 0.7137905706762014, "kl": 0.2357177734375, "learning_rate": 3.659869946810581e-08, "loss": 0.0029, "num_tokens": 39951866.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 734.0625, "completions/mean_terminated_length": 667.1538696289062, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 4.160493827160494, "grad_norm": 0.6583705055243982, "kl": 0.2454833984375, "learning_rate": 3.6338699855038486e-08, "loss": 0.0113, "num_tokens": 39981844.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 744.40625, "completions/mean_terminated_length": 692.629638671875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.16358024691358, "grad_norm": 1.1763179529794587, "kl": 0.254638671875, "learning_rate": 3.6079554665299414e-08, "loss": 0.0419, "num_tokens": 40012597.0, "reward": 2.7939677238464355e-09, "reward_std": 0.17487852275371552, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 2.7939677238464355e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 620.5925903320312, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 4.166666666666667, "grad_norm": 1.6269839135127173, "kl": 0.2401123046875, "learning_rate": 3.5821264935200294e-08, "loss": 0.0475, "num_tokens": 40040325.0, "reward": 0.0, "reward_std": 0.18347704410552979, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 715.5625, "completions/mean_terminated_length": 683.6551513671875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 4.169753086419753, "grad_norm": 1.4784932851557824, "kl": 0.2340087890625, "learning_rate": 3.5563831697631776e-08, "loss": 0.0049, "num_tokens": 40069951.0, "reward": -3.725290298461914e-09, "reward_std": 0.23774057626724243, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.43994131684303284, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 740.5625, "completions/mean_terminated_length": 688.0740966796875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.172839506172839, "grad_norm": 1.0109079412657824, "kl": 0.2315673828125, "learning_rate": 3.53072559820595e-08, "loss": -0.0407, "num_tokens": 40099777.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 742.15625, "completions/mean_terminated_length": 677.1154174804688, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.175925925925926, "grad_norm": 1.4540323991785824, "kl": 0.2296142578125, "learning_rate": 3.505153881451997e-08, "loss": 0.0178, "num_tokens": 40130070.0, "reward": 1.862645149230957e-09, "reward_std": 0.1511717438697815, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 700.625, "completions/mean_terminated_length": 679.0667114257812, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.179012345679013, "grad_norm": 1.2291250718486564, "kl": 0.246337890625, "learning_rate": 3.479668121761617e-08, "loss": 0.0417, "num_tokens": 40159118.0, "reward": 0.0, "reward_std": 0.16396799683570862, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 698.0769653320312, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.182098765432099, "grad_norm": 0.6068205422660885, "kl": 0.2266845703125, "learning_rate": 3.45426842105139e-08, "loss": -0.0017, "num_tokens": 40189600.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 705.5, "completions/mean_terminated_length": 646.5184936523438, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 4.185185185185185, "grad_norm": 0.955485888096828, "kl": 0.2479248046875, "learning_rate": 3.428954880893745e-08, "loss": -0.0167, "num_tokens": 40218696.0, "reward": -1.862645149230957e-09, "reward_std": 0.13241708278656006, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 688.84375, "completions/mean_terminated_length": 640.9642944335938, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 4.188271604938271, "grad_norm": 1.1845120812115468, "kl": 0.22705078125, "learning_rate": 3.403727602516554e-08, "loss": 0.0325, "num_tokens": 40247387.0, "reward": -1.862645149230957e-09, "reward_std": 0.18656574189662933, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 716.90625, "completions/mean_terminated_length": 685.137939453125, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 4.191358024691358, "grad_norm": 2.31573766530586, "kl": 0.2205810546875, "learning_rate": 3.3785866868027426e-08, "loss": -0.2404, "num_tokens": 40276972.0, "reward": -6.51925802230835e-09, "reward_std": 0.1844421625137329, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 715.3125, "completions/mean_terminated_length": 671.2142944335938, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.194444444444445, "grad_norm": 0.7058595940206223, "kl": 0.2503662109375, "learning_rate": 3.353532234289849e-08, "loss": -0.0048, "num_tokens": 40306290.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 741.75, "completions/mean_terminated_length": 712.5516967773438, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 4.197530864197531, "grad_norm": 1.0673894750164643, "kl": 0.228271484375, "learning_rate": 3.3285643451696796e-08, "loss": 0.0088, "num_tokens": 40336226.0, "reward": -3.725290298461914e-09, "reward_std": 0.13493458926677704, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 722.1875, "completions/mean_terminated_length": 690.9655151367188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 4.200617283950617, "grad_norm": 0.013044095090708883, "kl": 0.260498046875, "learning_rate": 3.303683119287859e-08, "loss": 0.0003, "num_tokens": 40365748.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 747.1875, "completions/mean_terminated_length": 695.9259033203125, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.203703703703703, "grad_norm": 1.3756894297308906, "kl": 0.235107421875, "learning_rate": 3.278888656143453e-08, "loss": -0.0675, "num_tokens": 40395882.0, "reward": -1.862645149230957e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 781.15625, "completions/mean_terminated_length": 736.1851806640625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 4.20679012345679, "grad_norm": 1.1453659014890265, "kl": 0.2275390625, "learning_rate": 3.254181054888569e-08, "loss": 0.0535, "num_tokens": 40427243.0, "reward": 1.862645149230957e-09, "reward_std": 0.15646272897720337, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 679.125, "completions/mean_terminated_length": 629.857177734375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 4.209876543209877, "grad_norm": 1.6602154127880575, "kl": 0.287109375, "learning_rate": 3.2295604143279534e-08, "loss": 0.0005, "num_tokens": 40455471.0, "reward": 0.0, "reward_std": 0.19475162029266357, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 731.5625, "completions/mean_terminated_length": 712.0667114257812, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 4.212962962962963, "grad_norm": 1.1127604228909358, "kl": 0.2222900390625, "learning_rate": 3.205026832918606e-08, "loss": -0.0076, "num_tokens": 40485841.0, "reward": 0.0, "reward_std": 0.158689484000206, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 688.71875, "completions/mean_terminated_length": 666.36669921875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 4.216049382716049, "grad_norm": 0.011130027266053834, "kl": 0.2569580078125, "learning_rate": 3.1805804087693676e-08, "loss": 0.0003, "num_tokens": 40513824.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 747.25, "completions/mean_terminated_length": 683.3846435546875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 4.219135802469136, "grad_norm": 1.0897384378388661, "kl": 0.259765625, "learning_rate": 3.156221239640558e-08, "loss": -0.0705, "num_tokens": 40544048.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 816.875, "completions/mean_terminated_length": 747.8333740234375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 4.222222222222222, "grad_norm": 3.4160042617833395, "kl": 0.2247314453125, "learning_rate": 3.13194942294355e-08, "loss": 0.0505, "num_tokens": 40576872.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 732.3125, "completions/mean_terminated_length": 665.0, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.2253086419753085, "grad_norm": 0.708153099053505, "kl": 0.275390625, "learning_rate": 3.1077650557404076e-08, "loss": 0.0106, "num_tokens": 40606486.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 645.8125, "completions/mean_terminated_length": 620.6000366210938, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 4.228395061728395, "grad_norm": 0.9270864093096222, "kl": 0.2149658203125, "learning_rate": 3.083668234743489e-08, "loss": -0.0417, "num_tokens": 40633316.0, "reward": 0.0, "reward_std": 0.12942197918891907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 703.25, "completions/mean_terminated_length": 681.86669921875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 4.231481481481482, "grad_norm": 0.009636437470312481, "kl": 0.236083984375, "learning_rate": 3.059659056315053e-08, "loss": 0.0002, "num_tokens": 40662216.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 684.625, "completions/mean_terminated_length": 662.0000610351562, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 4.234567901234568, "grad_norm": 0.7075246434007493, "kl": 0.27001953125, "learning_rate": 3.035737616466885e-08, "loss": 0.0019, "num_tokens": 40691088.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 777.96875, "completions/mean_terminated_length": 732.4074096679688, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 4.237654320987654, "grad_norm": 1.5995754242130091, "kl": 0.28271484375, "learning_rate": 3.0119040108598974e-08, "loss": -0.0659, "num_tokens": 40722603.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 706.5, "completions/mean_terminated_length": 661.1428833007812, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 4.2407407407407405, "grad_norm": 0.008518979815666915, "kl": 0.2490234375, "learning_rate": 2.98815833480377e-08, "loss": 0.0002, "num_tokens": 40751287.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 767.71875, "completions/mean_terminated_length": 731.107177734375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 4.243827160493828, "grad_norm": 0.6234656514821495, "kl": 0.2220458984375, "learning_rate": 2.964500683256549e-08, "loss": -0.0157, "num_tokens": 40782634.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 733.15625, "completions/mean_terminated_length": 703.0689697265625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.246913580246914, "grad_norm": 1.83563101530564, "kl": 0.21240234375, "learning_rate": 2.9409311508242663e-08, "loss": 0.1522, "num_tokens": 40812755.0, "reward": 0.0, "reward_std": 0.14941859245300293, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 693.46875, "completions/mean_terminated_length": 659.27587890625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.25, "grad_norm": 0.6904988249895223, "kl": 0.2445068359375, "learning_rate": 2.9174498317605794e-08, "loss": 0.0234, "num_tokens": 40841254.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 685.0399780273438, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.253086419753086, "grad_norm": 0.7484817161709968, "kl": 0.239013671875, "learning_rate": 2.894056819966384e-08, "loss": -0.0261, "num_tokens": 40872180.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 734.4375, "completions/mean_terminated_length": 680.8148193359375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 4.256172839506172, "grad_norm": 2.3139778781618725, "kl": 0.235107421875, "learning_rate": 2.8707522089894354e-08, "loss": -0.0169, "num_tokens": 40902214.0, "reward": -3.725290298461914e-09, "reward_std": 0.24788418412208557, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 729.90625, "completions/mean_terminated_length": 710.300048828125, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 4.2592592592592595, "grad_norm": 1.6495025569314476, "kl": 0.2481689453125, "learning_rate": 2.8475360920239723e-08, "loss": -0.0115, "num_tokens": 40932035.0, "reward": 0.0, "reward_std": 0.1961304098367691, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 768.875, "completions/mean_terminated_length": 710.0, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.262345679012346, "grad_norm": 0.45062565058710474, "kl": 0.3572998046875, "learning_rate": 2.8244085619103546e-08, "loss": 0.0004, "num_tokens": 40963531.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 750.40625, "completions/mean_terminated_length": 673.7999877929688, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.265432098765432, "grad_norm": 1.5625064773952522, "kl": 0.24951171875, "learning_rate": 2.8013697111346906e-08, "loss": -0.0752, "num_tokens": 40994400.0, "reward": -3.725290298461914e-09, "reward_std": 0.2023996114730835, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 754.53125, "completions/mean_terminated_length": 726.6551513671875, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 4.268518518518518, "grad_norm": 1.4024754395459498, "kl": 0.2464599609375, "learning_rate": 2.778419631828463e-08, "loss": -0.0026, "num_tokens": 41025157.0, "reward": 0.0, "reward_std": 0.2116771638393402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 704.78125, "completions/mean_terminated_length": 659.1785888671875, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.271604938271605, "grad_norm": 0.7706216229059599, "kl": 0.2330322265625, "learning_rate": 2.755558415768147e-08, "loss": 0.0102, "num_tokens": 41053978.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 738.46875, "completions/mean_terminated_length": 685.5925903320312, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 4.2746913580246915, "grad_norm": 1.3983392944478534, "kl": 0.24609375, "learning_rate": 2.732786154374869e-08, "loss": 0.0219, "num_tokens": 41083481.0, "reward": 0.0, "reward_std": 0.17821970582008362, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 774.46875, "completions/mean_terminated_length": 691.2916870117188, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 4.277777777777778, "grad_norm": 0.8699763418020097, "kl": 0.4173583984375, "learning_rate": 2.7101029387140318e-08, "loss": 0.0004, "num_tokens": 41114828.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 700.0, "completions/mean_terminated_length": 678.4000244140625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 4.280864197530864, "grad_norm": 1.2608463260542855, "kl": 0.248291015625, "learning_rate": 2.6875088594949387e-08, "loss": 0.034, "num_tokens": 41143404.0, "reward": 5.587935447692871e-09, "reward_std": 0.1777314394712448, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 770.34375, "completions/mean_terminated_length": 685.7916870117188, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.283950617283951, "grad_norm": 1.6406900239739175, "kl": 0.227783203125, "learning_rate": 2.6650040070704484e-08, "loss": -0.0029, "num_tokens": 41174483.0, "reward": 0.0, "reward_std": 0.18626266717910767, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 739.5, "completions/mean_terminated_length": 698.857177734375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 4.287037037037037, "grad_norm": 0.7099944730023894, "kl": 0.2410888671875, "learning_rate": 2.6425884714365966e-08, "loss": -0.0283, "num_tokens": 41204375.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 763.125, "completions/mean_terminated_length": 690.0799560546875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.290123456790123, "grad_norm": 0.9575861709298303, "kl": 0.2384033203125, "learning_rate": 2.6202623422322546e-08, "loss": -0.0255, "num_tokens": 41235415.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 752.6875, "completions/mean_terminated_length": 702.4444580078125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 4.29320987654321, "grad_norm": 1.1257990878802664, "kl": 0.2274169921875, "learning_rate": 2.5980257087387546e-08, "loss": 0.0471, "num_tokens": 41265769.0, "reward": -4.6566128730773926e-09, "reward_std": 0.11199356615543365, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 759.15625, "completions/mean_terminated_length": 670.875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.296296296296296, "grad_norm": 1.0121036308932967, "kl": 0.2032470703125, "learning_rate": 2.5758786598795325e-08, "loss": -0.0669, "num_tokens": 41296366.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 755.0625, "completions/mean_terminated_length": 705.25927734375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 4.299382716049383, "grad_norm": 0.5909457358275607, "kl": 0.2374267578125, "learning_rate": 2.5538212842197926e-08, "loss": -0.0004, "num_tokens": 41326864.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 718.40625, "completions/mean_terminated_length": 686.7930908203125, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.302469135802469, "grad_norm": 1.166366330718843, "kl": 0.23663330078125, "learning_rate": 2.5318536699661246e-08, "loss": -0.0035, "num_tokens": 41356101.0, "reward": 0.0, "reward_std": 0.1670861691236496, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 732.75, "completions/mean_terminated_length": 665.5385131835938, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.305555555555555, "grad_norm": 1.8184002244796087, "kl": 0.230224609375, "learning_rate": 2.5099759049661802e-08, "loss": -0.0584, "num_tokens": 41385837.0, "reward": 0.0, "reward_std": 0.20034268498420715, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 667.875, "completions/mean_terminated_length": 656.3870849609375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.308641975308642, "grad_norm": 1.46973767164458, "kl": 0.2149658203125, "learning_rate": 2.4881880767083002e-08, "loss": -0.0031, "num_tokens": 41413601.0, "reward": 9.313225746154785e-10, "reward_std": 0.18115665018558502, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -4.656612873077393e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 754.0625, "completions/mean_terminated_length": 664.0833740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 4.311728395061729, "grad_norm": 1.8273127095817205, "kl": 0.276123046875, "learning_rate": 2.4664902723211674e-08, "loss": -0.02, "num_tokens": 41444367.0, "reward": -3.725290298461914e-09, "reward_std": 0.15263314545154572, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 708.53125, "completions/mean_terminated_length": 635.7307739257812, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 4.314814814814815, "grad_norm": 1.446883994262247, "kl": 0.3037109375, "learning_rate": 2.444882578573476e-08, "loss": 0.0152, "num_tokens": 41473768.0, "reward": 7.450580596923828e-09, "reward_std": 0.25062650442123413, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 768.0625, "completions/mean_terminated_length": 731.5000610351562, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 4.317901234567901, "grad_norm": 1.0826332800705931, "kl": 0.23486328125, "learning_rate": 2.4233650818735573e-08, "loss": -0.0763, "num_tokens": 41505074.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 799.625, "completions/mean_terminated_length": 697.6364135742188, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.320987654320987, "grad_norm": 0.7950440121724911, "kl": 0.2513427734375, "learning_rate": 2.401937868269058e-08, "loss": -0.0097, "num_tokens": 41537470.0, "reward": 0.028124995529651642, "reward_std": 0.055743563920259476, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 818.15625, "completions/mean_terminated_length": 724.5909423828125, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 4.324074074074074, "grad_norm": 2.0070380457936463, "kl": 0.2208251953125, "learning_rate": 2.380601023446577e-08, "loss": 0.1145, "num_tokens": 41570159.0, "reward": 0.0, "reward_std": 0.3259367048740387, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.5080004930496216, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 766.6875, "completions/mean_terminated_length": 707.3077392578125, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 4.327160493827161, "grad_norm": 1.3185250022760098, "kl": 0.240966796875, "learning_rate": 2.3593546327313364e-08, "loss": -0.0079, "num_tokens": 41601021.0, "reward": -5.587935447692871e-09, "reward_std": 0.23924091458320618, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -7.450580596923828e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 719.9375, "completions/mean_terminated_length": 663.629638671875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 4.330246913580247, "grad_norm": 0.0095477643266165, "kl": 0.238525390625, "learning_rate": 2.338198781086842e-08, "loss": 0.0002, "num_tokens": 41630547.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 761.78125, "completions/mean_terminated_length": 701.269287109375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 4.333333333333333, "grad_norm": 0.7769882393509312, "kl": 0.25830078125, "learning_rate": 2.317133553114525e-08, "loss": -0.0128, "num_tokens": 41661440.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 758.75, "completions/mean_terminated_length": 709.629638671875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.33641975308642, "grad_norm": 1.4821057581030834, "kl": 0.2342529296875, "learning_rate": 2.2961590330534298e-08, "loss": -0.0107, "num_tokens": 41692224.0, "reward": 0.0, "reward_std": 0.22113531827926636, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 766.1875, "completions/mean_terminated_length": 718.4444580078125, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 4.339506172839506, "grad_norm": 1.3278965900616837, "kl": 0.267822265625, "learning_rate": 2.2752753047798502e-08, "loss": 0.0647, "num_tokens": 41722846.0, "reward": 0.0, "reward_std": 0.15451930463314056, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 744.8125, "completions/mean_terminated_length": 704.9285888671875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 4.342592592592593, "grad_norm": 0.7086368539435858, "kl": 0.2275390625, "learning_rate": 2.2544824518070104e-08, "loss": -0.0067, "num_tokens": 41752820.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 733.90625, "completions/mean_terminated_length": 692.4642944335938, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 4.345679012345679, "grad_norm": 1.1133817788663498, "kl": 0.2344970703125, "learning_rate": 2.2337805572847425e-08, "loss": -0.0251, "num_tokens": 41782657.0, "reward": 0.0, "reward_std": 0.17047426104545593, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 704.8125, "completions/mean_terminated_length": 683.5333862304688, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 4.348765432098766, "grad_norm": 0.6683540846668884, "kl": 0.2530517578125, "learning_rate": 2.2131697039991127e-08, "loss": -0.0125, "num_tokens": 41811603.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 725.28125, "completions/mean_terminated_length": 682.607177734375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 4.351851851851852, "grad_norm": 1.2166585199496174, "kl": 0.2332763671875, "learning_rate": 2.1926499743721405e-08, "loss": -0.0369, "num_tokens": 41841168.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 829.5, "completions/mean_terminated_length": 753.3912963867188, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 4.354938271604938, "grad_norm": 2.2801346107215195, "kl": 0.28369140625, "learning_rate": 2.1722214504614313e-08, "loss": 0.1175, "num_tokens": 41874748.0, "reward": 0.0, "reward_std": 0.1789824664592743, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 714.53125, "completions/mean_terminated_length": 670.3214721679688, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.3580246913580245, "grad_norm": 0.803676450488695, "kl": 0.2576904296875, "learning_rate": 2.1518842139598674e-08, "loss": 0.0087, "num_tokens": 41903517.0, "reward": 0.0, "reward_std": 0.13877099752426147, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 759.53125, "completions/mean_terminated_length": 685.47998046875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.361111111111111, "grad_norm": 0.019441176387390194, "kl": 0.2454833984375, "learning_rate": 2.1316383461952804e-08, "loss": 0.0002, "num_tokens": 41934410.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 748.90625, "completions/mean_terminated_length": 697.9629516601562, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.364197530864198, "grad_norm": 1.1760925156406545, "kl": 0.2637939453125, "learning_rate": 2.1114839281301143e-08, "loss": 0.0219, "num_tokens": 41965215.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 708.5625, "completions/mean_terminated_length": 675.9310302734375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.367283950617284, "grad_norm": 0.8958648811218094, "kl": 0.3037109375, "learning_rate": 2.0914210403611132e-08, "loss": 0.0069, "num_tokens": 41994477.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 741.4375, "completions/mean_terminated_length": 676.2307739257812, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.37037037037037, "grad_norm": 1.2453417538583529, "kl": 0.2357177734375, "learning_rate": 2.071449763118993e-08, "loss": 0.0133, "num_tokens": 42024863.0, "reward": -3.725290298461914e-09, "reward_std": 0.1714201271533966, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 758.875, "completions/mean_terminated_length": 638.3636474609375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.3734567901234565, "grad_norm": 0.8876004902813407, "kl": 0.281982421875, "learning_rate": 2.0515701762681304e-08, "loss": -0.0327, "num_tokens": 42055639.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 783.625, "completions/mean_terminated_length": 689.5652465820312, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 4.3765432098765435, "grad_norm": 0.6408308831338301, "kl": 0.2276611328125, "learning_rate": 2.0317823593062165e-08, "loss": 0.0097, "num_tokens": 42087427.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 774.84375, "completions/mean_terminated_length": 717.34619140625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 4.37962962962963, "grad_norm": 1.798583489884588, "kl": 0.254150390625, "learning_rate": 2.0120863913639874e-08, "loss": -0.006, "num_tokens": 42118910.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 804.15625, "completions/mean_terminated_length": 753.423095703125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 4.382716049382716, "grad_norm": 0.016142977629844284, "kl": 0.2379150390625, "learning_rate": 1.9924823512048438e-08, "loss": 0.0002, "num_tokens": 42151619.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 780.46875, "completions/mean_terminated_length": 724.269287109375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 4.385802469135802, "grad_norm": 1.1036477575337424, "kl": 0.2420654296875, "learning_rate": 1.972970317224601e-08, "loss": 0.0052, "num_tokens": 42183622.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 769.5625, "completions/mean_terminated_length": 710.84619140625, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 4.388888888888889, "grad_norm": 1.716196947447889, "kl": 0.2664794921875, "learning_rate": 1.9535503674511263e-08, "loss": -0.0756, "num_tokens": 42214680.0, "reward": -1.862645149230957e-09, "reward_std": 0.23742079734802246, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096293926239, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 696.9375, "completions/mean_terminated_length": 675.1333618164062, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 4.3919753086419755, "grad_norm": 0.8053514278849082, "kl": 0.2352294921875, "learning_rate": 1.934222579544059e-08, "loss": 0.0094, "num_tokens": 42243374.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 703.3125, "completions/mean_terminated_length": 681.933349609375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.395061728395062, "grad_norm": 1.5432287425828701, "kl": 0.25634765625, "learning_rate": 1.9149870307944765e-08, "loss": 0.0119, "num_tokens": 42272100.0, "reward": 0.0, "reward_std": 0.1917421817779541, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 757.40625, "completions/mean_terminated_length": 719.3214721679688, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.398148148148148, "grad_norm": 0.007119455157275028, "kl": 0.214599609375, "learning_rate": 1.895843798124605e-08, "loss": 0.0002, "num_tokens": 42303153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 740.53125, "completions/mean_terminated_length": 700.0357666015625, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.401234567901234, "grad_norm": 1.3647928972555223, "kl": 0.2242431640625, "learning_rate": 1.8767929580874863e-08, "loss": -0.01, "num_tokens": 42333558.0, "reward": 0.0, "reward_std": 0.21369892358779907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 749.40625, "completions/mean_terminated_length": 686.0385131835938, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 4.404320987654321, "grad_norm": 1.0090806499525684, "kl": 0.2540283203125, "learning_rate": 1.8578345868666996e-08, "loss": 0.0519, "num_tokens": 42363883.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 719.53125, "completions/mean_terminated_length": 663.1481323242188, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 4.407407407407407, "grad_norm": 1.1578184340119995, "kl": 0.248046875, "learning_rate": 1.8389687602760495e-08, "loss": -0.0371, "num_tokens": 42393452.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 653.1111450195312, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.410493827160494, "grad_norm": 1.2281974268453777, "kl": 0.26708984375, "learning_rate": 1.820195553759246e-08, "loss": 0.0004, "num_tokens": 42423002.0, "reward": 0.0, "reward_std": 0.12636974453926086, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 696.8125, "completions/mean_terminated_length": 662.9655151367188, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 4.41358024691358, "grad_norm": 1.0386678021098472, "kl": 0.2296142578125, "learning_rate": 1.8015150423896203e-08, "loss": 0.0068, "num_tokens": 42451684.0, "reward": -3.725290298461914e-09, "reward_std": 0.13909262418746948, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 751.9375, "completions/mean_terminated_length": 713.0714721679688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 4.416666666666667, "grad_norm": 1.7160511739689148, "kl": 0.260498046875, "learning_rate": 1.782927300869827e-08, "loss": 0.0207, "num_tokens": 42482794.0, "reward": -3.725290298461914e-09, "reward_std": 0.2251279354095459, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 723.6875, "completions/mean_terminated_length": 692.6206665039062, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 4.419753086419753, "grad_norm": 0.009155227082101668, "kl": 0.2347412109375, "learning_rate": 1.7644324035315212e-08, "loss": 0.0002, "num_tokens": 42512348.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 764.96875, "completions/mean_terminated_length": 717.0, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.422839506172839, "grad_norm": 0.7984414048470345, "kl": 0.2386474609375, "learning_rate": 1.746030424335093e-08, "loss": 0.0002, "num_tokens": 42543131.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 735.5625, "completions/mean_terminated_length": 682.1481323242188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 4.425925925925926, "grad_norm": 1.7422509028954587, "kl": 0.248291015625, "learning_rate": 1.7277214368693423e-08, "loss": -0.0442, "num_tokens": 42572933.0, "reward": 0.0, "reward_std": 0.22261665761470795, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 816.96875, "completions/mean_terminated_length": 735.95654296875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.429012345679013, "grad_norm": 1.2367044502059985, "kl": 0.199951171875, "learning_rate": 1.7095055143512117e-08, "loss": 0.011, "num_tokens": 42606156.0, "reward": 0.0, "reward_std": 0.15895044803619385, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1435 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 750.15625, "completions/mean_terminated_length": 686.9615478515625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 4.432098765432099, "grad_norm": 1.432721740278881, "kl": NaN, "learning_rate": 1.6913827296254736e-08, "loss": 0.0051, "num_tokens": 42636689.0, "reward": 0.0, "reward_std": 0.21788108348846436, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 772.28125, "completions/mean_terminated_length": 688.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 4.435185185185185, "grad_norm": 1.2139756708776193, "kl": 0.28173828125, "learning_rate": 1.6733531551644503e-08, "loss": -0.0349, "num_tokens": 42668014.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 702.78125, "completions/mean_terminated_length": 702.78125, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 4.438271604938271, "grad_norm": 1.286333529259584, "kl": 0.2662353515625, "learning_rate": 1.655416863067713e-08, "loss": -0.0028, "num_tokens": 42696635.0, "reward": 0.0, "reward_std": 0.12554804980754852, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 724.0, "completions/mean_terminated_length": 654.7692260742188, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 4.441358024691358, "grad_norm": 1.1355877171411868, "kl": 0.26513671875, "learning_rate": 1.637573925061808e-08, "loss": -0.062, "num_tokens": 42726715.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 806.0625, "completions/mean_terminated_length": 733.4166870117188, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 4.444444444444445, "grad_norm": 0.7168739194832622, "kl": 0.2152099609375, "learning_rate": 1.6198244124999592e-08, "loss": 0.0137, "num_tokens": 42759561.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 708.875, "completions/mean_terminated_length": 687.86669921875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.447530864197531, "grad_norm": 1.704715348627811, "kl": 0.251220703125, "learning_rate": 1.6021683963617805e-08, "loss": -0.1778, "num_tokens": 42788609.0, "reward": -1.3969838619232178e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 659.03125, "completions/mean_terminated_length": 634.7000122070312, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 4.450617283950617, "grad_norm": 0.9700046429366972, "kl": 0.256591796875, "learning_rate": 1.5846059472530122e-08, "loss": 0.0444, "num_tokens": 42815726.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 716.53125, "completions/mean_terminated_length": 672.607177734375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.453703703703704, "grad_norm": 1.3777724970846128, "kl": 0.2747802734375, "learning_rate": 1.5671371354051997e-08, "loss": 0.0073, "num_tokens": 42845727.0, "reward": 3.725290298461914e-09, "reward_std": 0.17593824863433838, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 774.3125, "completions/mean_terminated_length": 691.0833740234375, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 4.45679012345679, "grad_norm": 1.4633514790213318, "kl": 0.2613525390625, "learning_rate": 1.5497620306754582e-08, "loss": -0.0771, "num_tokens": 42876685.0, "reward": -1.862645149230957e-09, "reward_std": 0.1540786176919937, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 732.59375, "completions/mean_terminated_length": 665.34619140625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 4.459876543209877, "grad_norm": 1.1255900694054242, "kl": 0.2542724609375, "learning_rate": 1.5324807025461656e-08, "loss": -0.0055, "num_tokens": 42906312.0, "reward": 0.0, "reward_std": 0.12971608340740204, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 712.5625, "completions/mean_terminated_length": 680.3448486328125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 4.462962962962963, "grad_norm": 1.1792254002084794, "kl": 0.2718505859375, "learning_rate": 1.515293220124683e-08, "loss": 0.0162, "num_tokens": 42935194.0, "reward": 0.0, "reward_std": 0.19811780750751495, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 708.9375, "completions/mean_terminated_length": 650.5925903320312, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.466049382716049, "grad_norm": 1.5162448328253944, "kl": 0.269287109375, "learning_rate": 1.498199652143092e-08, "loss": -0.0355, "num_tokens": 42964260.0, "reward": -3.725290298461914e-09, "reward_std": 0.2714763879776001, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -5.587935447692871e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 765.5, "completions/mean_terminated_length": 693.1199951171875, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 4.469135802469136, "grad_norm": 0.5510125170453003, "kl": 0.2532958984375, "learning_rate": 1.4812000669579188e-08, "loss": 0.0022, "num_tokens": 42995064.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 717.3125, "completions/mean_terminated_length": 685.586181640625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 4.472222222222222, "grad_norm": 0.9754741669344269, "kl": 0.2490234375, "learning_rate": 1.4642945325498507e-08, "loss": -0.0105, "num_tokens": 43024414.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 779.78125, "completions/mean_terminated_length": 698.375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.4753086419753085, "grad_norm": 1.1351133490202836, "kl": 0.2525634765625, "learning_rate": 1.4474831165234707e-08, "loss": -0.0089, "num_tokens": 43056151.0, "reward": 0.0, "reward_std": 0.13225091993808746, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 708.0625, "completions/mean_terminated_length": 675.3793334960938, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 4.478395061728395, "grad_norm": 1.11370904437112, "kl": 0.2401123046875, "learning_rate": 1.4307658861069799e-08, "loss": 0.0537, "num_tokens": 43085493.0, "reward": 2.7939677238464355e-09, "reward_std": 0.15649469196796417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 722.0625, "completions/mean_terminated_length": 690.8275756835938, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 4.481481481481482, "grad_norm": 1.0077098916504739, "kl": 0.235595703125, "learning_rate": 1.414142908151944e-08, "loss": 0.0711, "num_tokens": 43114755.0, "reward": 0.0, "reward_std": 0.1258188784122467, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 799.875, "completions/mean_terminated_length": 737.1199951171875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 4.484567901234568, "grad_norm": 1.7503621536411276, "kl": 0.2568359375, "learning_rate": 1.3976142491330111e-08, "loss": -0.0195, "num_tokens": 43147339.0, "reward": -1.862645149230957e-09, "reward_std": 0.15790243446826935, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 761.28125, "completions/mean_terminated_length": 700.6538696289062, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 4.487654320987654, "grad_norm": 0.854007605780328, "kl": 0.268310546875, "learning_rate": 1.3811799751476588e-08, "loss": -0.0167, "num_tokens": 43178396.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 778.0, "completions/mean_terminated_length": 732.4444580078125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.4907407407407405, "grad_norm": 1.2822914755192405, "kl": 0.242919921875, "learning_rate": 1.3648401519159109e-08, "loss": 0.0596, "num_tokens": 43209752.0, "reward": 1.862645149230957e-09, "reward_std": 0.15858054161071777, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 761.90625, "completions/mean_terminated_length": 724.4642944335938, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 4.493827160493828, "grad_norm": 3.16452500563885, "kl": 0.2330322265625, "learning_rate": 1.348594844780096e-08, "loss": -0.2042, "num_tokens": 43240645.0, "reward": 0.0, "reward_std": 0.20749691128730774, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 751.46875, "completions/mean_terminated_length": 675.1599731445312, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 4.496913580246914, "grad_norm": 0.697725287440438, "kl": 0.22607421875, "learning_rate": 1.332444118704576e-08, "loss": -0.0031, "num_tokens": 43271192.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 825.9375, "completions/mean_terminated_length": 735.9091186523438, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 4.5, "grad_norm": 1.9629974354596997, "kl": 0.257568359375, "learning_rate": 1.3163880382754761e-08, "loss": -0.0031, "num_tokens": 43305026.0, "reward": 3.725290298461914e-09, "reward_std": 0.19411087036132812, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 717.71875, "completions/mean_terminated_length": 673.9642944335938, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 4.503086419753086, "grad_norm": 0.5816736542401625, "kl": 0.2447509765625, "learning_rate": 1.3004266677004522e-08, "loss": 0.0211, "num_tokens": 43334657.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 714.59375, "completions/mean_terminated_length": 682.586181640625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.506172839506172, "grad_norm": 1.6013058739639459, "kl": 0.274169921875, "learning_rate": 1.2845600708084076e-08, "loss": -0.0656, "num_tokens": 43363748.0, "reward": 0.02812499925494194, "reward_std": 0.08606424182653427, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 763.6875, "completions/mean_terminated_length": 736.7586059570312, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 4.5092592592592595, "grad_norm": 0.9619915024676163, "kl": 0.225830078125, "learning_rate": 1.2687883110492515e-08, "loss": 0.0052, "num_tokens": 43395030.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 660.09375, "completions/mean_terminated_length": 592.7037353515625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 4.512345679012346, "grad_norm": 0.914771333212492, "kl": 0.2432861328125, "learning_rate": 1.2531114514936491e-08, "loss": -0.0298, "num_tokens": 43422197.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 710.15625, "completions/mean_terminated_length": 689.2333984375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 4.515432098765432, "grad_norm": 0.008567160631542564, "kl": 0.2672119140625, "learning_rate": 1.2375295548327557e-08, "loss": 0.0003, "num_tokens": 43450938.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 773.28125, "completions/mean_terminated_length": 689.7083740234375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 4.518518518518518, "grad_norm": 0.6652972884100276, "kl": 0.234375, "learning_rate": 1.222042683377983e-08, "loss": 0.0102, "num_tokens": 43481959.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 747.5, "completions/mean_terminated_length": 670.0799560546875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.521604938271605, "grad_norm": 1.3273276748538272, "kl": 0.213134765625, "learning_rate": 1.2066508990607293e-08, "loss": -0.0103, "num_tokens": 43512859.0, "reward": -3.725290298461914e-09, "reward_std": 0.17703334987163544, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 782.53125, "completions/mean_terminated_length": 726.8077392578125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.5246913580246915, "grad_norm": 0.9467423673398551, "kl": 0.244384765625, "learning_rate": 1.1913542634321538e-08, "loss": -0.0211, "num_tokens": 43544524.0, "reward": 0.0, "reward_std": 0.12943127751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 769.0, "completions/mean_terminated_length": 710.1538696289062, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 4.527777777777778, "grad_norm": 0.6749859640093453, "kl": 0.2437744140625, "learning_rate": 1.1761528376629137e-08, "loss": -0.0008, "num_tokens": 43575480.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 741.75, "completions/mean_terminated_length": 701.4285888671875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 4.530864197530864, "grad_norm": 0.008226298242311067, "kl": 0.2564697265625, "learning_rate": 1.1610466825429182e-08, "loss": 0.0003, "num_tokens": 43605616.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 773.625, "completions/mean_terminated_length": 703.5199584960938, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 4.533950617283951, "grad_norm": 1.3454237770622883, "kl": 0.2467041015625, "learning_rate": 1.1460358584811091e-08, "loss": -0.0142, "num_tokens": 43637272.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 793.59375, "completions/mean_terminated_length": 740.423095703125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.537037037037037, "grad_norm": 1.2266512251474808, "kl": 0.216796875, "learning_rate": 1.1311204255051942e-08, "loss": -0.0371, "num_tokens": 43668955.0, "reward": 1.862645149230957e-09, "reward_std": 0.15522989630699158, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 765.4375, "completions/mean_terminated_length": 705.769287109375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 4.540123456790123, "grad_norm": 1.7762198394012427, "kl": 0.2493896484375, "learning_rate": 1.116300443261417e-08, "loss": -0.0782, "num_tokens": 43700409.0, "reward": -3.725290298461914e-09, "reward_std": 0.15190282464027405, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 736.6875, "completions/mean_terminated_length": 683.4815063476562, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 4.54320987654321, "grad_norm": 0.01160143827927758, "kl": 0.2462158203125, "learning_rate": 1.1015759710143124e-08, "loss": 0.0002, "num_tokens": 43730175.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 735.0625, "completions/mean_terminated_length": 693.7857666015625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.546296296296296, "grad_norm": 1.1958380704020215, "kl": 0.258056640625, "learning_rate": 1.0869470676464848e-08, "loss": 0.0092, "num_tokens": 43759785.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 784.21875, "completions/mean_terminated_length": 690.3912963867188, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.549382716049383, "grad_norm": 0.008103284846342927, "kl": 0.241455078125, "learning_rate": 1.0724137916583525e-08, "loss": 0.0002, "num_tokens": 43791444.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 847.625, "completions/mean_terminated_length": 767.45458984375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 4.552469135802469, "grad_norm": 1.0452344389501882, "kl": 0.244384765625, "learning_rate": 1.0579762011679317e-08, "loss": 0.0184, "num_tokens": 43825792.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 750.875, "completions/mean_terminated_length": 700.2963256835938, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 4.555555555555555, "grad_norm": 1.623900247495174, "kl": 0.250244140625, "learning_rate": 1.0436343539105857e-08, "loss": -0.0896, "num_tokens": 43856188.0, "reward": -7.450580596923828e-09, "reward_std": 0.25143033266067505, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 820.4375, "completions/mean_terminated_length": 763.4400024414062, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 4.5586419753086425, "grad_norm": 1.1065812722542763, "kl": 0.262451171875, "learning_rate": 1.0293883072388154e-08, "loss": 0.0049, "num_tokens": 43888986.0, "reward": 0.0, "reward_std": 0.14708144962787628, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 703.96875, "completions/mean_terminated_length": 644.7037353515625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 4.561728395061729, "grad_norm": 1.5148359069841493, "kl": 0.2587890625, "learning_rate": 1.015238118122011e-08, "loss": -0.0587, "num_tokens": 43917749.0, "reward": -1.862645149230957e-09, "reward_std": 0.16143634915351868, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 826.78125, "completions/mean_terminated_length": 771.5599975585938, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 4.564814814814815, "grad_norm": 0.7943240804840882, "kl": 0.217529296875, "learning_rate": 1.0011838431462389e-08, "loss": -0.0164, "num_tokens": 43950926.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 754.25, "completions/mean_terminated_length": 692.0, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 4.567901234567901, "grad_norm": 0.9560197383091844, "kl": 0.2296142578125, "learning_rate": 9.872255385140027e-09, "loss": -0.014, "num_tokens": 43981338.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 756.15625, "completions/mean_terminated_length": 666.875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 4.570987654320987, "grad_norm": 0.9141392876347829, "kl": 0.24658203125, "learning_rate": 9.733632600440245e-09, "loss": 0.0028, "num_tokens": 44011979.0, "reward": 0.02812499739229679, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 780.21875, "completions/mean_terminated_length": 711.9599609375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 4.574074074074074, "grad_norm": 1.618630101173056, "kl": 0.23486328125, "learning_rate": 9.595970631710248e-09, "loss": 0.0286, "num_tokens": 44043642.0, "reward": -7.450580596923828e-09, "reward_std": 0.2312345653772354, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 656.90625, "completions/mean_terminated_length": 632.433349609375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 4.577160493827161, "grad_norm": 1.6753239581135873, "kl": 0.28173828125, "learning_rate": 9.459270029454986e-09, "loss": -0.0866, "num_tokens": 44070927.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 749.75, "completions/mean_terminated_length": 698.9629516601562, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 4.580246913580247, "grad_norm": 1.3529637418083982, "kl": 0.2352294921875, "learning_rate": 9.323531340334868e-09, "loss": -0.0959, "num_tokens": 44101015.0, "reward": -3.725290298461914e-09, "reward_std": 0.2225850522518158, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 763.46875, "completions/mean_terminated_length": 715.2222290039062, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 4.583333333333333, "grad_norm": 1.2894571737097351, "kl": 0.27978515625, "learning_rate": 9.188755107163743e-09, "loss": -0.0067, "num_tokens": 44131950.0, "reward": 0.0, "reward_std": 0.15898141264915466, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 769.21875, "completions/mean_terminated_length": 697.8800048828125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 4.58641975308642, "grad_norm": 2.0465573166944764, "kl": 0.2640380859375, "learning_rate": 9.054941868906513e-09, "loss": 0.0193, "num_tokens": 44163125.0, "reward": 9.313225746154785e-10, "reward_std": 0.19464506208896637, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 819.25, "completions/mean_terminated_length": 772.0, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 4.589506172839506, "grad_norm": 0.6598867747085388, "kl": 0.2125244140625, "learning_rate": 8.922092160677242e-09, "loss": 0.0094, "num_tokens": 44196009.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 732.125, "completions/mean_terminated_length": 678.0740966796875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.592592592592593, "grad_norm": 1.0613875861344555, "kl": 0.248779296875, "learning_rate": 8.79020651373677e-09, "loss": -0.0235, "num_tokens": 44225837.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 811.125, "completions/mean_terminated_length": 740.1666870117188, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 4.595679012345679, "grad_norm": 1.0975005863045624, "kl": 0.2247314453125, "learning_rate": 8.659285455490745e-09, "loss": 0.0161, "num_tokens": 44258869.0, "reward": -2.7939677238464355e-09, "reward_std": 0.12777692079544067, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 836.75, "completions/mean_terminated_length": 784.3200073242188, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 4.598765432098766, "grad_norm": 1.1861886423089967, "kl": 0.2210693359375, "learning_rate": 8.529329509487455e-09, "loss": -0.0573, "num_tokens": 44292277.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 745.03125, "completions/mean_terminated_length": 716.1724243164062, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.601851851851852, "grad_norm": 0.7326756903585625, "kl": 0.25634765625, "learning_rate": 8.400339195415718e-09, "loss": 0.0043, "num_tokens": 44322610.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 774.5625, "completions/mean_terminated_length": 704.719970703125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.604938271604938, "grad_norm": 1.3326776015722304, "kl": 0.2071533203125, "learning_rate": 8.272315029102888e-09, "loss": 0.0309, "num_tokens": 44354236.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 715.5625, "completions/mean_terminated_length": 683.6551513671875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 4.6080246913580245, "grad_norm": 0.7337581308267972, "kl": 0.263427734375, "learning_rate": 8.145257522512606e-09, "loss": -0.0121, "num_tokens": 44383478.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 740.6875, "completions/mean_terminated_length": 675.3077392578125, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 4.611111111111111, "grad_norm": 1.4010798081409779, "kl": 0.2664794921875, "learning_rate": 8.019167183743041e-09, "loss": -0.0769, "num_tokens": 44413624.0, "reward": 1.862645149230957e-09, "reward_std": 0.23660650849342346, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 649.3793334960938, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 4.614197530864198, "grad_norm": 1.336641774866964, "kl": 0.26806640625, "learning_rate": 7.89404451702455e-09, "loss": -0.0713, "num_tokens": 44441684.0, "reward": 0.028124995529651642, "reward_std": 0.09217105805873871, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 705.71875, "completions/mean_terminated_length": 672.7930908203125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.617283950617284, "grad_norm": 1.043543092730768, "kl": 0.27276611328125, "learning_rate": 7.769890022717884e-09, "loss": 0.0438, "num_tokens": 44470839.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 763.625, "completions/mean_terminated_length": 715.4074096679688, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.62037037037037, "grad_norm": 1.4209127364921927, "kl": 0.21923828125, "learning_rate": 7.646704197312143e-09, "loss": 0.0321, "num_tokens": 44502179.0, "reward": 0.0, "reward_std": 0.14213800430297852, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 724.28125, "completions/mean_terminated_length": 693.27587890625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 4.6234567901234565, "grad_norm": 0.5440669829807854, "kl": 0.2264404296875, "learning_rate": 7.524487533422635e-09, "loss": 0.0291, "num_tokens": 44532124.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 705.59375, "completions/mean_terminated_length": 672.6551513671875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.6265432098765435, "grad_norm": 1.8505941992881312, "kl": 0.24951171875, "learning_rate": 7.403240519789161e-09, "loss": 0.0937, "num_tokens": 44561379.0, "reward": 9.313225746154785e-10, "reward_std": 0.17386360466480255, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 716.40625, "completions/mean_terminated_length": 706.4838256835938, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 4.62962962962963, "grad_norm": 1.0809505779188693, "kl": 0.2236328125, "learning_rate": 7.282963641273842e-09, "loss": 0.0114, "num_tokens": 44590408.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 777.1875, "completions/mean_terminated_length": 694.9166870117188, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 4.632716049382716, "grad_norm": 1.2386119792959611, "kl": 0.2620849609375, "learning_rate": 7.163657378859267e-09, "loss": -0.0661, "num_tokens": 44622134.0, "reward": 0.0, "reward_std": 0.14349564909934998, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 762.09375, "completions/mean_terminated_length": 713.5925903320312, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 4.635802469135802, "grad_norm": 1.603004335218694, "kl": 0.2332763671875, "learning_rate": 7.045322209646654e-09, "loss": 0.0418, "num_tokens": 44652609.0, "reward": -1.862645149230957e-09, "reward_std": 0.15408048033714294, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 681.78125, "completions/mean_terminated_length": 670.741943359375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 4.638888888888889, "grad_norm": 1.3373995959380525, "kl": 0.25177001953125, "learning_rate": 6.927958606853746e-09, "loss": -0.0089, "num_tokens": 44680562.0, "reward": 0.0, "reward_std": 0.159096360206604, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 687.5, "completions/mean_terminated_length": 676.6451416015625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 4.6419753086419755, "grad_norm": 1.2645721070428075, "kl": 0.2462158203125, "learning_rate": 6.811567039813087e-09, "loss": 0.0361, "num_tokens": 44708718.0, "reward": 0.0, "reward_std": 0.12781395018100739, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 759.75, "completions/mean_terminated_length": 722.0000610351562, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.645061728395062, "grad_norm": 0.7789487740363542, "kl": 0.2628173828125, "learning_rate": 6.696147973970112e-09, "loss": -0.0186, "num_tokens": 44739126.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 757.5625, "completions/mean_terminated_length": 653.3043823242188, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 4.648148148148148, "grad_norm": 1.4287106138991188, "kl": 0.2666015625, "learning_rate": 6.581701870881196e-09, "loss": 0.0116, "num_tokens": 44769684.0, "reward": 0.0, "reward_std": 0.16816477477550507, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 697.46875, "completions/mean_terminated_length": 637.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 4.651234567901234, "grad_norm": 0.01064046985645947, "kl": 0.22314453125, "learning_rate": 6.4682291882119375e-09, "loss": 0.0002, "num_tokens": 44798619.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 749.15625, "completions/mean_terminated_length": 685.7307739257812, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 4.654320987654321, "grad_norm": 1.4356134177204276, "kl": 0.2442626953125, "learning_rate": 6.355730379735219e-09, "loss": -0.0287, "num_tokens": 44828888.0, "reward": 3.725290298461914e-09, "reward_std": 0.1577756106853485, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 747.5, "completions/mean_terminated_length": 708.0000610351562, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.657407407407407, "grad_norm": 1.2655852724598151, "kl": 0.2147216796875, "learning_rate": 6.244205895329452e-09, "loss": -0.0887, "num_tokens": 44859128.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 757.15625, "completions/mean_terminated_length": 682.4400024414062, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.660493827160494, "grad_norm": 0.007769112039093948, "kl": 0.218505859375, "learning_rate": 6.133656180976776e-09, "loss": 0.0002, "num_tokens": 44889905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 753.375, "completions/mean_terminated_length": 663.1666870117188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.66358024691358, "grad_norm": 1.7984021331093254, "kl": 0.27685546875, "learning_rate": 6.024081678761228e-09, "loss": -0.0534, "num_tokens": 44920517.0, "reward": 0.0, "reward_std": 0.21526148915290833, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 7.450580596923828e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 741.9375, "completions/mean_terminated_length": 701.6428833007812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 4.666666666666667, "grad_norm": 0.6358532654376484, "kl": 0.2335205078125, "learning_rate": 5.915482826867047e-09, "loss": -0.0241, "num_tokens": 44951011.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 692.25, "completions/mean_terminated_length": 630.8148193359375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 4.669753086419753, "grad_norm": 0.7904306048798535, "kl": 0.259521484375, "learning_rate": 5.807860059576841e-09, "loss": 0.0035, "num_tokens": 44979279.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 734.59375, "completions/mean_terminated_length": 681.0, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 4.672839506172839, "grad_norm": 0.9593295827412627, "kl": 0.2333984375, "learning_rate": 5.701213807269956e-09, "loss": 0.037, "num_tokens": 45009030.0, "reward": 3.725290298461914e-09, "reward_std": 0.15900850296020508, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 749.53125, "completions/mean_terminated_length": 658.0416870117188, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 4.675925925925926, "grad_norm": 0.8617107703273437, "kl": 0.249267578125, "learning_rate": 5.5955444964206345e-09, "loss": -0.0191, "num_tokens": 45039079.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 776.40625, "completions/mean_terminated_length": 693.875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 4.679012345679013, "grad_norm": 0.8997391467082752, "kl": 0.2471923828125, "learning_rate": 5.490852549596387e-09, "loss": 0.0058, "num_tokens": 45070608.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 719.75, "completions/mean_terminated_length": 676.2857666015625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 4.682098765432099, "grad_norm": 0.01289473649206381, "kl": 0.2464599609375, "learning_rate": 5.387138385456319e-09, "loss": 0.0002, "num_tokens": 45099740.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 801.46875, "completions/mean_terminated_length": 700.3181762695312, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 4.685185185185185, "grad_norm": 1.4600700216093516, "kl": 0.2261962890625, "learning_rate": 5.284402418749362e-09, "loss": -0.0393, "num_tokens": 45132163.0, "reward": 0.0, "reward_std": 0.141945943236351, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1518 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 752.28125, "completions/mean_terminated_length": 734.1666870117188, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 4.688271604938271, "grad_norm": 1.2493370767929117, "kl": NaN, "learning_rate": 5.182645060312685e-09, "loss": 0.0287, "num_tokens": 45162660.0, "reward": -3.725290298461914e-09, "reward_std": 0.15481436252593994, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 687.875, "completions/mean_terminated_length": 639.857177734375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 4.6913580246913575, "grad_norm": 1.0526833986952737, "kl": 0.2335205078125, "learning_rate": 5.081866717070088e-09, "loss": 0.0252, "num_tokens": 45191200.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 746.46875, "completions/mean_terminated_length": 668.760009765625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.694444444444445, "grad_norm": 0.8318476170545551, "kl": 0.2430419921875, "learning_rate": 4.9820677920302534e-09, "loss": -0.0205, "num_tokens": 45221611.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 710.28125, "completions/mean_terminated_length": 677.8275756835938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 4.697530864197531, "grad_norm": 3.9223276908849445, "kl": 0.2493896484375, "learning_rate": 4.883248684285302e-09, "loss": -0.2162, "num_tokens": 45250884.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 779.78125, "completions/mean_terminated_length": 744.8928833007812, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.700617283950617, "grad_norm": 0.7614772196649142, "kl": 0.2451171875, "learning_rate": 4.785409789008988e-09, "loss": -0.0019, "num_tokens": 45282753.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 679.65625, "completions/mean_terminated_length": 630.4642944335938, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 4.703703703703704, "grad_norm": 0.5811976227136649, "kl": 0.2579345703125, "learning_rate": 4.68855149745534e-09, "loss": -0.0108, "num_tokens": 45310642.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 769.4375, "completions/mean_terminated_length": 710.6923217773438, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 4.70679012345679, "grad_norm": 0.9731395835926854, "kl": 0.254638671875, "learning_rate": 4.592674196956914e-09, "loss": 0.0056, "num_tokens": 45341792.0, "reward": 0.0, "reward_std": 0.1462501883506775, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 771.25, "completions/mean_terminated_length": 724.4444580078125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.709876543209877, "grad_norm": 2.1363373228569023, "kl": 0.2135009765625, "learning_rate": 4.497778270923374e-09, "loss": -0.1513, "num_tokens": 45373044.0, "reward": 0.0, "reward_std": 0.14937826991081238, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 792.53125, "completions/mean_terminated_length": 653.6500244140625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.712962962962963, "grad_norm": 0.012083313832523389, "kl": 0.2393798828125, "learning_rate": 4.403864098839833e-09, "loss": 0.0002, "num_tokens": 45404565.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 735.84375, "completions/mean_terminated_length": 655.1599731445312, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 4.716049382716049, "grad_norm": 0.009532706765631062, "kl": 0.2425537109375, "learning_rate": 4.31093205626551e-09, "loss": 0.0002, "num_tokens": 45434516.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 784.9375, "completions/mean_terminated_length": 691.3912963867188, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 4.719135802469136, "grad_norm": 0.6964502391952809, "kl": 0.2406005859375, "learning_rate": 4.218982514832048e-09, "loss": -0.0033, "num_tokens": 45466806.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 800.21875, "completions/mean_terminated_length": 768.2500610351562, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 4.722222222222222, "grad_norm": 3.609378764365454, "kl": 0.2320556640625, "learning_rate": 4.128015842242122e-09, "loss": -0.1872, "num_tokens": 45499197.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 752.75, "completions/mean_terminated_length": 662.3333740234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.7253086419753085, "grad_norm": 2.032506028024498, "kl": 0.2476806640625, "learning_rate": 4.0380324022679935e-09, "loss": -0.1037, "num_tokens": 45529729.0, "reward": 3.725290298461914e-09, "reward_std": 0.2004832774400711, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 807.53125, "completions/mean_terminated_length": 735.375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 4.728395061728395, "grad_norm": 1.5668059653165416, "kl": 0.2254638671875, "learning_rate": 3.9490325547499316e-09, "loss": 0.1095, "num_tokens": 45562542.0, "reward": -1.862645149230957e-09, "reward_std": 0.17221274971961975, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592105805873871, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 819.59375, "completions/mean_terminated_length": 679.7368774414062, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.731481481481482, "grad_norm": 1.5434780132034096, "kl": 0.275146484375, "learning_rate": 3.861016655594962e-09, "loss": -0.0888, "num_tokens": 45595733.0, "reward": 3.725290298461914e-09, "reward_std": 0.17549976706504822, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 751.53125, "completions/mean_terminated_length": 701.0740966796875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.734567901234568, "grad_norm": 0.6816593991016254, "kl": 0.279541015625, "learning_rate": 3.773985056775258e-09, "loss": 0.0017, "num_tokens": 45626302.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 786.9375, "completions/mean_terminated_length": 753.0714721679688, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 4.737654320987654, "grad_norm": 0.5020631044292254, "kl": 0.2392578125, "learning_rate": 3.68793810632681e-09, "loss": 0.0247, "num_tokens": 45657840.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1535 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 810.6875, "completions/mean_terminated_length": 750.9599609375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 4.7407407407407405, "grad_norm": 0.6664601360080478, "kl": NaN, "learning_rate": 3.602876148348116e-09, "loss": 0.023, "num_tokens": 45690882.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 822.15625, "completions/mean_terminated_length": 730.4091186523438, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 4.743827160493828, "grad_norm": 1.017677491573792, "kl": 0.242431640625, "learning_rate": 3.518799522998661e-09, "loss": 0.0212, "num_tokens": 45723983.0, "reward": -3.725290298461914e-09, "reward_std": 0.13733375072479248, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 787.28125, "completions/mean_terminated_length": 732.6538696289062, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.746913580246914, "grad_norm": 0.49859467161351095, "kl": 0.2479248046875, "learning_rate": 3.435708566497608e-09, "loss": 0.021, "num_tokens": 45755436.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 720.875, "completions/mean_terminated_length": 689.5172119140625, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.75, "grad_norm": 1.004742760572375, "kl": 0.2672119140625, "learning_rate": 3.353603611122524e-09, "loss": 0.0252, "num_tokens": 45784804.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 803.90625, "completions/mean_terminated_length": 717.7825927734375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.753086419753086, "grad_norm": 0.5757932819249877, "kl": 0.270751953125, "learning_rate": 3.2724849852079628e-09, "loss": -0.0036, "num_tokens": 45817269.0, "reward": 0.0, "reward_std": 0.09185586124658585, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 742.4375, "completions/mean_terminated_length": 702.2142944335938, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.756172839506172, "grad_norm": 1.8105014997801236, "kl": 0.246337890625, "learning_rate": 3.192353013144189e-09, "loss": 0.0651, "num_tokens": 45847699.0, "reward": -1.862645149230957e-09, "reward_std": 0.18843232095241547, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 754.59375, "completions/mean_terminated_length": 692.423095703125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.7592592592592595, "grad_norm": 1.6201573856275062, "kl": 0.225341796875, "learning_rate": 3.113208015375901e-09, "loss": -0.0074, "num_tokens": 45878718.0, "reward": 0.0, "reward_std": 0.20674464106559753, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 746.78125, "completions/mean_terminated_length": 728.300048828125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.762345679012346, "grad_norm": 1.1305309049887073, "kl": 0.2535400390625, "learning_rate": 3.0350503084008995e-09, "loss": 0.0225, "num_tokens": 45909379.0, "reward": 1.862645149230957e-09, "reward_std": 0.1590670943260193, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 738.34375, "completions/mean_terminated_length": 719.300048828125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 4.765432098765432, "grad_norm": 1.210494311317426, "kl": 0.20025634765625, "learning_rate": 2.957880204768809e-09, "loss": 0.0753, "num_tokens": 45939098.0, "reward": -3.725290298461914e-09, "reward_std": 0.1588881015777588, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 716.6875, "completions/mean_terminated_length": 672.7857666015625, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.768518518518518, "grad_norm": 0.8778223386458659, "kl": 0.2440185546875, "learning_rate": 2.8816980130799418e-09, "loss": 0.0057, "num_tokens": 45968568.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 750.03125, "completions/mean_terminated_length": 686.8077392578125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 4.771604938271605, "grad_norm": 1.5742560739560612, "kl": 0.25, "learning_rate": 2.806504037983992e-09, "loss": -0.0526, "num_tokens": 45999521.0, "reward": 3.725290298461914e-09, "reward_std": 0.1904844343662262, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 760.78125, "completions/mean_terminated_length": 700.0385131835938, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.7746913580246915, "grad_norm": 1.1050697029317205, "kl": 0.2333984375, "learning_rate": 2.7322985801787046e-09, "loss": 0.0259, "num_tokens": 46030090.0, "reward": -9.313225746154785e-10, "reward_std": 0.18398132920265198, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -4.656612873077393e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 796.1875, "completions/mean_terminated_length": 676.857177734375, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 4.777777777777778, "grad_norm": 0.606037516455463, "kl": 0.243408203125, "learning_rate": 2.6590819364088746e-09, "loss": 0.0071, "num_tokens": 46062232.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 744.71875, "completions/mean_terminated_length": 651.625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 4.780864197530864, "grad_norm": 1.117436020312828, "kl": 0.2637939453125, "learning_rate": 2.5868543994650993e-09, "loss": -0.0237, "num_tokens": 46092539.0, "reward": -1.862645149230957e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 708.1875, "completions/mean_terminated_length": 675.5172119140625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 4.783950617283951, "grad_norm": 1.078584452776903, "kl": 0.22119140625, "learning_rate": 2.5156162581824736e-09, "loss": 0.0017, "num_tokens": 46121725.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 706.5625, "completions/mean_terminated_length": 661.2142944335938, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.787037037037037, "grad_norm": 1.0119835354365785, "kl": 0.258056640625, "learning_rate": 2.44536779743959e-09, "loss": 0.0094, "num_tokens": 46150779.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 799.90625, "completions/mean_terminated_length": 737.1599731445312, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 4.790123456790123, "grad_norm": 0.0072914569756307, "kl": 0.23046875, "learning_rate": 2.376109298157347e-09, "loss": 0.0002, "num_tokens": 46183192.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 771.875, "completions/mean_terminated_length": 713.6923217773438, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 4.79320987654321, "grad_norm": 1.3457583086868568, "kl": 0.2255859375, "learning_rate": 2.3078410372978084e-09, "loss": 0.0015, "num_tokens": 46214012.0, "reward": 0.0, "reward_std": 0.15882496535778046, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 819.65625, "completions/mean_terminated_length": 712.6190795898438, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.796296296296296, "grad_norm": 1.2188384068693174, "kl": 0.234130859375, "learning_rate": 2.240563287863151e-09, "loss": -0.0634, "num_tokens": 46246933.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 771.21875, "completions/mean_terminated_length": 735.107177734375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.799382716049383, "grad_norm": 0.9464528233708589, "kl": 0.2259521484375, "learning_rate": 2.174276318894497e-09, "loss": -0.0379, "num_tokens": 46278204.0, "reward": 0.0, "reward_std": 0.1379297524690628, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 712.15625, "completions/mean_terminated_length": 654.4074096679688, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 4.802469135802469, "grad_norm": 1.393970967244665, "kl": 0.2574462890625, "learning_rate": 2.1089803954708884e-09, "loss": 0.0402, "num_tokens": 46307229.0, "reward": 3.725290298461914e-09, "reward_std": 0.18818417191505432, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 742.90625, "completions/mean_terminated_length": 702.7500610351562, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 4.805555555555555, "grad_norm": 0.7378650056322856, "kl": 0.2532958984375, "learning_rate": 2.0446757787082324e-09, "loss": -0.0256, "num_tokens": 46337242.0, "reward": -9.313225746154785e-10, "reward_std": 0.035921063274145126, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 737.28125, "completions/mean_terminated_length": 707.6206665039062, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 4.8086419753086425, "grad_norm": 0.9522607155052633, "kl": 0.255126953125, "learning_rate": 1.98136272575819e-09, "loss": -0.0401, "num_tokens": 46367175.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 674.71875, "completions/mean_terminated_length": 674.71875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.811728395061729, "grad_norm": 1.9686056044058784, "kl": 0.251953125, "learning_rate": 1.919041489807233e-09, "loss": -0.1019, "num_tokens": 46395134.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 803.1875, "completions/mean_terminated_length": 741.3599853515625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.814814814814815, "grad_norm": 0.5787964598445375, "kl": 0.2301025390625, "learning_rate": 1.857712320075616e-09, "loss": -0.0004, "num_tokens": 46427420.0, "reward": -9.313225746154785e-10, "reward_std": 0.03592105954885483, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 759.3125, "completions/mean_terminated_length": 721.5000610351562, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 4.817901234567901, "grad_norm": 0.008318506868947691, "kl": 0.2283935546875, "learning_rate": 1.7973754618162972e-09, "loss": 0.0002, "num_tokens": 46458254.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 710.0, "completions/mean_terminated_length": 689.0667114257812, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 4.820987654320987, "grad_norm": 0.9935498356911902, "kl": 0.2708740234375, "learning_rate": 1.7380311563140737e-09, "loss": 0.0279, "num_tokens": 46486778.0, "reward": 3.725290298461914e-09, "reward_std": 0.13203482329845428, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 725.5, "completions/mean_terminated_length": 682.857177734375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.824074074074074, "grad_norm": 1.0664536050126223, "kl": 0.260986328125, "learning_rate": 1.6796796408845292e-09, "loss": -0.0221, "num_tokens": 46516430.0, "reward": 0.0, "reward_std": 0.14713431894779205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 772.40625, "completions/mean_terminated_length": 725.8148193359375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 4.827160493827161, "grad_norm": 1.2974184614299895, "kl": 0.2332763671875, "learning_rate": 1.622321148873146e-09, "loss": -0.0662, "num_tokens": 46547979.0, "reward": -3.725290298461914e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 797.71875, "completions/mean_terminated_length": 722.2916870117188, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 4.830246913580247, "grad_norm": 0.011663174895283918, "kl": 0.2509765625, "learning_rate": 1.5659559096543318e-09, "loss": 0.0003, "num_tokens": 46580806.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 797.0, "completions/mean_terminated_length": 754.9629516601562, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 4.833333333333333, "grad_norm": 1.600415293230517, "kl": 0.234130859375, "learning_rate": 1.5105841486304783e-09, "loss": -0.0301, "num_tokens": 46613394.0, "reward": -1.6643753042444587e-10, "reward_std": 0.1449076235294342, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.8462742445990443e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 744.15625, "completions/mean_terminated_length": 650.875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 4.83641975308642, "grad_norm": 1.4874112922698708, "kl": 0.25146484375, "learning_rate": 1.456206087231182e-09, "loss": 0.0409, "num_tokens": 46644039.0, "reward": -3.4924596548080444e-10, "reward_std": 0.1406289041042328, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.6298145055770874e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 773.84375, "completions/mean_terminated_length": 727.5184936523438, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 4.839506172839506, "grad_norm": 1.2939224766821031, "kl": 0.2393798828125, "learning_rate": 1.4028219429121912e-09, "loss": -0.0968, "num_tokens": 46674846.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 804.59375, "completions/mean_terminated_length": 731.4583740234375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 4.842592592592593, "grad_norm": 1.8806684615615579, "kl": 0.208984375, "learning_rate": 1.350431929154655e-09, "loss": 0.1084, "num_tokens": 46707317.0, "reward": 0.0, "reward_std": 0.13786308467388153, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 799.84375, "completions/mean_terminated_length": 697.95458984375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.845679012345679, "grad_norm": 0.7110150230135566, "kl": 0.2266845703125, "learning_rate": 1.2990362554642087e-09, "loss": 0.0182, "num_tokens": 46739888.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 788.34375, "completions/mean_terminated_length": 744.7037353515625, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 4.848765432098766, "grad_norm": 0.8514734727466381, "kl": 0.2490234375, "learning_rate": 1.2486351273701678e-09, "loss": 0.0329, "num_tokens": 46771275.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 731.21875, "completions/mean_terminated_length": 649.239990234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.851851851851852, "grad_norm": 1.3542612170669153, "kl": 0.247802734375, "learning_rate": 1.199228746424752e-09, "loss": 0.0584, "num_tokens": 46801066.0, "reward": -1.862645149230957e-09, "reward_std": 0.15539078414440155, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 718.15625, "completions/mean_terminated_length": 661.5184936523438, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 4.854938271604938, "grad_norm": 1.4127554384969723, "kl": 0.2669677734375, "learning_rate": 1.1508173102021402e-09, "loss": -0.0285, "num_tokens": 46830431.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 720.625, "completions/mean_terminated_length": 677.2857666015625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.8580246913580245, "grad_norm": 0.9196112215753206, "kl": 0.2386474609375, "learning_rate": 1.1034010122978332e-09, "loss": 0.0096, "num_tokens": 46859731.0, "reward": 0.0, "reward_std": 0.14854103326797485, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 764.8125, "completions/mean_terminated_length": 727.7857666015625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.861111111111111, "grad_norm": 1.3968624974201325, "kl": 0.236083984375, "learning_rate": 1.0569800423277652e-09, "loss": 0.0415, "num_tokens": 46891285.0, "reward": 9.313225746154785e-10, "reward_std": 0.15788862109184265, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110855221748352, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 786.4375, "completions/mean_terminated_length": 761.862060546875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 4.864197530864198, "grad_norm": 1.9272550504602735, "kl": 0.2047119140625, "learning_rate": 1.0115545859276098e-09, "loss": 0.2195, "num_tokens": 46923767.0, "reward": -1.862645149230957e-09, "reward_std": 0.16878576576709747, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 747.90625, "completions/mean_terminated_length": 655.875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 4.867283950617284, "grad_norm": 1.1959292652167877, "kl": 0.271728515625, "learning_rate": 9.67124824752058e-10, "loss": -0.0131, "num_tokens": 46954216.0, "reward": 3.725290298461914e-09, "reward_std": 0.1254967600107193, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 630.5, "completions/mean_terminated_length": 630.5, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 4.87037037037037, "grad_norm": 0.6496505531564528, "kl": 0.267578125, "learning_rate": 9.236909364739587e-10, "loss": 0.0128, "num_tokens": 46980476.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 714.09375, "completions/mean_terminated_length": 642.5769653320312, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 4.8734567901234565, "grad_norm": 0.8103005809067142, "kl": 0.2293701171875, "learning_rate": 8.812530947837904e-10, "loss": 0.005, "num_tokens": 47009999.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 722.1875, "completions/mean_terminated_length": 679.0714721679688, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 4.8765432098765435, "grad_norm": 0.7007164334439001, "kl": 0.24365234375, "learning_rate": 8.39811469388857e-10, "loss": -0.0081, "num_tokens": 47039945.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 745.96875, "completions/mean_terminated_length": 727.433349609375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 4.87962962962963, "grad_norm": 1.1087163703421545, "kl": 0.227783203125, "learning_rate": 7.99366226012621e-10, "loss": 0.0238, "num_tokens": 47070264.0, "reward": 3.725290298461914e-09, "reward_std": 0.1315683275461197, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 743.78125, "completions/mean_terminated_length": 691.888916015625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 4.882716049382716, "grad_norm": 0.8626702521618844, "kl": 0.2255859375, "learning_rate": 7.59917526394066e-10, "loss": 0.0123, "num_tokens": 47100733.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 810.5, "completions/mean_terminated_length": 750.719970703125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 4.885802469135802, "grad_norm": 1.2176014332942944, "kl": 0.2484130859375, "learning_rate": 7.214655282870019e-10, "loss": 0.0664, "num_tokens": 47133577.0, "reward": 0.0, "reward_std": 0.1497526913881302, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 695.5, "completions/mean_terminated_length": 661.5172119140625, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.888888888888889, "grad_norm": 0.703182392089653, "kl": 0.2611083984375, "learning_rate": 6.840103854595103e-10, "loss": -0.0116, "num_tokens": 47162541.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 814.65625, "completions/mean_terminated_length": 689.0499877929688, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 4.8919753086419755, "grad_norm": 1.2877794159444154, "kl": 0.2601318359375, "learning_rate": 6.475522476932504e-10, "loss": -0.0381, "num_tokens": 47195418.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 722.0, "completions/mean_terminated_length": 621.3333740234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 4.895061728395062, "grad_norm": 0.007863565238588218, "kl": 0.247802734375, "learning_rate": 6.120912607829598e-10, "loss": 0.0002, "num_tokens": 47225034.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 788.25, "completions/mean_terminated_length": 709.6666870117188, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 4.898148148148148, "grad_norm": 1.6012108367231617, "kl": 0.2738037109375, "learning_rate": 5.776275665357045e-10, "loss": -0.003, "num_tokens": 47256506.0, "reward": 0.0, "reward_std": 0.19149985909461975, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 703.46875, "completions/mean_terminated_length": 682.1000366210938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 4.901234567901234, "grad_norm": 2.8066887905064943, "kl": 0.244140625, "learning_rate": 5.441613027704905e-10, "loss": 0.1544, "num_tokens": 47285165.0, "reward": 0.0, "reward_std": 0.17153745889663696, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 773.9375, "completions/mean_terminated_length": 690.5833740234375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.904320987654321, "grad_norm": 0.8189136246632134, "kl": 0.240478515625, "learning_rate": 5.116926033176261e-10, "loss": 0.0101, "num_tokens": 47315951.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 678.4375, "completions/mean_terminated_length": 655.4000244140625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 4.907407407407407, "grad_norm": 0.934045545733305, "kl": 0.2373046875, "learning_rate": 4.802215980182212e-10, "loss": -0.02, "num_tokens": 47343765.0, "reward": 7.450580596923828e-09, "reward_std": 0.15339991450309753, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 736.90625, "completions/mean_terminated_length": 683.74072265625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.910493827160494, "grad_norm": 1.7751233737098953, "kl": 0.2236328125, "learning_rate": 4.4974841272357734e-10, "loss": 0.0371, "num_tokens": 47374326.0, "reward": -1.3969838619232178e-09, "reward_std": 0.1641397774219513, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 696.0, "completions/mean_terminated_length": 662.0689697265625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 4.91358024691358, "grad_norm": 2.528655484875274, "kl": 0.2281494140625, "learning_rate": 4.2027316929479916e-10, "loss": 0.1401, "num_tokens": 47402598.0, "reward": -3.725290298461914e-09, "reward_std": 0.22804805636405945, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1592 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 735.375, "completions/mean_terminated_length": 681.9259033203125, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.916666666666667, "grad_norm": 0.006457239826496087, "kl": NaN, "learning_rate": 3.917959856022668e-10, "loss": 0.0002, "num_tokens": 47432426.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 861.46875, "completions/mean_terminated_length": 750.26318359375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.919753086419753, "grad_norm": 0.9350426960796004, "kl": NaN, "learning_rate": 3.6431697552510853e-10, "loss": -0.0278, "num_tokens": 47466845.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 787.25, "completions/mean_terminated_length": 762.7586059570312, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 4.922839506172839, "grad_norm": 0.8366877610922199, "kl": 0.2413330078125, "learning_rate": 3.3783624895086795e-10, "loss": 0.0106, "num_tokens": 47498905.0, "reward": -1.862645149230957e-09, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 747.78125, "completions/mean_terminated_length": 684.0385131835938, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 4.925925925925926, "grad_norm": 0.6858457371625005, "kl": 0.2178955078125, "learning_rate": 3.123539117749485e-10, "loss": 0.0136, "num_tokens": 47529498.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 700.8125, "completions/mean_terminated_length": 679.2667236328125, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.929012345679013, "grad_norm": 0.7573476255525109, "kl": 0.2576904296875, "learning_rate": 2.8787006590022535e-10, "loss": -0.0105, "num_tokens": 47558172.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 711.84375, "completions/mean_terminated_length": 639.8077392578125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 4.932098765432099, "grad_norm": 1.0341470735788636, "kl": 0.2425537109375, "learning_rate": 2.6438480923665627e-10, "loss": 0.0008, "num_tokens": 47587251.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 728.0625, "completions/mean_terminated_length": 728.0625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 4.935185185185185, "grad_norm": 0.9476703452669963, "kl": 0.2508544921875, "learning_rate": 2.418982357008936e-10, "loss": 0.0235, "num_tokens": 47617221.0, "reward": 0.0, "reward_std": 0.14281287789344788, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": -2.3283064365386963e-10, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 769.21875, "completions/mean_terminated_length": 710.423095703125, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 4.938271604938271, "grad_norm": 1.2376217489149188, "kl": 0.232666015625, "learning_rate": 2.2041043521586756e-10, "loss": 0.0108, "num_tokens": 47648400.0, "reward": 0.0, "reward_std": 0.15834946930408478, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 780.90625, "completions/mean_terminated_length": 724.8077392578125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.9413580246913575, "grad_norm": 1.2657420931607952, "kl": 0.21630859375, "learning_rate": 1.999214937104532e-10, "loss": -0.0838, "num_tokens": 47680125.0, "reward": 9.313225746154785e-10, "reward_std": 0.148421049118042, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 732.03125, "completions/mean_terminated_length": 712.5667114257812, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 4.944444444444445, "grad_norm": 0.5502229114422817, "kl": 0.25732421875, "learning_rate": 1.8043149311916529e-10, "loss": 0.0278, "num_tokens": 47709962.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 804.71875, "completions/mean_terminated_length": 689.857177734375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.947530864197531, "grad_norm": 2.687994689418515, "kl": 0.2730712890625, "learning_rate": 1.6194051138176955e-10, "loss": 0.0248, "num_tokens": 47742369.0, "reward": 1.862645149230957e-09, "reward_std": 0.15876935422420502, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 725.53125, "completions/mean_terminated_length": 670.25927734375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.950617283950617, "grad_norm": 1.5022934438874624, "kl": 0.3040771484375, "learning_rate": 1.444486224429775e-10, "loss": -0.0243, "num_tokens": 47771978.0, "reward": -7.450580596923828e-09, "reward_std": 0.1789141744375229, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 777.78125, "completions/mean_terminated_length": 708.8399658203125, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 4.953703703703704, "grad_norm": 1.2135377538704588, "kl": 0.226806640625, "learning_rate": 1.2795589625216875e-10, "loss": -0.0884, "num_tokens": 47803923.0, "reward": 0.0, "reward_std": 0.1385057121515274, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 722.1875, "completions/mean_terminated_length": 679.0714721679688, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 4.95679012345679, "grad_norm": 1.0589730453922852, "kl": 0.24609375, "learning_rate": 1.1246239876316899e-10, "loss": -0.0525, "num_tokens": 47833509.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 749.40625, "completions/mean_terminated_length": 731.1000366210938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 4.959876543209877, "grad_norm": 1.1596273741453609, "kl": 0.2667236328125, "learning_rate": 9.796819193383376e-11, "loss": 0.0125, "num_tokens": 47864114.0, "reward": 2.7939677238464355e-09, "reward_std": 0.16112922132015228, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.313225746154785e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 777.65625, "completions/mean_terminated_length": 695.5416870117188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 4.962962962962963, "grad_norm": 0.590046245932475, "kl": 0.229248046875, "learning_rate": 8.447333372593735e-11, "loss": -0.0017, "num_tokens": 47895803.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 726.53125, "completions/mean_terminated_length": 695.7586059570312, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 4.966049382716049, "grad_norm": 1.441134207155461, "kl": 0.224365234375, "learning_rate": 7.197787810492295e-11, "loss": -0.0356, "num_tokens": 47925560.0, "reward": -7.450580596923828e-09, "reward_std": 0.20232830941677094, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": -3.725290298461914e-09, "rewards/logprob_reward/std": 0.4016096591949463, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 731.3125, "completions/mean_terminated_length": 701.0344848632812, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.969135802469136, "grad_norm": 1.4331554563402127, "kl": 0.232666015625, "learning_rate": 6.04818750396252e-11, "loss": 0.0416, "num_tokens": 47955610.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 785.4375, "completions/mean_terminated_length": 730.3846435546875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.972222222222222, "grad_norm": 1.3903098779452725, "kl": 0.249267578125, "learning_rate": 4.9985370502131366e-11, "loss": 0.0078, "num_tokens": 47987544.0, "reward": -5.587935447692871e-09, "reward_std": 0.1821010708808899, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3110854923725128, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 755.46875, "completions/mean_terminated_length": 633.4091186523438, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 4.9753086419753085, "grad_norm": 0.949442702717208, "kl": 0.2928466796875, "learning_rate": 4.0488406467559245e-11, "loss": -0.005, "num_tokens": 48018491.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 751.75, "completions/mean_terminated_length": 723.586181640625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 4.978395061728395, "grad_norm": 0.9170827216344197, "kl": 0.2598876953125, "learning_rate": 3.1991020913890723e-11, "loss": -0.042, "num_tokens": 48048823.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 725.0, "completions/mean_terminated_length": 705.0667114257812, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 4.981481481481482, "grad_norm": 3.9707542582075344, "kl": 0.2298583984375, "learning_rate": 2.449324782183293e-11, "loss": -0.2022, "num_tokens": 48079019.0, "reward": 0.0, "reward_std": 0.3038697838783264, "rewards/format_reward_func/mean": -1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 3.725290298461914e-09, "rewards/logprob_reward/std": 0.4399413466453552, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 749.375, "completions/mean_terminated_length": 720.9655151367188, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.984567901234568, "grad_norm": 1.1168254213397273, "kl": 0.235107421875, "learning_rate": 1.799511717470725e-11, "loss": 0.0139, "num_tokens": 48109531.0, "reward": -1.862645149230957e-09, "reward_std": 0.1596047431230545, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.656612873077393e-10, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 776.5, "completions/mean_terminated_length": 694.0, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 4.987654320987654, "grad_norm": 0.8447286813106216, "kl": 0.2164306640625, "learning_rate": 1.2496654958310537e-11, "loss": -0.0075, "num_tokens": 48140567.0, "reward": 9.313225746154785e-10, "reward_std": 0.05163978040218353, "rewards/format_reward_func/mean": 1.4901161193847656e-08, "rewards/format_reward_func/std": 1.0160009860992432, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 677.84375, "completions/mean_terminated_length": 654.7667236328125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 4.9907407407407405, "grad_norm": 1.1225410716703794, "kl": 0.255615234375, "learning_rate": 7.997883160748563e-12, "loss": -0.0305, "num_tokens": 48168210.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 736.96875, "completions/mean_terminated_length": 695.9642944335938, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.993827160493828, "grad_norm": 0.7400796420431668, "kl": 0.2340087890625, "learning_rate": 4.4988197724360465e-12, "loss": -0.0066, "num_tokens": 48198485.0, "reward": 0.0, "reward_std": 0.11249999701976776, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.2540002465248108, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 696.125, "completions/mean_terminated_length": 662.2069091796875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 4.996913580246914, "grad_norm": 1.1668935899083517, "kl": 0.2413330078125, "learning_rate": 1.9994787860133646e-12, "loss": -0.0613, "num_tokens": 48227069.0, "reward": 3.725290298461914e-09, "reward_std": 0.17175260186195374, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.862645149230957e-09, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 693.125, "completions/mean_terminated_length": 671.0667114257812, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 5.0, "grad_norm": 1.4493082750300388, "kl": 0.256103515625, "learning_rate": 4.998701962355412e-13, "loss": -0.0149, "num_tokens": 48255557.0, "reward": 3.725290298461914e-09, "reward_std": 0.17160141468048096, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.3592106103897095, "step": 1620 }, { "epoch": 5.0, "step": 1620, "total_flos": 0.0, "train_loss": -0.025403620972905414, "train_runtime": 20225.882, "train_samples_per_second": 0.641, "train_steps_per_second": 0.08 } ], "logging_steps": 1, "max_steps": 1620, "num_input_tokens_seen": 48255557, "num_train_epochs": 5, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }