{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 506.46875, "completions/mean_terminated_length": 506.46875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.0030864197530864196, "grad_norm": 0.0, "kl": NaN, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 22567.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 485.34375, "completions/mean_terminated_length": 485.34375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.006172839506172839, "grad_norm": 3.4509658393516776, "kl": NaN, "learning_rate": 1.020408163265306e-08, "loss": -0.1875, "num_tokens": 44354.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 2 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 524.5, "completions/mean_terminated_length": 491.20001220703125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.009259259259259259, "grad_norm": 4.246804763804015, "kl": NaN, "learning_rate": 2.040816326530612e-08, "loss": -0.1875, "num_tokens": 67998.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 3 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 502.34375, "completions/mean_terminated_length": 485.51611328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.012345679012345678, "grad_norm": 7.781874372659449, "kl": NaN, "learning_rate": 3.0612244897959183e-08, "loss": -0.4483, "num_tokens": 90469.0, "reward": 0.048903919756412506, "reward_std": 0.09780783951282501, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.04739324375987053, "rewards/logprob_reward/std": 0.19399255514144897, "step": 4 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 574.84375, "completions/mean_terminated_length": 510.6785888671875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.015432098765432098, "grad_norm": 4.269787026608326, "kl": NaN, "learning_rate": 4.081632653061224e-08, "loss": -0.1875, "num_tokens": 115200.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 5 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 497.59375, "completions/mean_terminated_length": 480.6128845214844, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.018518518518518517, "grad_norm": 4.264151517938587, "kl": NaN, "learning_rate": 5.1020408163265303e-08, "loss": -0.1875, "num_tokens": 137839.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 6 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 518.03125, "completions/mean_terminated_length": 518.03125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.021604938271604937, "grad_norm": 0.0, "kl": NaN, "learning_rate": 6.122448979591837e-08, "loss": 0.0, "num_tokens": 160840.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 502.83868408203125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.024691358024691357, "grad_norm": 5.6437629411242085, "kl": NaN, "learning_rate": 7.142857142857142e-08, "loss": -0.1875, "num_tokens": 184080.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 8 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 549.0625, "completions/mean_terminated_length": 533.741943359375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.027777777777777776, "grad_norm": 6.645086675331711, "kl": NaN, "learning_rate": 8.163265306122448e-08, "loss": -0.1875, "num_tokens": 207982.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 9 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 506.06451416015625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.030864197530864196, "grad_norm": 5.613521737280568, "kl": NaN, "learning_rate": 9.183673469387755e-08, "loss": -0.3746, "num_tokens": 231058.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 10 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 521.75, "completions/mean_terminated_length": 521.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.033950617283950615, "grad_norm": 6.384497396317718, "kl": NaN, "learning_rate": 1.0204081632653061e-07, "loss": -0.1875, "num_tokens": 254194.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 11 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 537.9375, "completions/mean_terminated_length": 537.9375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.037037037037037035, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.1224489795918366e-07, "loss": 0.0, "num_tokens": 277620.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 526.53125, "completions/mean_terminated_length": 510.4838562011719, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.040123456790123455, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.2244897959183673e-07, "loss": 0.0, "num_tokens": 300633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 526.125, "completions/mean_terminated_length": 474.6206970214844, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.043209876543209874, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.326530612244898e-07, "loss": 0.0, "num_tokens": 324345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 566.40625, "completions/mean_terminated_length": 551.6451416015625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.046296296296296294, "grad_norm": 5.162104763216476, "kl": NaN, "learning_rate": 1.4285714285714285e-07, "loss": -0.1875, "num_tokens": 348974.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 15 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 584.71875, "completions/mean_terminated_length": 539.27587890625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.04938271604938271, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.5306122448979592e-07, "loss": 0.0, "num_tokens": 374349.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 561.75, "completions/mean_terminated_length": 513.9310302734375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.05246913580246913, "grad_norm": 3.973786059295557, "kl": NaN, "learning_rate": 1.6326530612244896e-07, "loss": -0.1875, "num_tokens": 398973.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 17 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 517.5, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.05555555555555555, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.7346938775510203e-07, "loss": 0.0, "num_tokens": 421749.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 539.5625, "completions/mean_terminated_length": 507.2666931152344, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.05864197530864197, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.836734693877551e-07, "loss": 0.0, "num_tokens": 445243.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 482.53125, "completions/mean_terminated_length": 482.53125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.06172839506172839, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.9387755102040814e-07, "loss": 0.0, "num_tokens": 467024.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 509.65625, "completions/mean_terminated_length": 509.65625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.06481481481481481, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.0408163265306121e-07, "loss": 0.0, "num_tokens": 490189.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 471.78125, "completions/mean_terminated_length": 471.78125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.06790123456790123, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "num_tokens": 511402.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 533.09375, "completions/mean_terminated_length": 500.36669921875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.07098765432098765, "grad_norm": 8.014220734091229, "kl": NaN, "learning_rate": 2.2448979591836733e-07, "loss": -0.3746, "num_tokens": 534965.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 23 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 459.90625, "completions/mean_terminated_length": 459.90625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.07407407407407407, "grad_norm": 6.78847761104372, "kl": NaN, "learning_rate": 2.346938775510204e-07, "loss": -0.3746, "num_tokens": 555994.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 24 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 512.51611328125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.07716049382716049, "grad_norm": 3.812277139683094, "kl": NaN, "learning_rate": 2.4489795918367347e-07, "loss": -0.1875, "num_tokens": 579922.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 25 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 568.40625, "completions/mean_terminated_length": 521.27587890625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.08024691358024691, "grad_norm": 5.845796876389305, "kl": NaN, "learning_rate": 2.551020408163265e-07, "loss": -0.1875, "num_tokens": 604679.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 26 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 519.75, "completions/mean_terminated_length": 486.13336181640625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.08333333333333333, "grad_norm": 7.791927843931823, "kl": NaN, "learning_rate": 2.653061224489796e-07, "loss": -0.2818, "num_tokens": 627495.0, "reward": 0.03868355602025986, "reward_std": 0.06932045519351959, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.03256506100296974, "rewards/logprob_reward/std": 0.17669323086738586, "step": 27 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 530.6875, "completions/mean_terminated_length": 514.774169921875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.08641975308641975, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.755102040816326e-07, "loss": 0.0, "num_tokens": 651041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 28 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 531.5625, "completions/mean_terminated_length": 531.5625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.08950617283950617, "grad_norm": 3.8877790271469674, "kl": NaN, "learning_rate": 2.857142857142857e-07, "loss": -0.1875, "num_tokens": 674451.0, "reward": 0.02812499925494194, "reward_std": 0.05624999850988388, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 29 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 501.28125, "completions/mean_terminated_length": 466.433349609375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.09259259259259259, "grad_norm": 5.319974732060697, "kl": NaN, "learning_rate": 2.9591836734693874e-07, "loss": -0.2808, "num_tokens": 697364.0, "reward": 0.041397932916879654, "reward_std": 0.07583508640527725, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.035581037402153015, "rewards/logprob_reward/std": 0.1776818186044693, "step": 30 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 535.15625, "completions/mean_terminated_length": 519.3870849609375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.09567901234567901, "grad_norm": 8.195932538978067, "kl": NaN, "learning_rate": 3.0612244897959183e-07, "loss": -0.3746, "num_tokens": 721133.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 31 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 542.6875, "completions/mean_terminated_length": 510.60003662109375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.09876543209876543, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.163265306122449e-07, "loss": 0.0, "num_tokens": 745207.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 517.6875, "completions/mean_terminated_length": 517.6875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.10185185185185185, "grad_norm": 5.256402959307523, "kl": NaN, "learning_rate": 3.265306122448979e-07, "loss": -0.1875, "num_tokens": 768121.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 33 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 525.40625, "completions/mean_terminated_length": 492.16668701171875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.10493827160493827, "grad_norm": 6.024079664251176, "kl": NaN, "learning_rate": 3.3673469387755096e-07, "loss": -0.3746, "num_tokens": 791202.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 34 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 583.75, "completions/mean_terminated_length": 538.2069091796875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.10802469135802469, "grad_norm": 9.413583045805924, "kl": NaN, "learning_rate": 3.4693877551020406e-07, "loss": -0.3746, "num_tokens": 816602.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 35 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 508.4375, "completions/mean_terminated_length": 474.0666809082031, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1111111111111111, "grad_norm": 6.438782601549385, "kl": NaN, "learning_rate": 3.5714285714285716e-07, "loss": -0.3012, "num_tokens": 839188.0, "reward": 0.06050289794802666, "reward_std": 0.07303018867969513, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.056808773428201675, "rewards/logprob_reward/std": 0.22473333775997162, "step": 36 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 562.375, "completions/mean_terminated_length": 531.6000366210938, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.11419753086419752, "grad_norm": 4.524111203304163, "kl": NaN, "learning_rate": 3.673469387755102e-07, "loss": -0.2501, "num_tokens": 863784.0, "reward": 0.03750000149011612, "reward_std": 0.06695333868265152, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 37 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 447.40625, "completions/mean_terminated_length": 428.8064270019531, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.11728395061728394, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.7755102040816324e-07, "loss": 0.0, "num_tokens": 884097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 554.4375, "completions/mean_terminated_length": 487.357177734375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.12037037037037036, "grad_norm": 4.244625742138723, "kl": NaN, "learning_rate": 3.877551020408163e-07, "loss": -0.1875, "num_tokens": 908715.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 39 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 511.78125, "completions/mean_terminated_length": 458.7930908203125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.12345679012345678, "grad_norm": 5.308344950060322, "kl": NaN, "learning_rate": 3.979591836734694e-07, "loss": -0.2555, "num_tokens": 931740.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 40 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 521.21875, "completions/mean_terminated_length": 469.2069091796875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.12654320987654322, "grad_norm": 20.94438932090929, "kl": NaN, "learning_rate": 4.0816326530612243e-07, "loss": -1.1235, "num_tokens": 955279.0, "reward": 0.05479752644896507, "reward_std": 0.10959505289793015, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.040052805095911026, "rewards/logprob_reward/std": 0.17958880960941315, "step": 41 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 492.21875, "completions/mean_terminated_length": 475.06451416015625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.12962962962962962, "grad_norm": 7.179305629998426, "kl": NaN, "learning_rate": 4.183673469387755e-07, "loss": -0.3746, "num_tokens": 977542.0, "reward": 0.03437500074505806, "reward_std": 0.06875000149011612, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 42 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 507.21875, "completions/mean_terminated_length": 472.7666931152344, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.13271604938271606, "grad_norm": 4.599621408753089, "kl": NaN, "learning_rate": 4.285714285714285e-07, "loss": -0.2968, "num_tokens": 1000109.0, "reward": 0.037758518010377884, "reward_std": 0.07002723217010498, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.031537242233753204, "rewards/logprob_reward/std": 0.17673175036907196, "step": 43 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 472.8125, "completions/mean_terminated_length": 472.8125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.13580246913580246, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.387755102040816e-07, "loss": 0.0, "num_tokens": 1021639.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 479.84375, "completions/mean_terminated_length": 462.2903137207031, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.1388888888888889, "grad_norm": 10.399042644113639, "kl": NaN, "learning_rate": 4.4897959183673465e-07, "loss": -0.6698, "num_tokens": 1043418.0, "reward": 0.08710119128227234, "reward_std": 0.1484055370092392, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.08289020508527756, "rewards/logprob_reward/std": 0.23931169509887695, "step": 45 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 445.90625, "completions/mean_terminated_length": 445.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.1419753086419753, "grad_norm": 5.895752205573987, "kl": NaN, "learning_rate": 4.5918367346938775e-07, "loss": -0.2721, "num_tokens": 1064063.0, "reward": 0.048732124269008636, "reward_std": 0.09746424853801727, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.04720236361026764, "rewards/logprob_reward/std": 0.19586758315563202, "step": 46 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 471.125, "completions/mean_terminated_length": 453.2903137207031, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.14506172839506173, "grad_norm": 8.31008159267593, "kl": NaN, "learning_rate": 4.693877551020408e-07, "loss": -0.562, "num_tokens": 1085371.0, "reward": 0.04915725067257881, "reward_std": 0.09831449389457703, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.044202499091625214, "rewards/logprob_reward/std": 0.1891636997461319, "step": 47 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 509.78125, "completions/mean_terminated_length": 509.78125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.14814814814814814, "grad_norm": 9.416716192026355, "kl": NaN, "learning_rate": 4.795918367346938e-07, "loss": -0.562, "num_tokens": 1108152.0, "reward": 0.05913570150732994, "reward_std": 0.11827140301465988, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.05528966709971428, "rewards/logprob_reward/std": 0.21952684223651886, "step": 48 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 568.03125, "completions/mean_terminated_length": 537.6333618164062, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.15123456790123457, "grad_norm": 3.5250370079337587, "kl": NaN, "learning_rate": 4.897959183673469e-07, "loss": -0.1875, "num_tokens": 1132897.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.03125, "rewards/logprob_reward/std": 0.1767766922712326, "step": 49 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 478.46875, "completions/mean_terminated_length": 478.46875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.15432098765432098, "grad_norm": 11.539358016352574, "kl": NaN, "learning_rate": 5e-07, "loss": -0.749, "num_tokens": 1154636.0, "reward": 0.04246839880943298, "reward_std": 0.08493679761886597, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.03677044063806534, "rewards/logprob_reward/std": 0.17851904034614563, "step": 50 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 594.46875, "completions/mean_terminated_length": 495.3461608886719, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.1574074074074074, "grad_norm": 10.590733278125992, "kl": NaN, "learning_rate": 4.999995001298037e-07, "loss": -0.7493, "num_tokens": 1180539.0, "reward": 0.05029338598251343, "reward_std": 0.10058677196502686, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.04199264943599701, "rewards/logprob_reward/std": 0.17984452843666077, "step": 51 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 443.3125, "completions/mean_terminated_length": 443.3125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.16049382716049382, "grad_norm": 6.222120464033787, "kl": NaN, "learning_rate": 4.99998000521214e-07, "loss": -0.485, "num_tokens": 1200953.0, "reward": 0.07088068872690201, "reward_std": 0.09627532213926315, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.06486742943525314, "rewards/logprob_reward/std": 0.22174827754497528, "step": 52 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 444.4444580078125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.16358024691358025, "grad_norm": 6.97653265003262, "kl": NaN, "learning_rate": 4.999955011802275e-07, "loss": -0.697, "num_tokens": 1224485.0, "reward": 0.1710825115442276, "reward_std": 0.22367669641971588, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.16231390833854675, "rewards/logprob_reward/std": 0.33025458455085754, "step": 53 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 531.0625, "completions/mean_terminated_length": 439.77777099609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.16666666666666666, "grad_norm": 10.834153276280711, "kl": NaN, "learning_rate": 4.999920021168393e-07, "loss": -0.9998, "num_tokens": 1248031.0, "reward": 0.10972357541322708, "reward_std": 0.1637464463710785, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.09413730353116989, "rewards/logprob_reward/std": 0.23089353740215302, "step": 54 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 407.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.1697530864197531, "grad_norm": 8.928425225460616, "kl": NaN, "learning_rate": 4.999875033450417e-07, "loss": -0.7368, "num_tokens": 1267427.0, "reward": 0.08555983006954193, "reward_std": 0.13326585292816162, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.06728870421648026, "rewards/logprob_reward/std": 0.19334517419338226, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 494.34375, "completions/mean_terminated_length": 459.0333557128906, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.1728395061728395, "grad_norm": 11.693307094863533, "kl": 0.009975433349609375, "learning_rate": 4.999820048828253e-07, "loss": -1.1992, "num_tokens": 1290002.0, "reward": 0.09973689168691635, "reward_std": 0.16025592386722565, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0691521018743515, "rewards/logprob_reward/std": 0.19271141290664673, "step": 56 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 578.96875, "completions/mean_terminated_length": 515.3928833007812, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.17592592592592593, "grad_norm": 8.106457040199288, "kl": NaN, "learning_rate": 4.999755067521781e-07, "loss": -0.9474, "num_tokens": 1315129.0, "reward": 0.16995838284492493, "reward_std": 0.22592423856258392, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.14717598259449005, "rewards/logprob_reward/std": 0.2878720164299011, "step": 57 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 457.46875, "completions/mean_terminated_length": 439.19354248046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.17901234567901234, "grad_norm": 11.645326994885682, "kl": NaN, "learning_rate": 4.999680089790861e-07, "loss": -0.8753, "num_tokens": 1335796.0, "reward": 0.23502200841903687, "reward_std": 0.2729259431362152, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.21599668264389038, "rewards/logprob_reward/std": 0.345114529132843, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 543.75, "completions/mean_terminated_length": 511.7333679199219, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.18209876543209877, "grad_norm": 9.541173836832073, "kl": 0.012622833251953125, "learning_rate": 4.999595115935325e-07, "loss": -1.185, "num_tokens": 1359776.0, "reward": 0.11253378540277481, "reward_std": 0.1975400745868683, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0868430882692337, "rewards/logprob_reward/std": 0.22413228452205658, "step": 59 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 455.625, "completions/mean_terminated_length": 437.2903137207031, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.18518518518518517, "grad_norm": 7.184674813321227, "kl": NaN, "learning_rate": 4.999500146294979e-07, "loss": -0.5542, "num_tokens": 1380804.0, "reward": 0.15369969606399536, "reward_std": 0.11026473343372345, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.11522187292575836, "rewards/logprob_reward/std": 0.27276960015296936, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 507.625, "completions/mean_terminated_length": 473.20001220703125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.1882716049382716, "grad_norm": 8.279650657085908, "kl": 0.015605926513671875, "learning_rate": 4.999395181249604e-07, "loss": -0.9438, "num_tokens": 1403484.0, "reward": 0.19617488980293274, "reward_std": 0.19970621168613434, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.16936099529266357, "rewards/logprob_reward/std": 0.30594542622566223, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 554.0625, "completions/mean_terminated_length": 505.4482727050781, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.19135802469135801, "grad_norm": 4.687780416861181, "kl": 0.01894378662109375, "learning_rate": 4.99928022121895e-07, "loss": -0.5844, "num_tokens": 1427870.0, "reward": 0.26156121492385864, "reward_std": 0.2345581203699112, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.22465136647224426, "rewards/logprob_reward/std": 0.35051751136779785, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.19444444444444445, "grad_norm": 6.298736053517519, "kl": 0.02700042724609375, "learning_rate": 4.99915526666274e-07, "loss": -0.8576, "num_tokens": 1448366.0, "reward": 0.17902016639709473, "reward_std": 0.20423871278762817, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.14335574209690094, "rewards/logprob_reward/std": 0.25425371527671814, "step": 63 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 444.40625, "completions/mean_terminated_length": 444.40625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.19753086419753085, "grad_norm": 4.522090015154584, "kl": NaN, "learning_rate": 4.999020318080661e-07, "loss": -0.4789, "num_tokens": 1468819.0, "reward": 0.18481415510177612, "reward_std": 0.17588186264038086, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.15326571464538574, "rewards/logprob_reward/std": 0.3093109130859375, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 458.423095703125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2006172839506173, "grad_norm": 6.315831609499494, "kl": 0.0258941650390625, "learning_rate": 4.998875376012368e-07, "loss": -0.6224, "num_tokens": 1493786.0, "reward": 0.12562020123004913, "reward_std": 0.17694973945617676, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.09096689522266388, "rewards/logprob_reward/std": 0.20007851719856262, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 542.34375, "completions/mean_terminated_length": 473.5357360839844, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2037037037037037, "grad_norm": 5.333416768047741, "kl": 0.0214996337890625, "learning_rate": 4.998720441037479e-07, "loss": -0.5094, "num_tokens": 1517937.0, "reward": 0.3101924955844879, "reward_std": 0.282688170671463, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.2786861062049866, "rewards/logprob_reward/std": 0.3630853593349457, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 638.4375, "completions/mean_terminated_length": 549.4615478515625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.20679012345679013, "grad_norm": 4.881603144671631, "kl": 0.02561187744140625, "learning_rate": 4.99855551377557e-07, "loss": -0.3048, "num_tokens": 1545503.0, "reward": 0.13900087773799896, "reward_std": 0.153377503156662, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.09194542467594147, "rewards/logprob_reward/std": 0.20398110151290894, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 501.1875, "completions/mean_terminated_length": 484.32257080078125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.20987654320987653, "grad_norm": 4.510156247582212, "kl": 0.0214691162109375, "learning_rate": 4.998380594886182e-07, "loss": -0.4988, "num_tokens": 1567849.0, "reward": 0.2345755696296692, "reward_std": 0.1995140165090561, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.1842506229877472, "rewards/logprob_reward/std": 0.29011353850364685, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 602.03125, "completions/mean_terminated_length": 523.888916015625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.21296296296296297, "grad_norm": 4.480910121079602, "kl": 0.027496337890625, "learning_rate": 4.998195685068808e-07, "loss": -0.3268, "num_tokens": 1594550.0, "reward": 0.12782837450504303, "reward_std": 0.1595286726951599, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.09342041611671448, "rewards/logprob_reward/std": 0.19039879739284515, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 540.21875, "completions/mean_terminated_length": 524.6129150390625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.21604938271604937, "grad_norm": 3.48668133361254, "kl": 0.02984619140625, "learning_rate": 4.998000785062895e-07, "loss": -0.2484, "num_tokens": 1618813.0, "reward": 0.2392576038837433, "reward_std": 0.09408827126026154, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.1859806627035141, "rewards/logprob_reward/std": 0.31693094968795776, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 522.75, "completions/mean_terminated_length": 506.58062744140625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2191358024691358, "grad_norm": 3.45239510452741, "kl": 0.0247650146484375, "learning_rate": 4.997795895647841e-07, "loss": -0.4334, "num_tokens": 1642409.0, "reward": 0.3271559476852417, "reward_std": 0.1929042637348175, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.29406213760375977, "rewards/logprob_reward/std": 0.3053942918777466, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 443.4375, "completions/mean_terminated_length": 443.4375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2222222222222222, "grad_norm": 3.6207519614499155, "kl": 0.0290069580078125, "learning_rate": 4.997581017642991e-07, "loss": -0.3843, "num_tokens": 1663251.0, "reward": 0.18356288969516754, "reward_std": 0.1930927336215973, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.13451431691646576, "rewards/logprob_reward/std": 0.22353149950504303, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 518.625, "completions/mean_terminated_length": 484.933349609375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.22530864197530864, "grad_norm": 3.069737987548535, "kl": 0.027496337890625, "learning_rate": 4.997356151907633e-07, "loss": -0.3075, "num_tokens": 1686803.0, "reward": 0.2638769745826721, "reward_std": 0.17780598998069763, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.21680772304534912, "rewards/logprob_reward/std": 0.3047768473625183, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 536.3125, "completions/mean_terminated_length": 503.8000183105469, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.22839506172839505, "grad_norm": 3.063735704203007, "kl": 0.029693603515625, "learning_rate": 4.997121299340997e-07, "loss": -0.409, "num_tokens": 1710657.0, "reward": 0.3883135914802551, "reward_std": 0.22004754841327667, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.35507065057754517, "rewards/logprob_reward/std": 0.40682366490364075, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 557.625, "completions/mean_terminated_length": 491.0000305175781, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.23148148148148148, "grad_norm": 4.248195358111389, "kl": 0.0317840576171875, "learning_rate": 4.99687646088225e-07, "loss": -0.2849, "num_tokens": 1734585.0, "reward": 0.23744824528694153, "reward_std": 0.2058483213186264, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.19091472029685974, "rewards/logprob_reward/std": 0.2590171694755554, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 503.78125, "completions/mean_terminated_length": 469.10003662109375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.2345679012345679, "grad_norm": 6.336251125288074, "kl": 0.033599853515625, "learning_rate": 4.996621637510491e-07, "loss": -0.4936, "num_tokens": 1756858.0, "reward": 0.22311162948608398, "reward_std": 0.1544628143310547, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.1888740360736847, "rewards/logprob_reward/std": 0.28443700075149536, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 594.375, "completions/mean_terminated_length": 514.8148193359375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.23765432098765432, "grad_norm": 4.181222667197991, "kl": 0.0264129638671875, "learning_rate": 4.996356830244749e-07, "loss": -0.1533, "num_tokens": 1782790.0, "reward": 0.15943455696105957, "reward_std": 0.15070059895515442, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.10423284024000168, "rewards/logprob_reward/std": 0.24906344711780548, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 565.625, "completions/mean_terminated_length": 518.2069091796875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.24074074074074073, "grad_norm": 3.0005130603335686, "kl": 0.02911376953125, "learning_rate": 4.996082040143977e-07, "loss": -0.155, "num_tokens": 1807394.0, "reward": 0.24044674634933472, "reward_std": 0.15323665738105774, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.18382972478866577, "rewards/logprob_reward/std": 0.2569461464881897, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 527.4375, "completions/mean_terminated_length": 494.3333740234375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.24382716049382716, "grad_norm": 3.1493192571331616, "kl": 0.03411865234375, "learning_rate": 4.995797268307051e-07, "loss": -0.0809, "num_tokens": 1830924.0, "reward": 0.23828719556331635, "reward_std": 0.127736896276474, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.18490245938301086, "rewards/logprob_reward/std": 0.2833430767059326, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 469.65625, "completions/mean_terminated_length": 451.774169921875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.24691358024691357, "grad_norm": 3.495785446700489, "kl": 0.0334625244140625, "learning_rate": 4.995502515872763e-07, "loss": 0.0714, "num_tokens": 1852469.0, "reward": 0.2704458236694336, "reward_std": 0.12159737944602966, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20327314734458923, "rewards/logprob_reward/std": 0.24542374908924103, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 618.96875, "completions/mean_terminated_length": 543.9629516601562, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.25, "grad_norm": 2.790484708130881, "kl": 0.0276336669921875, "learning_rate": 4.995197784019818e-07, "loss": -0.0214, "num_tokens": 1878728.0, "reward": 0.2727452516555786, "reward_std": 0.17900536954402924, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.22318917512893677, "rewards/logprob_reward/std": 0.3217361271381378, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 483.46875, "completions/mean_terminated_length": 466.0322570800781, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.25308641975308643, "grad_norm": 3.1063710492971683, "kl": 0.032318115234375, "learning_rate": 4.994883073966823e-07, "loss": -0.301, "num_tokens": 1900443.0, "reward": 0.460162878036499, "reward_std": 0.2618337869644165, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.42101430892944336, "rewards/logprob_reward/std": 0.35217228531837463, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 533.5625, "completions/mean_terminated_length": 500.86669921875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.25617283950617287, "grad_norm": 3.696684411354887, "kl": 0.040771484375, "learning_rate": 4.994558386972295e-07, "loss": -0.1657, "num_tokens": 1924593.0, "reward": 0.4195968806743622, "reward_std": 0.22001971304416656, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3828854560852051, "rewards/logprob_reward/std": 0.3916252553462982, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 556.125, "completions/mean_terminated_length": 489.2857360839844, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.25925925925925924, "grad_norm": 3.212920651644247, "kl": 0.0326995849609375, "learning_rate": 4.994223724334643e-07, "loss": 0.0263, "num_tokens": 1948817.0, "reward": 0.12078209221363068, "reward_std": 0.08068147301673889, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.047396764159202576, "rewards/logprob_reward/std": 0.17543712258338928, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 584.46875, "completions/mean_terminated_length": 539.0, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.2623456790123457, "grad_norm": 4.720269643233655, "kl": 0.0408935546875, "learning_rate": 4.99387908739217e-07, "loss": -0.1467, "num_tokens": 1974704.0, "reward": 0.37718161940574646, "reward_std": 0.16998913884162903, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3253406882286072, "rewards/logprob_reward/std": 0.37088391184806824, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 474.03125, "completions/mean_terminated_length": 474.03125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2654320987654321, "grad_norm": 4.824128610987967, "kl": 0.038726806640625, "learning_rate": 4.993524477523067e-07, "loss": -0.3556, "num_tokens": 1996245.0, "reward": 0.29565373063087463, "reward_std": 0.16616299748420715, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.24864304065704346, "rewards/logprob_reward/std": 0.3138037919998169, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 518.25, "completions/mean_terminated_length": 501.9354553222656, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.26851851851851855, "grad_norm": 3.1580079710182156, "kl": 0.0414886474609375, "learning_rate": 4.993159896145405e-07, "loss": -0.09, "num_tokens": 2019509.0, "reward": 0.37848952412605286, "reward_std": 0.2256731539964676, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.326793909072876, "rewards/logprob_reward/std": 0.3878656029701233, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 510.03125, "completions/mean_terminated_length": 475.7666931152344, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2716049382716049, "grad_norm": 4.301727758073238, "kl": 0.046661376953125, "learning_rate": 4.99278534471713e-07, "loss": -0.1559, "num_tokens": 2042906.0, "reward": 0.4223316013813019, "reward_std": 0.2101093828678131, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.38245177268981934, "rewards/logprob_reward/std": 0.3617812395095825, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 581.375, "completions/mean_terminated_length": 499.40740966796875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.27469135802469136, "grad_norm": 3.384143766146398, "kl": 0.040435791015625, "learning_rate": 4.992400824736059e-07, "loss": -0.1327, "num_tokens": 2067842.0, "reward": 0.37457239627838135, "reward_std": 0.21836638450622559, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32938599586486816, "rewards/logprob_reward/std": 0.3359249532222748, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 537.0, "completions/mean_terminated_length": 467.4285888671875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.2777777777777778, "grad_norm": 3.590215553593014, "kl": 0.0431365966796875, "learning_rate": 4.992006337739874e-07, "loss": -0.2622, "num_tokens": 2091698.0, "reward": 0.2568208575248718, "reward_std": 0.1660316437482834, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.2124398946762085, "rewards/logprob_reward/std": 0.299672931432724, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 524.21875, "completions/mean_terminated_length": 490.9000244140625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2808641975308642, "grad_norm": 3.440639168948707, "kl": 0.0378265380859375, "learning_rate": 4.991601885306111e-07, "loss": -0.0892, "num_tokens": 2114597.0, "reward": 0.21924841403961182, "reward_std": 0.10752985626459122, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.14638713002204895, "rewards/logprob_reward/std": 0.24282750487327576, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 542.3125, "completions/mean_terminated_length": 510.20001220703125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2839506172839506, "grad_norm": 3.6947864069240492, "kl": 0.044708251953125, "learning_rate": 4.991187469052162e-07, "loss": -0.055, "num_tokens": 2138691.0, "reward": 0.32845377922058105, "reward_std": 0.09630337357521057, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27467089891433716, "rewards/logprob_reward/std": 0.3240615129470825, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 462.125, "completions/mean_terminated_length": 444.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.28703703703703703, "grad_norm": 3.0269963309921337, "kl": 0.03948974609375, "learning_rate": 4.99076309063526e-07, "loss": -0.259, "num_tokens": 2159787.0, "reward": 0.3884456753730774, "reward_std": 0.1847994029521942, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.33785635232925415, "rewards/logprob_reward/std": 0.3278330862522125, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 488.34375, "completions/mean_terminated_length": 452.63336181640625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.29012345679012347, "grad_norm": 5.283890452350916, "kl": 0.040008544921875, "learning_rate": 4.99032875175248e-07, "loss": -0.292, "num_tokens": 2182014.0, "reward": 0.17228937149047852, "reward_std": 0.09457872807979584, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.1185159683227539, "rewards/logprob_reward/std": 0.23186810314655304, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 486.625, "completions/mean_terminated_length": 469.2903137207031, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.2932098765432099, "grad_norm": 3.070492600012361, "kl": 0.0366973876953125, "learning_rate": 4.989884454140724e-07, "loss": -0.184, "num_tokens": 2203982.0, "reward": 0.35218995809555054, "reward_std": 0.11176910996437073, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29409995675086975, "rewards/logprob_reward/std": 0.3235725164413452, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 558.21875, "completions/mean_terminated_length": 510.03448486328125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.2962962962962963, "grad_norm": 3.5931378242042062, "kl": 0.0417327880859375, "learning_rate": 4.989430199576722e-07, "loss": -0.2256, "num_tokens": 2228285.0, "reward": 0.3443182110786438, "reward_std": 0.2188708484172821, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.29577022790908813, "rewards/logprob_reward/std": 0.2920750379562378, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 544.15625, "completions/mean_terminated_length": 544.15625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2993827160493827, "grad_norm": 3.016482538962336, "kl": 0.0377044677734375, "learning_rate": 4.988965989877022e-07, "loss": -0.1856, "num_tokens": 2252350.0, "reward": 0.3726438283920288, "reward_std": 0.24674266576766968, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3202987015247345, "rewards/logprob_reward/std": 0.3325502276420593, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 533.53125, "completions/mean_terminated_length": 500.8333740234375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.30246913580246915, "grad_norm": 3.6021764165627292, "kl": 0.04388427734375, "learning_rate": 4.988491826897978e-07, "loss": -0.1139, "num_tokens": 2276335.0, "reward": 0.2911481559276581, "reward_std": 0.09113110601902008, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2332201898097992, "rewards/logprob_reward/std": 0.3233642578125, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 504.21875, "completions/mean_terminated_length": 504.21875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3055555555555556, "grad_norm": 3.0079042465350825, "kl": 0.0355377197265625, "learning_rate": 4.988007712535752e-07, "loss": -0.1933, "num_tokens": 2298558.0, "reward": 0.3005990982055664, "reward_std": 0.11567462980747223, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24024897813796997, "rewards/logprob_reward/std": 0.35584473609924316, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 512.15625, "completions/mean_terminated_length": 495.6451416015625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.30864197530864196, "grad_norm": 3.1645684992851115, "kl": 0.04278564453125, "learning_rate": 4.987513648726298e-07, "loss": -0.163, "num_tokens": 2321383.0, "reward": 0.3990842401981354, "reward_std": 0.18724200129508972, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3462046980857849, "rewards/logprob_reward/std": 0.3177835941314697, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 501.0, "completions/mean_terminated_length": 484.1290283203125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.3117283950617284, "grad_norm": 2.9145078322282183, "kl": 0.03887939453125, "learning_rate": 4.987009637445358e-07, "loss": -0.1504, "num_tokens": 2343391.0, "reward": 0.29791563749313354, "reward_std": 0.13121336698532104, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2442118376493454, "rewards/logprob_reward/std": 0.2757895290851593, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 516.40625, "completions/mean_terminated_length": 482.5666809082031, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3148148148148148, "grad_norm": 3.5556269418304445, "kl": 0.0387420654296875, "learning_rate": 4.986495680708453e-07, "loss": -0.282, "num_tokens": 2366364.0, "reward": 0.20970594882965088, "reward_std": 0.11993538588285446, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.14272883534431458, "rewards/logprob_reward/std": 0.2326190024614334, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 596.28125, "completions/mean_terminated_length": 497.5769348144531, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.31790123456790126, "grad_norm": 3.6081167269329257, "kl": 0.0425872802734375, "learning_rate": 4.985971780570878e-07, "loss": -0.1747, "num_tokens": 2392177.0, "reward": 0.1974845677614212, "reward_std": 0.12636981904506683, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.13956618309020996, "rewards/logprob_reward/std": 0.25323083996772766, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 533.375, "completions/mean_terminated_length": 500.66668701171875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.32098765432098764, "grad_norm": 2.854529012946991, "kl": 0.0382537841796875, "learning_rate": 4.985437939127687e-07, "loss": -0.2615, "num_tokens": 2415529.0, "reward": 0.28740477561950684, "reward_std": 0.15293873846530914, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22906090319156647, "rewards/logprob_reward/std": 0.28886404633522034, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 488.9375, "completions/mean_terminated_length": 488.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.32407407407407407, "grad_norm": 3.3718975317744135, "kl": 0.0408935546875, "learning_rate": 4.984894158513696e-07, "loss": -0.1096, "num_tokens": 2437627.0, "reward": 0.32780158519744873, "reward_std": 0.1162356287240982, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26352953910827637, "rewards/logprob_reward/std": 0.24996651709079742, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 569.03125, "completions/mean_terminated_length": 504.0357360839844, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3271604938271605, "grad_norm": 4.393992811448587, "kl": 0.040252685546875, "learning_rate": 4.984340440903456e-07, "loss": -0.2046, "num_tokens": 2462220.0, "reward": 0.24317356944084167, "reward_std": 0.1577545404434204, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.18685951828956604, "rewards/logprob_reward/std": 0.305131733417511, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 533.8333740234375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.33024691358024694, "grad_norm": 2.9407013881751696, "kl": 0.045562744140625, "learning_rate": 4.983776788511268e-07, "loss": -0.124, "num_tokens": 2486607.0, "reward": 0.22956174612045288, "reward_std": 0.07610921561717987, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.1578463762998581, "rewards/logprob_reward/std": 0.2404603809118271, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 546.9375, "completions/mean_terminated_length": 531.5484008789062, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3333333333333333, "grad_norm": 2.7378217122922757, "kl": 0.045501708984375, "learning_rate": 4.983203203591154e-07, "loss": -0.2022, "num_tokens": 2510329.0, "reward": 0.3016606867313385, "reward_std": 0.1461724042892456, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24142853915691376, "rewards/logprob_reward/std": 0.32246190309524536, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 553.8125, "completions/mean_terminated_length": 522.4666748046875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.33641975308641975, "grad_norm": 2.8540118388259197, "kl": 0.0474395751953125, "learning_rate": 4.982619688436859e-07, "loss": -0.2755, "num_tokens": 2534619.0, "reward": 0.2591847777366638, "reward_std": 0.06958003342151642, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19423305988311768, "rewards/logprob_reward/std": 0.26822128891944885, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 543.125, "completions/mean_terminated_length": 511.0666809082031, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.3395061728395062, "grad_norm": 2.4369975380371875, "kl": 0.04107666015625, "learning_rate": 4.982026245381837e-07, "loss": -0.2928, "num_tokens": 2558475.0, "reward": 0.4581526517868042, "reward_std": 0.15789499878883362, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.41530853509902954, "rewards/logprob_reward/std": 0.31935182213783264, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 516.375, "completions/mean_terminated_length": 482.5333557128906, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3425925925925926, "grad_norm": 2.8592266272545697, "kl": 0.048858642578125, "learning_rate": 4.981422876799244e-07, "loss": -0.2919, "num_tokens": 2581491.0, "reward": 0.2646980881690979, "reward_std": 0.08699201792478561, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.20730343461036682, "rewards/logprob_reward/std": 0.28746798634529114, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 559.6875, "completions/mean_terminated_length": 544.7096557617188, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.345679012345679, "grad_norm": 2.9684584807708294, "kl": 0.0567779541015625, "learning_rate": 4.980809585101927e-07, "loss": -0.1649, "num_tokens": 2606321.0, "reward": 0.46308934688568115, "reward_std": 0.16915422677993774, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.420793741941452, "rewards/logprob_reward/std": 0.30669888854026794, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 544.34375, "completions/mean_terminated_length": 494.72412109375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3487654320987654, "grad_norm": 2.916292982508738, "kl": 0.0526123046875, "learning_rate": 4.980186372742417e-07, "loss": -0.244, "num_tokens": 2630332.0, "reward": 0.1995975822210312, "reward_std": 0.10023873299360275, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1349695324897766, "rewards/logprob_reward/std": 0.2431170642375946, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 619.625, "completions/mean_terminated_length": 592.6666870117188, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.35185185185185186, "grad_norm": 2.898468945504761, "kl": 0.055511474609375, "learning_rate": 4.979553242212917e-07, "loss": -0.1643, "num_tokens": 2657120.0, "reward": 0.3568793535232544, "reward_std": 0.15801353752613068, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3027825951576233, "rewards/logprob_reward/std": 0.30247846245765686, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 507.28125, "completions/mean_terminated_length": 490.6128845214844, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3549382716049383, "grad_norm": 2.836045681525242, "kl": 0.05926513671875, "learning_rate": 4.978910196045291e-07, "loss": -0.2747, "num_tokens": 2679565.0, "reward": 0.367550253868103, "reward_std": 0.19948378205299377, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.31463921070098877, "rewards/logprob_reward/std": 0.34520113468170166, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 517.15625, "completions/mean_terminated_length": 500.8064270019531, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.35802469135802467, "grad_norm": 3.0007868014087924, "kl": 0.053436279296875, "learning_rate": 4.978257236811055e-07, "loss": -0.1895, "num_tokens": 2702750.0, "reward": 0.2890568971633911, "reward_std": 0.12840858101844788, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.22742432355880737, "rewards/logprob_reward/std": 0.27868959307670593, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 491.34375, "completions/mean_terminated_length": 491.34375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3611111111111111, "grad_norm": 2.526199755335383, "kl": 0.042144775390625, "learning_rate": 4.977594367121369e-07, "loss": -0.1603, "num_tokens": 2724549.0, "reward": 0.41362032294273376, "reward_std": 0.148383229970932, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.36235591769218445, "rewards/logprob_reward/std": 0.29718974232673645, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 598.4375, "completions/mean_terminated_length": 554.413818359375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.36419753086419754, "grad_norm": 3.2992782258743443, "kl": 0.049896240234375, "learning_rate": 4.976921589627021e-07, "loss": -0.2729, "num_tokens": 2749919.0, "reward": 0.2445828914642334, "reward_std": 0.08128540217876434, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1849532127380371, "rewards/logprob_reward/std": 0.31966593861579895, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 526.25, "completions/mean_terminated_length": 526.25, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.36728395061728397, "grad_norm": 3.180497449488794, "kl": 0.051971435546875, "learning_rate": 4.976238907018427e-07, "loss": -0.1866, "num_tokens": 2773295.0, "reward": 0.31716644763946533, "reward_std": 0.06050901114940643, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.24476826190948486, "rewards/logprob_reward/std": 0.26013559103012085, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 577.5, "completions/mean_terminated_length": 513.7142944335938, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.37037037037037035, "grad_norm": 2.9943320459906912, "kl": 0.06011962890625, "learning_rate": 4.975546322025605e-07, "loss": -0.1388, "num_tokens": 2798347.0, "reward": 0.350216269493103, "reward_std": 0.11822676658630371, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2953791916370392, "rewards/logprob_reward/std": 0.337100088596344, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 537.3125, "completions/mean_terminated_length": 467.7857360839844, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3734567901234568, "grad_norm": 3.762173416488056, "kl": 0.057952880859375, "learning_rate": 4.974843837418175e-07, "loss": -0.0989, "num_tokens": 2822001.0, "reward": 0.17610597610473633, "reward_std": 0.09911122173070908, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.10539551079273224, "rewards/logprob_reward/std": 0.18491879105567932, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 526.15625, "completions/mean_terminated_length": 526.15625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.3765432098765432, "grad_norm": 2.7464503187343188, "kl": 0.060760498046875, "learning_rate": 4.974131456005349e-07, "loss": -0.227, "num_tokens": 2845346.0, "reward": 0.3630925118923187, "reward_std": 0.06585537642240524, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2992694675922394, "rewards/logprob_reward/std": 0.32076671719551086, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 593.4375, "completions/mean_terminated_length": 564.7333374023438, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.37962962962962965, "grad_norm": 2.62296543895522, "kl": 0.049468994140625, "learning_rate": 4.973409180635911e-07, "loss": -0.235, "num_tokens": 2870796.0, "reward": 0.36671194434165955, "reward_std": 0.11020010709762573, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.31370770931243896, "rewards/logprob_reward/std": 0.28464174270629883, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 613.28125, "completions/mean_terminated_length": 585.9000244140625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.38271604938271603, "grad_norm": 2.7240558489193956, "kl": 0.0506591796875, "learning_rate": 4.972677014198213e-07, "loss": -0.1318, "num_tokens": 2897249.0, "reward": 0.4552130699157715, "reward_std": 0.13436511158943176, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4085700809955597, "rewards/logprob_reward/std": 0.3071467876434326, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 568.4375, "completions/mean_terminated_length": 484.0740661621094, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.38580246913580246, "grad_norm": 3.5807502206733757, "kl": 0.10040283203125, "learning_rate": 4.97193495962016e-07, "loss": -0.1162, "num_tokens": 2921783.0, "reward": 0.18961748480796814, "reward_std": 0.10474695265293121, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1204083263874054, "rewards/logprob_reward/std": 0.18304675817489624, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 567.09375, "completions/mean_terminated_length": 536.6333618164062, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3888888888888889, "grad_norm": 3.237884207399784, "kl": 0.05731201171875, "learning_rate": 4.971183019869201e-07, "loss": -0.1612, "num_tokens": 2946270.0, "reward": 0.36089056730270386, "reward_std": 0.10060998052358627, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3107117712497711, "rewards/logprob_reward/std": 0.29036495089530945, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 604.09375, "completions/mean_terminated_length": 544.107177734375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.39197530864197533, "grad_norm": 3.228261885520651, "kl": 0.0513916015625, "learning_rate": 4.970421197952311e-07, "loss": -0.1171, "num_tokens": 2972297.0, "reward": 0.31217315793037415, "reward_std": 0.10372491180896759, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.26005351543426514, "rewards/logprob_reward/std": 0.29411664605140686, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 588.21875, "completions/mean_terminated_length": 543.137939453125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.3950617283950617, "grad_norm": 2.6731167100130633, "kl": 0.057861328125, "learning_rate": 4.969649496915991e-07, "loss": -0.0795, "num_tokens": 2997644.0, "reward": 0.3341195583343506, "reward_std": 0.08132977783679962, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2740217447280884, "rewards/logprob_reward/std": 0.31395161151885986, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 583.8125, "completions/mean_terminated_length": 554.4666748046875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.39814814814814814, "grad_norm": 3.1794408715839673, "kl": 0.056182861328125, "learning_rate": 4.96886791984624e-07, "loss": -0.27, "num_tokens": 3022610.0, "reward": 0.3170411288738251, "reward_std": 0.08985304832458496, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.25504571199417114, "rewards/logprob_reward/std": 0.28948092460632324, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 612.5, "completions/mean_terminated_length": 599.2257690429688, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.4012345679012346, "grad_norm": 2.4492688606854904, "kl": 0.052734375, "learning_rate": 4.968076469868558e-07, "loss": -0.2671, "num_tokens": 3048730.0, "reward": 0.2742547392845154, "reward_std": 0.05315082520246506, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2040330469608307, "rewards/logprob_reward/std": 0.2957220673561096, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 559.3125, "completions/mean_terminated_length": 559.3125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.404320987654321, "grad_norm": 3.163560387230562, "kl": 0.051300048828125, "learning_rate": 4.967275150147921e-07, "loss": -0.0394, "num_tokens": 3073080.0, "reward": 0.5323173999786377, "reward_std": 0.0788436233997345, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4838249087333679, "rewards/logprob_reward/std": 0.33439284563064575, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 625.6875, "completions/mean_terminated_length": 584.4827270507812, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.4074074074074074, "grad_norm": 2.739941434567374, "kl": 0.05987548828125, "learning_rate": 4.966463963888775e-07, "loss": -0.3883, "num_tokens": 3099518.0, "reward": 0.2553619146347046, "reward_std": 0.16264769434928894, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1969299018383026, "rewards/logprob_reward/std": 0.2804294526576996, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 580.8125, "completions/mean_terminated_length": 534.9655151367188, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.4104938271604938, "grad_norm": 5.1354030719281445, "kl": 0.064239501953125, "learning_rate": 4.965642914335025e-07, "loss": -0.2941, "num_tokens": 3124244.0, "reward": 0.30074912309646606, "reward_std": 0.11045147478580475, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24041570723056793, "rewards/logprob_reward/std": 0.3462699055671692, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 549.75, "completions/mean_terminated_length": 482.0000305175781, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.41358024691358025, "grad_norm": 3.380142371752241, "kl": 0.0693359375, "learning_rate": 4.964812004770013e-07, "loss": -0.1073, "num_tokens": 3148280.0, "reward": 0.18707197904586792, "reward_std": 0.097461998462677, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.11757997423410416, "rewards/logprob_reward/std": 0.1801103800535202, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 605.2413940429688, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.4166666666666667, "grad_norm": 2.883501920199981, "kl": 0.06781005859375, "learning_rate": 4.963971238516519e-07, "loss": -0.1861, "num_tokens": 3175536.0, "reward": 0.4376097321510315, "reward_std": 0.10557529330253601, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.39248308539390564, "rewards/logprob_reward/std": 0.3678987920284271, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 596.84375, "completions/mean_terminated_length": 552.6551513671875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.41975308641975306, "grad_norm": 2.7417162838614537, "kl": 0.067413330078125, "learning_rate": 4.963120618936732e-07, "loss": -0.0941, "num_tokens": 3200887.0, "reward": 0.37339746952056885, "reward_std": 0.12638214230537415, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3176638185977936, "rewards/logprob_reward/std": 0.33372142910957336, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 514.34375, "completions/mean_terminated_length": 480.36669921875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.4228395061728395, "grad_norm": 2.1980509725501185, "kl": 0.0645751953125, "learning_rate": 4.962260149432247e-07, "loss": -0.1094, "num_tokens": 3223698.0, "reward": 0.38345110416412354, "reward_std": 0.1483292579650879, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.35314011573791504, "rewards/logprob_reward/std": 0.34856802225112915, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 605.03125, "completions/mean_terminated_length": 577.1000366210938, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.42592592592592593, "grad_norm": 2.6299689644388766, "kl": 0.06787109375, "learning_rate": 4.96138983344405e-07, "loss": 0.011, "num_tokens": 3249535.0, "reward": 0.46557852625846863, "reward_std": 0.08943602442741394, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4131428003311157, "rewards/logprob_reward/std": 0.337747186422348, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 610.875, "completions/mean_terminated_length": 551.857177734375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.42901234567901236, "grad_norm": 2.6346150760085365, "kl": 0.053131103515625, "learning_rate": 4.9605096744525e-07, "loss": -0.42, "num_tokens": 3275323.0, "reward": 0.44501233100891113, "reward_std": 0.23392929136753082, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4076525866985321, "rewards/logprob_reward/std": 0.3489828407764435, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 548.15625, "completions/mean_terminated_length": 532.8064575195312, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.43209876543209874, "grad_norm": 2.8127054729485903, "kl": 0.060089111328125, "learning_rate": 4.95961967597732e-07, "loss": -0.0399, "num_tokens": 3299072.0, "reward": 0.4449176788330078, "reward_std": 0.12956109642982483, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3901863098144531, "rewards/logprob_reward/std": 0.3427717685699463, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 596.15625, "completions/mean_terminated_length": 516.9259033203125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4351851851851852, "grad_norm": 4.843967078083494, "kl": 0.076202392578125, "learning_rate": 4.958719841577579e-07, "loss": -0.4285, "num_tokens": 3324513.0, "reward": 0.29998451471328735, "reward_std": 0.1500430703163147, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2534550428390503, "rewards/logprob_reward/std": 0.31193649768829346, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 652.0, "completions/mean_terminated_length": 583.1111450195312, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.4382716049382716, "grad_norm": 2.5672155434712964, "kl": 0.05999755859375, "learning_rate": 4.957810174851679e-07, "loss": -0.1594, "num_tokens": 3352305.0, "reward": 0.35246115922927856, "reward_std": 0.048143431544303894, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.30481797456741333, "rewards/logprob_reward/std": 0.3429834544658661, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 603.96875, "completions/mean_terminated_length": 560.5172119140625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.44135802469135804, "grad_norm": 2.5315181901034505, "kl": 0.06329345703125, "learning_rate": 4.956890679437345e-07, "loss": -0.1072, "num_tokens": 3377536.0, "reward": 0.37108248472213745, "reward_std": 0.23813706636428833, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.31856387853622437, "rewards/logprob_reward/std": 0.3625701665878296, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 595.25, "completions/mean_terminated_length": 581.4193115234375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4444444444444444, "grad_norm": 3.28639860446714, "kl": 0.071563720703125, "learning_rate": 4.955961359011601e-07, "loss": -0.0355, "num_tokens": 3403148.0, "reward": 0.28397655487060547, "reward_std": 0.13400745391845703, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.21136286854743958, "rewards/logprob_reward/std": 0.28332725167274475, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 560.5, "completions/mean_terminated_length": 560.5, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.44753086419753085, "grad_norm": 2.5853029827540466, "kl": 0.0687255859375, "learning_rate": 4.955022217290766e-07, "loss": -0.1856, "num_tokens": 3427348.0, "reward": 0.4285385012626648, "reward_std": 0.11641817539930344, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3719872236251831, "rewards/logprob_reward/std": 0.32447078824043274, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 582.875, "completions/mean_terminated_length": 568.6451416015625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.4506172839506173, "grad_norm": 2.6415227236323826, "kl": 0.066864013671875, "learning_rate": 4.954073258030431e-07, "loss": -0.1034, "num_tokens": 3452556.0, "reward": 0.3039282560348511, "reward_std": 0.05062127858400345, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2335314005613327, "rewards/logprob_reward/std": 0.28446364402770996, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 584.28125, "completions/mean_terminated_length": 584.28125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.4537037037037037, "grad_norm": 2.384877747251839, "kl": 0.08062744140625, "learning_rate": 4.953114485025446e-07, "loss": -0.2307, "num_tokens": 3477865.0, "reward": 0.2491493970155716, "reward_std": 0.0557827353477478, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.17613820731639862, "rewards/logprob_reward/std": 0.2766711711883545, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 567.6875, "completions/mean_terminated_length": 537.2667236328125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.4567901234567901, "grad_norm": 2.5142484247147934, "kl": 0.063690185546875, "learning_rate": 4.95214590210991e-07, "loss": -0.2858, "num_tokens": 3502343.0, "reward": 0.3492678105831146, "reward_std": 0.10985951125621796, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2908531427383423, "rewards/logprob_reward/std": 0.27680277824401855, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 556.65625, "completions/mean_terminated_length": 556.65625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.45987654320987653, "grad_norm": 2.4538137615652125, "kl": 0.065582275390625, "learning_rate": 4.951167513157147e-07, "loss": 0.022, "num_tokens": 3526324.0, "reward": 0.3178936839103699, "reward_std": 0.0527193546295166, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.24210406839847565, "rewards/logprob_reward/std": 0.2950250804424286, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 590.3125, "completions/mean_terminated_length": 561.4000244140625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.46296296296296297, "grad_norm": 2.7603691122921887, "kl": 0.0772705078125, "learning_rate": 4.950179322079697e-07, "loss": -0.0355, "num_tokens": 3551990.0, "reward": 0.35770902037620544, "reward_std": 0.0876719206571579, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29676002264022827, "rewards/logprob_reward/std": 0.33888646960258484, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 574.0625, "completions/mean_terminated_length": 559.5484008789062, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.4660493827160494, "grad_norm": 2.48361405694046, "kl": 0.06036376953125, "learning_rate": 4.949181332829299e-07, "loss": -0.3125, "num_tokens": 3576552.0, "reward": 0.42269521951675415, "reward_std": 0.1726590096950531, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3759113550186157, "rewards/logprob_reward/std": 0.3010243773460388, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 572.0, "completions/mean_terminated_length": 541.86669921875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4691358024691358, "grad_norm": 2.8593107005772413, "kl": 0.076568603515625, "learning_rate": 4.948173549396873e-07, "loss": -0.1294, "num_tokens": 3601472.0, "reward": 0.23823416233062744, "reward_std": 0.1403721272945404, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.170954629778862, "rewards/logprob_reward/std": 0.25463491678237915, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 590.34375, "completions/mean_terminated_length": 590.34375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4722222222222222, "grad_norm": 2.3111474856648146, "kl": 0.0660400390625, "learning_rate": 4.947155975812506e-07, "loss": -0.2627, "num_tokens": 3626955.0, "reward": 0.38112589716911316, "reward_std": 0.08985978364944458, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.32625100016593933, "rewards/logprob_reward/std": 0.30355149507522583, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 658.90625, "completions/mean_terminated_length": 606.75, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.47530864197530864, "grad_norm": 2.4781066467678388, "kl": 0.077911376953125, "learning_rate": 4.946128616145436e-07, "loss": -0.2042, "num_tokens": 3654684.0, "reward": 0.3178520202636719, "reward_std": 0.2169586420059204, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.27678000926971436, "rewards/logprob_reward/std": 0.3401184678077698, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 672.9375, "completions/mean_terminated_length": 591.923095703125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.4783950617283951, "grad_norm": 2.4916821122503636, "kl": 0.0723876953125, "learning_rate": 4.945091474504037e-07, "loss": 0.0073, "num_tokens": 3683118.0, "reward": 0.30239272117614746, "reward_std": 0.16403862833976746, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24918633699417114, "rewards/logprob_reward/std": 0.28751179575920105, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 701.59375, "completions/mean_terminated_length": 668.2413940429688, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.48148148148148145, "grad_norm": 6.079478554540518, "kl": 0.079833984375, "learning_rate": 4.944044555035793e-07, "loss": -0.4376, "num_tokens": 3712877.0, "reward": 0.3551192879676819, "reward_std": 0.07864440977573395, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3042992055416107, "rewards/logprob_reward/std": 0.34509626030921936, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 577.40625, "completions/mean_terminated_length": 563.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.4845679012345679, "grad_norm": 2.3168256811966996, "kl": 0.07464599609375, "learning_rate": 4.9429878619273e-07, "loss": -0.0684, "num_tokens": 3737834.0, "reward": 0.44436115026474, "reward_std": 0.10703182220458984, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3895679712295532, "rewards/logprob_reward/std": 0.2950931191444397, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 652.375, "completions/mean_terminated_length": 566.6154174804688, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.4876543209876543, "grad_norm": 3.7333030863074326, "kl": 0.094482421875, "learning_rate": 4.941921399404232e-07, "loss": -0.3482, "num_tokens": 3765618.0, "reward": 0.2629079818725586, "reward_std": 0.17510250210762024, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.20878666639328003, "rewards/logprob_reward/std": 0.2797011137008667, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 575.9375, "completions/mean_terminated_length": 575.9375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.49074074074074076, "grad_norm": 2.6900648586396034, "kl": 0.06158447265625, "learning_rate": 4.940845171731329e-07, "loss": -0.0032, "num_tokens": 3789976.0, "reward": 0.4342363178730011, "reward_std": 0.09861146658658981, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.3713736832141876, "rewards/logprob_reward/std": 0.3737526535987854, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 596.375, "completions/mean_terminated_length": 596.375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.49382716049382713, "grad_norm": 2.3034189772749625, "kl": 0.064422607421875, "learning_rate": 4.939759183212388e-07, "loss": -0.2572, "num_tokens": 3815308.0, "reward": 0.2933869957923889, "reward_std": 0.10281737893819809, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22529110312461853, "rewards/logprob_reward/std": 0.2551559507846832, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 660.25, "completions/mean_terminated_length": 622.6206665039062, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.49691358024691357, "grad_norm": 2.825985822509588, "kl": 0.06512451171875, "learning_rate": 4.938663438190232e-07, "loss": -0.1452, "num_tokens": 3842920.0, "reward": 0.3631977438926697, "reward_std": 0.12723186612129211, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3098030090332031, "rewards/logprob_reward/std": 0.2671440541744232, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 673.3125, "completions/mean_terminated_length": 637.0344848632812, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.5, "grad_norm": 2.343714730849349, "kl": 0.0682373046875, "learning_rate": 4.937557941046705e-07, "loss": -0.3634, "num_tokens": 3870846.0, "reward": 0.3806036114692688, "reward_std": 0.17782384157180786, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.33261510729789734, "rewards/logprob_reward/std": 0.33964598178863525, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 619.2000122070312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.5030864197530864, "grad_norm": 2.2312084897990063, "kl": 0.088836669921875, "learning_rate": 4.936442696202648e-07, "loss": -0.0925, "num_tokens": 3898210.0, "reward": 0.39643919467926025, "reward_std": 0.10379578173160553, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33979353308677673, "rewards/logprob_reward/std": 0.37235793471336365, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 694.1875, "completions/mean_terminated_length": 633.1111450195312, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.5061728395061729, "grad_norm": 2.510783154747024, "kl": 0.073272705078125, "learning_rate": 4.935317708117881e-07, "loss": -0.1724, "num_tokens": 3926996.0, "reward": 0.3609701991081238, "reward_std": 0.1595182865858078, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3177446722984314, "rewards/logprob_reward/std": 0.322990745306015, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 657.4375, "completions/mean_terminated_length": 605.0714721679688, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.5092592592592593, "grad_norm": 4.123563694331599, "kl": 0.075714111328125, "learning_rate": 4.934182981291187e-07, "loss": -0.3846, "num_tokens": 3954394.0, "reward": 0.22762924432754517, "reward_std": 0.1099647730588913, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.16611582040786743, "rewards/logprob_reward/std": 0.2857367992401123, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 605.125, "completions/mean_terminated_length": 605.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.5123456790123457, "grad_norm": 2.400249740681278, "kl": 0.091796875, "learning_rate": 4.933038520260299e-07, "loss": -0.2543, "num_tokens": 3980530.0, "reward": 0.3509305715560913, "reward_std": 0.10118050128221512, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2892284393310547, "rewards/logprob_reward/std": 0.3070206344127655, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 641.09375, "completions/mean_terminated_length": 615.5667114257812, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.5154320987654321, "grad_norm": 2.410501488636766, "kl": 0.067657470703125, "learning_rate": 4.931884329601869e-07, "loss": -0.1324, "num_tokens": 4007157.0, "reward": 0.43487927317619324, "reward_std": 0.0762660875916481, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.38250482082366943, "rewards/logprob_reward/std": 0.3260001540184021, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 611.09375, "completions/mean_terminated_length": 568.3793334960938, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.5185185185185185, "grad_norm": 2.6868419876731617, "kl": 0.08270263671875, "learning_rate": 4.930720413931463e-07, "loss": -0.3057, "num_tokens": 4033020.0, "reward": 0.3721313774585724, "reward_std": 0.17463266849517822, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3232015371322632, "rewards/logprob_reward/std": 0.30637747049331665, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 705.375, "completions/mean_terminated_length": 659.857177734375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.5216049382716049, "grad_norm": 2.896125576147124, "kl": 0.072357177734375, "learning_rate": 4.929546777903534e-07, "loss": -0.2209, "num_tokens": 4062540.0, "reward": 0.14834773540496826, "reward_std": 0.09812422096729279, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.07802525162696838, "rewards/logprob_reward/std": 0.17348940670490265, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 598.5, "completions/mean_terminated_length": 598.5, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.5246913580246914, "grad_norm": 2.185073265539977, "kl": 0.0728759765625, "learning_rate": 4.928363426211407e-07, "loss": -0.139, "num_tokens": 4087632.0, "reward": 0.3291938304901123, "reward_std": 0.06604912877082825, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2616042494773865, "rewards/logprob_reward/std": 0.2542729377746582, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 711.46875, "completions/mean_terminated_length": 653.5925903320312, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.5277777777777778, "grad_norm": 2.0572582565724296, "kl": 0.08544921875, "learning_rate": 4.927170363587262e-07, "loss": -0.0297, "num_tokens": 4117595.0, "reward": 0.22415044903755188, "reward_std": 0.04535282775759697, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.16572272777557373, "rewards/logprob_reward/std": 0.29184073209762573, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 584.5, "completions/mean_terminated_length": 539.0344848632812, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5308641975308642, "grad_norm": 2.4994384502107128, "kl": 0.09344482421875, "learning_rate": 4.925967594802109e-07, "loss": -0.2519, "num_tokens": 4142755.0, "reward": 0.22198599576950073, "reward_std": 0.17108027637004852, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.15637332201004028, "rewards/logprob_reward/std": 0.2614561915397644, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 597.1875, "completions/mean_terminated_length": 553.0344848632812, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.5339506172839507, "grad_norm": 2.1594985599693977, "kl": 0.0799560546875, "learning_rate": 4.924755124665774e-07, "loss": -0.3179, "num_tokens": 4168141.0, "reward": 0.39722633361816406, "reward_std": 0.14717230200767517, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.34761255979537964, "rewards/logprob_reward/std": 0.31317365169525146, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 664.4375, "completions/mean_terminated_length": 652.8386840820312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.5370370370370371, "grad_norm": 2.239359306258462, "kl": 0.073516845703125, "learning_rate": 4.923532958026878e-07, "loss": -0.1921, "num_tokens": 4196215.0, "reward": 0.3362463712692261, "reward_std": 0.0717792809009552, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2729126214981079, "rewards/logprob_reward/std": 0.28039759397506714, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 609.09375, "completions/mean_terminated_length": 595.7096557617188, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.5401234567901234, "grad_norm": 2.231045209098031, "kl": 0.0860595703125, "learning_rate": 4.922301099772821e-07, "loss": -0.2431, "num_tokens": 4222238.0, "reward": 0.37652531266212463, "reward_std": 0.16798089444637299, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31766700744628906, "rewards/logprob_reward/std": 0.29198724031448364, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 622.84375, "completions/mean_terminated_length": 622.84375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.5432098765432098, "grad_norm": 2.181481872244806, "kl": 0.073699951171875, "learning_rate": 4.921059554829753e-07, "loss": -0.1645, "num_tokens": 4248445.0, "reward": 0.39419400691986084, "reward_std": 0.11792518198490143, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3338267207145691, "rewards/logprob_reward/std": 0.27028995752334595, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 631.03125, "completions/mean_terminated_length": 618.3547973632812, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.5462962962962963, "grad_norm": 2.319265576990343, "kl": 0.0743408203125, "learning_rate": 4.91980832816257e-07, "loss": -0.2435, "num_tokens": 4274934.0, "reward": 0.5276014804840088, "reward_std": 0.08251938223838806, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.48552945256233215, "rewards/logprob_reward/std": 0.3513021469116211, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 656.375, "completions/mean_terminated_length": 631.86669921875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.5493827160493827, "grad_norm": 2.174994405160599, "kl": 0.068328857421875, "learning_rate": 4.918547424774873e-07, "loss": -0.1179, "num_tokens": 4302322.0, "reward": 0.297809362411499, "reward_std": 0.09942954033613205, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23020483553409576, "rewards/logprob_reward/std": 0.27339404821395874, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 681.40625, "completions/mean_terminated_length": 617.9629516601562, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5524691358024691, "grad_norm": 2.2731262069837093, "kl": 0.08062744140625, "learning_rate": 4.917276849708972e-07, "loss": -0.2283, "num_tokens": 4330331.0, "reward": 0.41262900829315186, "reward_std": 0.18734759092330933, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3647266924381256, "rewards/logprob_reward/std": 0.31652891635894775, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 696.1875, "completions/mean_terminated_length": 649.357177734375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5555555555555556, "grad_norm": 3.741081974953418, "kl": 0.0869140625, "learning_rate": 4.915996608045842e-07, "loss": -0.4203, "num_tokens": 4359533.0, "reward": 0.321063369512558, "reward_std": 0.1342373639345169, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2699315547943115, "rewards/logprob_reward/std": 0.2612871825695038, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 656.3125, "completions/mean_terminated_length": 631.800048828125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.558641975308642, "grad_norm": 2.0788247124334642, "kl": 0.07781982421875, "learning_rate": 4.914706704905125e-07, "loss": -0.1349, "num_tokens": 4387287.0, "reward": 0.38793525099754333, "reward_std": 0.08924485743045807, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33034467697143555, "rewards/logprob_reward/std": 0.32517802715301514, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 606.1875, "completions/mean_terminated_length": 606.1875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.5617283950617284, "grad_norm": 2.3239893598132113, "kl": 0.1229248046875, "learning_rate": 4.913407145445093e-07, "loss": -0.0115, "num_tokens": 4413185.0, "reward": 0.49428683519363403, "reward_std": 0.0516868457198143, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.4380964934825897, "rewards/logprob_reward/std": 0.35709667205810547, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 619.25, "completions/mean_terminated_length": 606.1935424804688, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.5648148148148148, "grad_norm": 2.073467341549551, "kl": 0.075775146484375, "learning_rate": 4.912097934862632e-07, "loss": -0.1396, "num_tokens": 4439149.0, "reward": 0.3201667070388794, "reward_std": 0.04057762771844864, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2515741288661957, "rewards/logprob_reward/std": 0.2978927791118622, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 634.25, "completions/mean_terminated_length": 608.2667236328125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.5679012345679012, "grad_norm": 2.1401447267647953, "kl": 0.09478759765625, "learning_rate": 4.910779078393228e-07, "loss": -0.2577, "num_tokens": 4465729.0, "reward": 0.2616020143032074, "reward_std": 0.1414054036140442, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20039114356040955, "rewards/logprob_reward/std": 0.3049760162830353, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 678.34375, "completions/mean_terminated_length": 667.1935424804688, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.5709876543209876, "grad_norm": 2.2199435758499444, "kl": 0.08087158203125, "learning_rate": 4.909450581310935e-07, "loss": -0.1529, "num_tokens": 4494124.0, "reward": 0.33925968408584595, "reward_std": 0.100409597158432, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2762607932090759, "rewards/logprob_reward/std": 0.2844822108745575, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 632.3125, "completions/mean_terminated_length": 619.6774291992188, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.5740740740740741, "grad_norm": 1.9116000290682145, "kl": 0.08917236328125, "learning_rate": 4.908112448928363e-07, "loss": -0.0616, "num_tokens": 4520678.0, "reward": 0.43628013134002686, "reward_std": 0.05092755705118179, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3805890381336212, "rewards/logprob_reward/std": 0.3744647204875946, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 691.0370483398438, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.5771604938271605, "grad_norm": 2.24188917893121, "kl": 0.07012939453125, "learning_rate": 4.906764686596651e-07, "loss": -0.3092, "num_tokens": 4550848.0, "reward": 0.2989751696586609, "reward_std": 0.17239981889724731, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24191686511039734, "rewards/logprob_reward/std": 0.26681989431381226, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 675.75, "completions/mean_terminated_length": 652.5333862304688, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.5802469135802469, "grad_norm": 2.288647525520225, "kl": 0.07598876953125, "learning_rate": 4.90540729970545e-07, "loss": -0.0704, "num_tokens": 4578892.0, "reward": 0.32477274537086487, "reward_std": 0.08307977020740509, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26016414165496826, "rewards/logprob_reward/std": 0.256683349609375, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 677.90625, "completions/mean_terminated_length": 654.8333740234375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.5833333333333334, "grad_norm": 1.9545596092933872, "kl": 0.08441162109375, "learning_rate": 4.904040293682897e-07, "loss": -0.2145, "num_tokens": 4607149.0, "reward": 0.29227733612060547, "reward_std": 0.10324156284332275, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2310025990009308, "rewards/logprob_reward/std": 0.30287083983421326, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 606.8125, "completions/mean_terminated_length": 579.0, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.5864197530864198, "grad_norm": 2.78803616536964, "kl": 0.08599853515625, "learning_rate": 4.902663673995597e-07, "loss": -0.2388, "num_tokens": 4632515.0, "reward": 0.35690605640411377, "reward_std": 0.16773314774036407, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29586780071258545, "rewards/logprob_reward/std": 0.3051312267780304, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 727.71875, "completions/mean_terminated_length": 685.3928833007812, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.5895061728395061, "grad_norm": 2.434541207207683, "kl": 0.102264404296875, "learning_rate": 4.9012774461486e-07, "loss": -0.2067, "num_tokens": 4662870.0, "reward": 0.20571520924568176, "reward_std": 0.11737848073244095, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.13482245802879333, "rewards/logprob_reward/std": 0.22802451252937317, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 718.6875, "completions/mean_terminated_length": 662.1481323242188, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.5925925925925926, "grad_norm": 2.7618389802939953, "kl": 0.0848388671875, "learning_rate": 4.899881615685376e-07, "loss": -0.1872, "num_tokens": 4692660.0, "reward": 0.3774571418762207, "reward_std": 0.050332263112068176, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.3430079519748688, "rewards/logprob_reward/std": 0.38438180088996887, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 657.3125, "completions/mean_terminated_length": 657.3125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.595679012345679, "grad_norm": 2.3646412712131846, "kl": 0.08233642578125, "learning_rate": 4.898476188187798e-07, "loss": -0.2473, "num_tokens": 4720494.0, "reward": 0.35640278458595276, "reward_std": 0.07208658754825592, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2953086495399475, "rewards/logprob_reward/std": 0.2825443744659424, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 683.5625, "completions/mean_terminated_length": 648.3448486328125, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.5987654320987654, "grad_norm": 2.3221691680264196, "kl": 0.1004638671875, "learning_rate": 4.897061169276118e-07, "loss": -0.1015, "num_tokens": 4749228.0, "reward": 0.22160539031028748, "reward_std": 0.10898517072200775, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1524782031774521, "rewards/logprob_reward/std": 0.2569512128829956, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 769.3125, "completions/mean_terminated_length": 635.90478515625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.6018518518518519, "grad_norm": 4.06230874023849, "kl": 0.10076904296875, "learning_rate": 4.895636564608942e-07, "loss": -0.43, "num_tokens": 4780770.0, "reward": 0.23588848114013672, "reward_std": 0.21076138317584991, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.1961260735988617, "rewards/logprob_reward/std": 0.3016069829463959, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 666.78125, "completions/mean_terminated_length": 655.258056640625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.6049382716049383, "grad_norm": 1.883871960195334, "kl": 0.078948974609375, "learning_rate": 4.894202379883206e-07, "loss": -0.0941, "num_tokens": 4808799.0, "reward": 0.33930277824401855, "reward_std": 0.06344066560268402, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.27630865573883057, "rewards/logprob_reward/std": 0.3004859387874603, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 695.8125, "completions/mean_terminated_length": 661.862060546875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.6080246913580247, "grad_norm": 2.3985144507734177, "kl": 0.09942626953125, "learning_rate": 4.892758620834165e-07, "loss": -0.2803, "num_tokens": 4837977.0, "reward": 0.41984474658966064, "reward_std": 0.061386026442050934, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3692719340324402, "rewards/logprob_reward/std": 0.33502286672592163, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 718.8125, "completions/mean_terminated_length": 662.2963256835938, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.6111111111111112, "grad_norm": 1.8111324622489133, "kl": 0.0926513671875, "learning_rate": 4.891305293235351e-07, "loss": -0.3418, "num_tokens": 4867691.0, "reward": 0.39126724004745483, "reward_std": 0.19937953352928162, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3479357957839966, "rewards/logprob_reward/std": 0.3535456657409668, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 736.5, "completions/mean_terminated_length": 670.1538696289062, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.6141975308641975, "grad_norm": 2.2051283144800857, "kl": 0.080841064453125, "learning_rate": 4.889842402898569e-07, "loss": -0.2338, "num_tokens": 4897715.0, "reward": 0.29225510358810425, "reward_std": 0.1917974352836609, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2413945496082306, "rewards/logprob_reward/std": 0.28485146164894104, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 677.84375, "completions/mean_terminated_length": 628.3928833007812, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.6172839506172839, "grad_norm": 2.4896623779154545, "kl": 0.105499267578125, "learning_rate": 4.888369955673858e-07, "loss": -0.1518, "num_tokens": 4926218.0, "reward": 0.3781619369983673, "reward_std": 0.1798817217350006, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.32990211248397827, "rewards/logprob_reward/std": 0.3951297700405121, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 646.15625, "completions/mean_terminated_length": 620.9666748046875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6203703703703703, "grad_norm": 4.934636600648849, "kl": 0.110595703125, "learning_rate": 4.88688795744948e-07, "loss": -0.1108, "num_tokens": 4953279.0, "reward": 0.22090423107147217, "reward_std": 0.08543352782726288, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.14475470781326294, "rewards/logprob_reward/std": 0.18370603024959564, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 749.90625, "completions/mean_terminated_length": 673.1599731445312, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6234567901234568, "grad_norm": 1.9209734621045913, "kl": 0.09967041015625, "learning_rate": 4.885396414151888e-07, "loss": -0.1969, "num_tokens": 4984220.0, "reward": 0.197572261095047, "reward_std": 0.13941723108291626, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.14660805463790894, "rewards/logprob_reward/std": 0.24207018315792084, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 718.8125, "completions/mean_terminated_length": 687.2413940429688, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.6265432098765432, "grad_norm": 2.520095736202435, "kl": 0.0948486328125, "learning_rate": 4.883895331745707e-07, "loss": -0.3135, "num_tokens": 5013746.0, "reward": 0.3268764317035675, "reward_std": 0.19260549545288086, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2798627018928528, "rewards/logprob_reward/std": 0.28963401913642883, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 642.53125, "completions/mean_terminated_length": 630.2257690429688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6296296296296297, "grad_norm": 2.2038126782903373, "kl": 0.0816650390625, "learning_rate": 4.882384716233709e-07, "loss": 0.0166, "num_tokens": 5040431.0, "reward": 0.3898358941078186, "reward_std": 0.09005182981491089, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3255121111869812, "rewards/logprob_reward/std": 0.3180224597454071, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 687.5, "completions/mean_terminated_length": 652.6896362304688, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6327160493827161, "grad_norm": 2.2306632626185814, "kl": 0.092926025390625, "learning_rate": 4.880864573656785e-07, "loss": -0.2344, "num_tokens": 5068851.0, "reward": 0.2512916624546051, "reward_std": 0.10544370114803314, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.18893519043922424, "rewards/logprob_reward/std": 0.25276386737823486, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 715.4375, "completions/mean_terminated_length": 683.5172119140625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.6358024691358025, "grad_norm": 2.0654991880541957, "kl": 0.10076904296875, "learning_rate": 4.879334910093926e-07, "loss": -0.2016, "num_tokens": 5099057.0, "reward": 0.48595261573791504, "reward_std": 0.14742732048034668, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.44272512197494507, "rewards/logprob_reward/std": 0.3460351228713989, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 674.78125, "completions/mean_terminated_length": 651.5000610351562, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.6388888888888888, "grad_norm": 2.226426546883943, "kl": 0.1026611328125, "learning_rate": 4.877795731662202e-07, "loss": -0.0338, "num_tokens": 5127234.0, "reward": 0.33810973167419434, "reward_std": 0.19857513904571533, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2819274663925171, "rewards/logprob_reward/std": 0.3356766998767853, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 742.8125, "completions/mean_terminated_length": 677.923095703125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.6419753086419753, "grad_norm": 2.22327682608418, "kl": 0.1048583984375, "learning_rate": 4.876247044516724e-07, "loss": -0.1828, "num_tokens": 5158072.0, "reward": 0.3183407187461853, "reward_std": 0.09724409133195877, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.27037858963012695, "rewards/logprob_reward/std": 0.34532850980758667, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 640.84375, "completions/mean_terminated_length": 601.2069091796875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6450617283950617, "grad_norm": 2.5736294182664534, "kl": 0.09326171875, "learning_rate": 4.874688854850635e-07, "loss": -0.1787, "num_tokens": 5184423.0, "reward": 0.34399086236953735, "reward_std": 0.12161408364772797, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2849898636341095, "rewards/logprob_reward/std": 0.27221575379371643, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 720.5625, "completions/mean_terminated_length": 700.3333740234375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.6481481481481481, "grad_norm": 1.9387386287283574, "kl": 0.077728271484375, "learning_rate": 4.873121168895075e-07, "loss": -0.1242, "num_tokens": 5214013.0, "reward": 0.5308045744895935, "reward_std": 0.12567628920078278, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.48561617732048035, "rewards/logprob_reward/std": 0.3168899118900299, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 709.21875, "completions/mean_terminated_length": 676.6551513671875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.6512345679012346, "grad_norm": 2.1692617071671445, "kl": 0.09173583984375, "learning_rate": 4.87154399291916e-07, "loss": -0.0007, "num_tokens": 5243592.0, "reward": 0.3520561456680298, "reward_std": 0.12499307096004486, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29395127296447754, "rewards/logprob_reward/std": 0.2709003984928131, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 680.5, "completions/mean_terminated_length": 644.9655151367188, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.654320987654321, "grad_norm": 2.338252301779473, "kl": 0.094512939453125, "learning_rate": 4.869957333229955e-07, "loss": -0.4259, "num_tokens": 5271984.0, "reward": 0.312272846698761, "reward_std": 0.19007419049739838, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2601642608642578, "rewards/logprob_reward/std": 0.3032526671886444, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 703.375, "completions/mean_terminated_length": 693.0322265625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.6574074074074074, "grad_norm": 1.9652132064088863, "kl": 0.08453369140625, "learning_rate": 4.868361196172453e-07, "loss": -0.1062, "num_tokens": 5300860.0, "reward": 0.5366116762161255, "reward_std": 0.08457967638969421, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4920685291290283, "rewards/logprob_reward/std": 0.32778722047805786, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 748.8125, "completions/mean_terminated_length": 709.5000610351562, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.6604938271604939, "grad_norm": 1.7422296563928903, "kl": 0.072052001953125, "learning_rate": 4.866755588129542e-07, "loss": -0.236, "num_tokens": 5331294.0, "reward": 0.3879185616970062, "reward_std": 0.1506936103105545, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.340742826461792, "rewards/logprob_reward/std": 0.38254591822624207, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 730.59375, "completions/mean_terminated_length": 676.25927734375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.6635802469135802, "grad_norm": 2.0452720800171496, "kl": 0.08587646484375, "learning_rate": 4.86514051552199e-07, "loss": -0.0148, "num_tokens": 5361241.0, "reward": 0.24427306652069092, "reward_std": 0.12986135482788086, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.17766451835632324, "rewards/logprob_reward/std": 0.22817516326904297, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 687.8125, "completions/mean_terminated_length": 665.4000244140625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.6666666666666666, "grad_norm": 2.227291104844581, "kl": 0.080596923828125, "learning_rate": 4.863515984808408e-07, "loss": -0.2363, "num_tokens": 5389555.0, "reward": 0.4179430305957794, "reward_std": 0.1435537338256836, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3671589195728302, "rewards/logprob_reward/std": 0.37102246284484863, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 687.84375, "completions/mean_terminated_length": 665.433349609375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.6697530864197531, "grad_norm": 1.7395452064504249, "kl": 0.10162353515625, "learning_rate": 4.861882002485234e-07, "loss": -0.3227, "num_tokens": 5417926.0, "reward": 0.3230472803115845, "reward_std": 0.15676045417785645, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.26519137620925903, "rewards/logprob_reward/std": 0.2443922460079193, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 795.8125, "completions/mean_terminated_length": 731.9199829101562, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.6728395061728395, "grad_norm": 2.1231399343484165, "kl": 0.0921630859375, "learning_rate": 4.860238575086699e-07, "loss": 0.0536, "num_tokens": 5449976.0, "reward": 0.2749477028846741, "reward_std": 0.1642875075340271, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.218691885471344, "rewards/logprob_reward/std": 0.2859751582145691, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 811.90625, "completions/mean_terminated_length": 762.9615478515625, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.6759259259259259, "grad_norm": 2.544574699732086, "kl": 0.08770751953125, "learning_rate": 4.858585709184806e-07, "loss": -0.3042, "num_tokens": 5482533.0, "reward": 0.30098211765289307, "reward_std": 0.18488019704818726, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.26150792837142944, "rewards/logprob_reward/std": 0.32398635149002075, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 715.8125, "completions/mean_terminated_length": 671.7857666015625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.6790123456790124, "grad_norm": 2.112435547755203, "kl": 0.09173583984375, "learning_rate": 4.856923411389302e-07, "loss": -0.0934, "num_tokens": 5511555.0, "reward": 0.37127482891082764, "reward_std": 0.1326422393321991, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.31877756118774414, "rewards/logprob_reward/std": 0.35618674755096436, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 735.78125, "completions/mean_terminated_length": 716.5667114257812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.6820987654320988, "grad_norm": 1.841303603753745, "kl": 0.08319091796875, "learning_rate": 4.855251688347653e-07, "loss": -0.3203, "num_tokens": 5541180.0, "reward": 0.2010015994310379, "reward_std": 0.11758687347173691, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1295851171016693, "rewards/logprob_reward/std": 0.2320018708705902, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 694.78125, "completions/mean_terminated_length": 672.8333740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.6851851851851852, "grad_norm": 2.003922125897438, "kl": 0.0931396484375, "learning_rate": 4.853570546745014e-07, "loss": -0.1349, "num_tokens": 5570009.0, "reward": 0.24111150205135345, "reward_std": 0.04307954013347626, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.1706794649362564, "rewards/logprob_reward/std": 0.3048188090324402, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 745.40625, "completions/mean_terminated_length": 705.607177734375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.6882716049382716, "grad_norm": 2.4282773667483397, "kl": 0.1141357421875, "learning_rate": 4.851879993304208e-07, "loss": -0.1822, "num_tokens": 5601046.0, "reward": 0.43995678424835205, "reward_std": 0.2552984952926636, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.40203532576560974, "rewards/logprob_reward/std": 0.33179235458374023, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 703.40625, "completions/mean_terminated_length": 682.0333862304688, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.691358024691358, "grad_norm": 2.0749316493559946, "kl": 0.0931396484375, "learning_rate": 4.850180034785691e-07, "loss": -0.0973, "num_tokens": 5629707.0, "reward": 0.48421037197113037, "reward_std": 0.15176457166671753, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.43731704354286194, "rewards/logprob_reward/std": 0.3791246712207794, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 677.625, "completions/mean_terminated_length": 654.5333862304688, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.6944444444444444, "grad_norm": 2.294575833138646, "kl": 0.09112548828125, "learning_rate": 4.848470677987532e-07, "loss": -0.197, "num_tokens": 5657715.0, "reward": 0.5136059522628784, "reward_std": 0.05443252623081207, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4665066599845886, "rewards/logprob_reward/std": 0.30884963274002075, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 777.15625, "completions/mean_terminated_length": 680.5652465820312, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.6975308641975309, "grad_norm": 1.5509618835010677, "kl": 0.09674072265625, "learning_rate": 4.846751929745383e-07, "loss": -0.2875, "num_tokens": 5689200.0, "reward": 0.27743375301361084, "reward_std": 0.18377280235290527, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.2318708449602127, "rewards/logprob_reward/std": 0.3346301019191742, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 662.6666870117188, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.7006172839506173, "grad_norm": 2.2353307983772437, "kl": 0.09295654296875, "learning_rate": 4.845023796932454e-07, "loss": -0.2794, "num_tokens": 5719848.0, "reward": 0.2837667465209961, "reward_std": 0.20083284378051758, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.24237972497940063, "rewards/logprob_reward/std": 0.31649428606033325, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 758.625, "completions/mean_terminated_length": 697.3846435546875, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.7037037037037037, "grad_norm": 2.074424121844091, "kl": 0.0965576171875, "learning_rate": 4.84328628645948e-07, "loss": -0.186, "num_tokens": 5751084.0, "reward": 0.35601890087127686, "reward_std": 0.26967331767082214, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.31224319338798523, "rewards/logprob_reward/std": 0.32784223556518555, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 676.78125, "completions/mean_terminated_length": 653.6333618164062, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.7067901234567902, "grad_norm": 1.8463823175057392, "kl": 0.09515380859375, "learning_rate": 4.841539405274698e-07, "loss": -0.1342, "num_tokens": 5779157.0, "reward": 0.24320384860038757, "reward_std": 0.10634133964776993, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.17647649347782135, "rewards/logprob_reward/std": 0.2956610918045044, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 713.65625, "completions/mean_terminated_length": 681.5516967773438, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.7098765432098766, "grad_norm": 2.038286486729589, "kl": 0.11761474609375, "learning_rate": 4.839783160363821e-07, "loss": -0.105, "num_tokens": 5808654.0, "reward": 0.38105782866477966, "reward_std": 0.10874638706445694, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.326175332069397, "rewards/logprob_reward/std": 0.3062019646167755, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 754.09375, "completions/mean_terminated_length": 678.5199584960938, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7129629629629629, "grad_norm": 2.1010632555060735, "kl": 0.10198974609375, "learning_rate": 4.838017558750004e-07, "loss": -0.2953, "num_tokens": 5839113.0, "reward": 0.18882985413074493, "reward_std": 0.1580466628074646, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.12994983792304993, "rewards/logprob_reward/std": 0.2455306053161621, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 726.5, "completions/mean_terminated_length": 671.4074096679688, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.7160493827160493, "grad_norm": 2.492354049705988, "kl": 0.1112060546875, "learning_rate": 4.836242607493819e-07, "loss": -0.1973, "num_tokens": 5868717.0, "reward": 0.2798643708229065, "reward_std": 0.1862841248512268, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.22762708365917206, "rewards/logprob_reward/std": 0.28932100534439087, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 757.28125, "completions/mean_terminated_length": 695.7307739257812, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7191358024691358, "grad_norm": 4.1017117487913515, "kl": 0.1031494140625, "learning_rate": 4.834458313693228e-07, "loss": -0.2711, "num_tokens": 5900082.0, "reward": 0.2795795798301697, "reward_std": 0.15968964993953705, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2238384187221527, "rewards/logprob_reward/std": 0.28970155119895935, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 716.125, "completions/mean_terminated_length": 684.27587890625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.7222222222222222, "grad_norm": 2.119697827044121, "kl": 0.09783935546875, "learning_rate": 4.832664684483555e-07, "loss": -0.0437, "num_tokens": 5929802.0, "reward": 0.4019584655761719, "reward_std": 0.15852200984954834, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.35287052392959595, "rewards/logprob_reward/std": 0.33685198426246643, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 707.4375, "completions/mean_terminated_length": 686.3333740234375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.7253086419753086, "grad_norm": 1.9652007297918639, "kl": 0.089599609375, "learning_rate": 4.830861727037453e-07, "loss": -0.0638, "num_tokens": 5958812.0, "reward": 0.2758699953556061, "reward_std": 0.0699482411146164, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2058277726173401, "rewards/logprob_reward/std": 0.24846069514751434, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 721.03125, "completions/mean_terminated_length": 664.9259033203125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7283950617283951, "grad_norm": 3.904425767194542, "kl": 0.10546875, "learning_rate": 4.82904944856488e-07, "loss": -0.4219, "num_tokens": 5988081.0, "reward": 0.23152267932891846, "reward_std": 0.13742724061012268, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.1773863136768341, "rewards/logprob_reward/std": 0.23576560616493225, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 736.6875, "completions/mean_terminated_length": 706.9655151367188, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.7314814814814815, "grad_norm": 2.037214039927317, "kl": 0.08575439453125, "learning_rate": 4.827227856313066e-07, "loss": -0.1731, "num_tokens": 6018055.0, "reward": 0.4582667350769043, "reward_std": 0.14312902092933655, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4119630753993988, "rewards/logprob_reward/std": 0.37829849123954773, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 708.21875, "completions/mean_terminated_length": 649.74072265625, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.7345679012345679, "grad_norm": 1.9225924357012916, "kl": 0.09820556640625, "learning_rate": 4.825396957566491e-07, "loss": -0.0542, "num_tokens": 6046870.0, "reward": 0.2767597734928131, "reward_std": 0.07239697873592377, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2207053154706955, "rewards/logprob_reward/std": 0.3079781234264374, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 734.84375, "completions/mean_terminated_length": 693.5357666015625, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.7376543209876543, "grad_norm": 2.498066103023731, "kl": 0.0985107421875, "learning_rate": 4.823556759646847e-07, "loss": -0.0605, "num_tokens": 6077153.0, "reward": 0.338235467672348, "reward_std": 0.14264845848083496, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.278594970703125, "rewards/logprob_reward/std": 0.28185608983039856, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 755.34375, "completions/mean_terminated_length": 716.9642944335938, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.7407407407407407, "grad_norm": 1.6447000342865457, "kl": 0.0831298828125, "learning_rate": 4.821707269913016e-07, "loss": -0.2409, "num_tokens": 6107796.0, "reward": 0.2723207473754883, "reward_std": 0.18069088459014893, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2192452847957611, "rewards/logprob_reward/std": 0.271605908870697, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 778.28125, "completions/mean_terminated_length": 732.7777709960938, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7438271604938271, "grad_norm": 1.9966430203808947, "kl": 0.104736328125, "learning_rate": 4.819848495761037e-07, "loss": -0.0272, "num_tokens": 6139433.0, "reward": 0.19184328615665436, "reward_std": 0.09577944874763489, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.1298258900642395, "rewards/logprob_reward/std": 0.2797870635986328, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 664.75, "completions/mean_terminated_length": 664.75, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.7469135802469136, "grad_norm": 1.937399915565168, "kl": 0.10223388671875, "learning_rate": 4.817980444624076e-07, "loss": -0.1338, "num_tokens": 6167061.0, "reward": 0.29171323776245117, "reward_std": 0.034755174070596695, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.21995915472507477, "rewards/logprob_reward/std": 0.29154738783836365, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 775.71875, "completions/mean_terminated_length": 729.74072265625, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.75, "grad_norm": 1.9284257784970529, "kl": 0.1107177734375, "learning_rate": 4.816103123972395e-07, "loss": -0.096, "num_tokens": 6198380.0, "reward": 0.36926549673080444, "reward_std": 0.1388901025056839, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3269616663455963, "rewards/logprob_reward/std": 0.32015466690063477, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 720.0625, "completions/mean_terminated_length": 676.6428833007812, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.7530864197530864, "grad_norm": 1.7453639649506112, "kl": 0.10125732421875, "learning_rate": 4.814216541313329e-07, "loss": -0.244, "num_tokens": 6227946.0, "reward": 0.44019919633865356, "reward_std": 0.25532835721969604, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.39883241057395935, "rewards/logprob_reward/std": 0.3622204661369324, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 741.09375, "completions/mean_terminated_length": 700.6785888671875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.7561728395061729, "grad_norm": 2.3175228644105603, "kl": 0.11602783203125, "learning_rate": 4.812320704191252e-07, "loss": -0.2051, "num_tokens": 6258301.0, "reward": 0.19641299545764923, "reward_std": 0.10468196868896484, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.13143110275268555, "rewards/logprob_reward/std": 0.28800371289253235, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 699.5625, "completions/mean_terminated_length": 666.0, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.7592592592592593, "grad_norm": 2.302978785112872, "kl": 0.114593505859375, "learning_rate": 4.81041562018754e-07, "loss": -0.1101, "num_tokens": 6286927.0, "reward": 0.35996103286743164, "reward_std": 0.11623206734657288, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.30273449420928955, "rewards/logprob_reward/std": 0.33887872099876404, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 732.25, "completions/mean_terminated_length": 712.800048828125, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.7623456790123457, "grad_norm": 2.1879066389408868, "kl": 0.10333251953125, "learning_rate": 4.808501296920552e-07, "loss": 0.0304, "num_tokens": 6316967.0, "reward": 0.48715314269065857, "reward_std": 0.12873363494873047, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4371146261692047, "rewards/logprob_reward/std": 0.3410239815711975, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 702.34375, "completions/mean_terminated_length": 702.34375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.7654320987654321, "grad_norm": 2.018977310365683, "kl": 0.0953369140625, "learning_rate": 4.806577742045593e-07, "loss": -0.2515, "num_tokens": 6345946.0, "reward": 0.2761445641517639, "reward_std": 0.09076890349388123, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20960503816604614, "rewards/logprob_reward/std": 0.3140547573566437, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 652.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.7685185185185185, "grad_norm": 2.126308665049609, "kl": 0.11529541015625, "learning_rate": 4.804644963254887e-07, "loss": -0.1383, "num_tokens": 6375750.0, "reward": 0.3710324764251709, "reward_std": 0.21785402297973633, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3254527449607849, "rewards/logprob_reward/std": 0.32378634810447693, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 790.78125, "completions/mean_terminated_length": 736.9615478515625, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.7716049382716049, "grad_norm": 1.9683595436016477, "kl": 0.094970703125, "learning_rate": 4.80270296827754e-07, "loss": -0.0648, "num_tokens": 6407931.0, "reward": 0.2568047046661377, "reward_std": 0.1467161774635315, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.20200523734092712, "rewards/logprob_reward/std": 0.29080674052238464, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 709.03125, "completions/mean_terminated_length": 664.0357666015625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.7746913580246914, "grad_norm": 2.02219621428941, "kl": 0.0965576171875, "learning_rate": 4.800751764879516e-07, "loss": -0.1333, "num_tokens": 6437172.0, "reward": 0.2869783341884613, "reward_std": 0.1374148726463318, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2251148372888565, "rewards/logprob_reward/std": 0.2661113440990448, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 707.0625, "completions/mean_terminated_length": 648.370361328125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.7777777777777778, "grad_norm": 2.650275837802306, "kl": 0.09918212890625, "learning_rate": 4.798791360863602e-07, "loss": -0.2191, "num_tokens": 6465794.0, "reward": 0.15893587470054626, "reward_std": 0.10087311267852783, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.08978985995054245, "rewards/logprob_reward/std": 0.17918741703033447, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 766.3125, "completions/mean_terminated_length": 665.478271484375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.7808641975308642, "grad_norm": 2.1681189054338885, "kl": 0.11700439453125, "learning_rate": 4.796821764069378e-07, "loss": -0.1173, "num_tokens": 6496868.0, "reward": 0.21960289776325226, "reward_std": 0.1470838040113449, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.16414210200309753, "rewards/logprob_reward/std": 0.2283983677625656, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 681.34375, "completions/mean_terminated_length": 670.290283203125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.7839506172839507, "grad_norm": 2.0348175170345644, "kl": 0.10504150390625, "learning_rate": 4.794842982373188e-07, "loss": -0.1501, "num_tokens": 6524935.0, "reward": 0.4150526225566864, "reward_std": 0.10051368176937103, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3570029139518738, "rewards/logprob_reward/std": 0.3018735647201538, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 760.875, "completions/mean_terminated_length": 687.2000122070312, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.7870370370370371, "grad_norm": 2.277477024934102, "kl": 0.1044921875, "learning_rate": 4.7928550236881e-07, "loss": 0.1017, "num_tokens": 6556051.0, "reward": 0.34174492955207825, "reward_std": 0.15150824189186096, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.296383261680603, "rewards/logprob_reward/std": 0.2948094606399536, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 727.40625, "completions/mean_terminated_length": 707.6333618164062, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.7901234567901234, "grad_norm": 2.010386321028003, "kl": 0.10223388671875, "learning_rate": 4.790857895963888e-07, "loss": -0.0483, "num_tokens": 6586000.0, "reward": 0.4248571991920471, "reward_std": 0.11917746812105179, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3713691234588623, "rewards/logprob_reward/std": 0.2981397807598114, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 739.625, "completions/mean_terminated_length": 686.9629516601562, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.7932098765432098, "grad_norm": 3.48178511274401, "kl": 0.12030029296875, "learning_rate": 4.788851607186988e-07, "loss": -0.3382, "num_tokens": 6616112.0, "reward": 0.2827301323413849, "reward_std": 0.14029467105865479, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.23081126809120178, "rewards/logprob_reward/std": 0.2792452573776245, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 686.857177734375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.7962962962962963, "grad_norm": 2.3879332441911156, "kl": 0.10546875, "learning_rate": 4.786836165380472e-07, "loss": -0.0436, "num_tokens": 6645916.0, "reward": 0.2804918885231018, "reward_std": 0.11741897463798523, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.21443545818328857, "rewards/logprob_reward/std": 0.2457936704158783, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 654.0625, "completions/mean_terminated_length": 654.0625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7993827160493827, "grad_norm": 2.105496974052558, "kl": 0.1217041015625, "learning_rate": 4.784811578604013e-07, "loss": -0.0349, "num_tokens": 6673290.0, "reward": 0.39803528785705566, "reward_std": 0.08641330897808075, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3346225619316101, "rewards/logprob_reward/std": 0.2850218713283539, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 720.15625, "completions/mean_terminated_length": 699.9000244140625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.8024691358024691, "grad_norm": 2.2732558389688924, "kl": 0.1175537109375, "learning_rate": 4.782777854953857e-07, "loss": -0.115, "num_tokens": 6703079.0, "reward": 0.5350664854049683, "reward_std": 0.17675259709358215, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4972960948944092, "rewards/logprob_reward/std": 0.31846997141838074, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 707.0625, "completions/mean_terminated_length": 674.27587890625, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.8055555555555556, "grad_norm": 2.090397292742311, "kl": 0.09576416015625, "learning_rate": 4.780735002562785e-07, "loss": -0.0547, "num_tokens": 6731757.0, "reward": 0.4585440158843994, "reward_std": 0.18908634781837463, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.41227108240127563, "rewards/logprob_reward/std": 0.34671926498413086, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 682.75, "completions/mean_terminated_length": 634.0, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.808641975308642, "grad_norm": 2.259186209969013, "kl": 0.1063232421875, "learning_rate": 4.778683029600089e-07, "loss": -0.1848, "num_tokens": 6760133.0, "reward": 0.4434816241264343, "reward_std": 0.20177386701107025, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3990073502063751, "rewards/logprob_reward/std": 0.32247433066368103, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 725.03125, "completions/mean_terminated_length": 669.6666870117188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8117283950617284, "grad_norm": 2.1504278652890245, "kl": 0.10552978515625, "learning_rate": 4.776621944271526e-07, "loss": -0.1676, "num_tokens": 6789802.0, "reward": 0.3800508379936218, "reward_std": 0.14062321186065674, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3389453589916229, "rewards/logprob_reward/std": 0.3451632857322693, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 685.03125, "completions/mean_terminated_length": 636.607177734375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.8148148148148148, "grad_norm": 1.9486295609493005, "kl": 0.095703125, "learning_rate": 4.774551754819299e-07, "loss": -0.246, "num_tokens": 6817795.0, "reward": 0.5113019943237305, "reward_std": 0.2092399150133133, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4743633568286896, "rewards/logprob_reward/std": 0.33239826560020447, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 711.96875, "completions/mean_terminated_length": 667.3928833007812, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.8179012345679012, "grad_norm": 1.8120780116867488, "kl": 0.10137939453125, "learning_rate": 4.772472469522015e-07, "loss": -0.3078, "num_tokens": 6846490.0, "reward": 0.2990865707397461, "reward_std": 0.17820273339748383, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.24898508191108704, "rewards/logprob_reward/std": 0.27821919322013855, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 678.375, "completions/mean_terminated_length": 642.6206665039062, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.8209876543209876, "grad_norm": 2.3405117272589933, "kl": 0.11163330078125, "learning_rate": 4.770384096694658e-07, "loss": -0.0506, "num_tokens": 6874766.0, "reward": 0.37974855303764343, "reward_std": 0.11727327108383179, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3247206211090088, "rewards/logprob_reward/std": 0.29630932211875916, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 748.15625, "completions/mean_terminated_length": 697.0740966796875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.8240740740740741, "grad_norm": 2.1302140637966676, "kl": 0.113006591796875, "learning_rate": 4.7682866446885475e-07, "loss": -0.2283, "num_tokens": 6905571.0, "reward": 0.26630598306655884, "reward_std": 0.12580719590187073, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.209089994430542, "rewards/logprob_reward/std": 0.27123624086380005, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 700.71875, "completions/mean_terminated_length": 679.1666870117188, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.8271604938271605, "grad_norm": 1.9590249611373451, "kl": 0.10089111328125, "learning_rate": 4.766180121891316e-07, "loss": -0.1551, "num_tokens": 6933902.0, "reward": 0.37193965911865234, "reward_std": 0.0778612494468689, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31257182359695435, "rewards/logprob_reward/std": 0.31078892946243286, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 694.3125, "completions/mean_terminated_length": 683.6774291992188, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.8302469135802469, "grad_norm": 1.9929876862465215, "kl": 0.1029052734375, "learning_rate": 4.7640645367268663e-07, "loss": 0.0214, "num_tokens": 6962620.0, "reward": 0.47372347116470337, "reward_std": 0.03608018904924393, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.418720543384552, "rewards/logprob_reward/std": 0.294933021068573, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 672.28125, "completions/mean_terminated_length": 648.8333740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.8333333333333334, "grad_norm": 2.2922377246519456, "kl": 0.12060546875, "learning_rate": 4.761939897655343e-07, "loss": -0.0849, "num_tokens": 6990777.0, "reward": 0.4232385456562042, "reward_std": 0.15004797279834747, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3730428218841553, "rewards/logprob_reward/std": 0.3006185293197632, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 707.28125, "completions/mean_terminated_length": 662.0357666015625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.8364197530864198, "grad_norm": 2.1954762823102363, "kl": 0.1026611328125, "learning_rate": 4.7598062131730943e-07, "loss": -0.1382, "num_tokens": 7019722.0, "reward": 0.30681371688842773, "reward_std": 0.14030200242996216, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2471541464328766, "rewards/logprob_reward/std": 0.2665797472000122, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 649.8125, "completions/mean_terminated_length": 637.741943359375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.8395061728395061, "grad_norm": 2.005757867403437, "kl": 0.12823486328125, "learning_rate": 4.757663491812644e-07, "loss": -0.1721, "num_tokens": 7047068.0, "reward": 0.3860414922237396, "reward_std": 0.13933084905147552, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3282405138015747, "rewards/logprob_reward/std": 0.3020530939102173, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 683.9375, "completions/mean_terminated_length": 648.7586059570312, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.8425925925925926, "grad_norm": 1.9488945057858018, "kl": 0.109619140625, "learning_rate": 4.755511742142652e-07, "loss": -0.2613, "num_tokens": 7075522.0, "reward": 0.2645326852798462, "reward_std": 0.1746543049812317, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2105918824672699, "rewards/logprob_reward/std": 0.2603277862071991, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 718.40625, "completions/mean_terminated_length": 661.8148193359375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.845679012345679, "grad_norm": 1.892600402930633, "kl": 0.12042236328125, "learning_rate": 4.753350972767883e-07, "loss": -0.2625, "num_tokens": 7105043.0, "reward": 0.37988555431365967, "reward_std": 0.21918469667434692, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.33528950810432434, "rewards/logprob_reward/std": 0.3081396222114563, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 651.8125, "completions/mean_terminated_length": 639.8064575195312, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8487654320987654, "grad_norm": 2.051206042554904, "kl": 0.1229248046875, "learning_rate": 4.75118119232917e-07, "loss": -0.2496, "num_tokens": 7132297.0, "reward": 0.3214179575443268, "reward_std": 0.08594992011785507, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.26338106393814087, "rewards/logprob_reward/std": 0.30437248945236206, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 751.375, "completions/mean_terminated_length": 688.4615478515625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.8518518518518519, "grad_norm": 2.7647632957341846, "kl": 0.12005615234375, "learning_rate": 4.749002409503382e-07, "loss": -0.2191, "num_tokens": 7162805.0, "reward": 0.43399733304977417, "reward_std": 0.24642342329025269, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.39541369676589966, "rewards/logprob_reward/std": 0.3361734449863434, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 672.5625, "completions/mean_terminated_length": 636.2069091796875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8549382716049383, "grad_norm": 2.408109282019256, "kl": 0.11505126953125, "learning_rate": 4.7468146330033874e-07, "loss": -0.3092, "num_tokens": 7190803.0, "reward": 0.38066565990448, "reward_std": 0.2220057249069214, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.33615630865097046, "rewards/logprob_reward/std": 0.32265549898147583, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 687.28125, "completions/mean_terminated_length": 687.28125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8580246913580247, "grad_norm": 1.9371262428910607, "kl": 0.100341796875, "learning_rate": 4.7446178715780213e-07, "loss": -0.0487, "num_tokens": 7219380.0, "reward": 0.2970954179763794, "reward_std": 0.018885593861341476, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2189948856830597, "rewards/logprob_reward/std": 0.29848241806030273, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 685.65625, "completions/mean_terminated_length": 663.1000366210938, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.8611111111111112, "grad_norm": 2.1007398216508286, "kl": 0.10491943359375, "learning_rate": 4.742412134012047e-07, "loss": -0.0085, "num_tokens": 7247869.0, "reward": 0.3925120532512665, "reward_std": 0.12278018891811371, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3354300856590271, "rewards/logprob_reward/std": 0.3034224510192871, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 755.28125, "completions/mean_terminated_length": 650.1304321289062, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.8641975308641975, "grad_norm": 2.237300518042569, "kl": 0.1414794921875, "learning_rate": 4.740197429126125e-07, "loss": -0.0329, "num_tokens": 7278878.0, "reward": 0.35994982719421387, "reward_std": 0.1673746556043625, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.3270275592803955, "rewards/logprob_reward/std": 0.37508079409599304, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 727.3125, "completions/mean_terminated_length": 717.741943359375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.8672839506172839, "grad_norm": 2.0750234082205523, "kl": 0.11077880859375, "learning_rate": 4.7379737657767745e-07, "loss": 0.0197, "num_tokens": 7308996.0, "reward": 0.2957283854484558, "reward_std": 0.08468891680240631, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2313648760318756, "rewards/logprob_reward/std": 0.26769381761550903, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 644.125, "completions/mean_terminated_length": 631.8709716796875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.8703703703703703, "grad_norm": 2.2410120495236976, "kl": 0.1204833984375, "learning_rate": 4.7357411528563393e-07, "loss": -0.0206, "num_tokens": 7335968.0, "reward": 0.34602269530296326, "reward_std": 0.09784792363643646, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.276830792427063, "rewards/logprob_reward/std": 0.2851298749446869, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 712.96875, "completions/mean_terminated_length": 680.7930908203125, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.8734567901234568, "grad_norm": 2.084560893074803, "kl": 0.1170654296875, "learning_rate": 4.733499599292955e-07, "loss": -0.1244, "num_tokens": 7365387.0, "reward": 0.39114806056022644, "reward_std": 0.19552147388458252, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.34433120489120483, "rewards/logprob_reward/std": 0.3126448094844818, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 737.75, "completions/mean_terminated_length": 671.6923217773438, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8765432098765432, "grad_norm": 1.9122752266727643, "kl": 0.11822509765625, "learning_rate": 4.7312491140505064e-07, "loss": -0.2139, "num_tokens": 7395647.0, "reward": 0.17777985334396362, "reward_std": 0.11817943304777145, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.12461650371551514, "rewards/logprob_reward/std": 0.2463804930448532, "step": 284 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 732.1875, "completions/mean_terminated_length": 678.1481323242188, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.8796296296296297, "grad_norm": 1.7911690041769892, "kl": NaN, "learning_rate": 4.7289897061285965e-07, "loss": -0.1698, "num_tokens": 7425817.0, "reward": 0.2818986475467682, "reward_std": 0.1355532854795456, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.22988739609718323, "rewards/logprob_reward/std": 0.2847767174243927, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 747.1875, "completions/mean_terminated_length": 669.6799926757812, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.8827160493827161, "grad_norm": 2.035653918904492, "kl": 0.14349365234375, "learning_rate": 4.726721384562513e-07, "loss": -0.1046, "num_tokens": 7456347.0, "reward": 0.32456451654434204, "reward_std": 0.13731923699378967, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2738216519355774, "rewards/logprob_reward/std": 0.34797921776771545, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 686.875, "completions/mean_terminated_length": 652.0, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.8858024691358025, "grad_norm": 2.1449611837929754, "kl": 0.1302490234375, "learning_rate": 4.724444158423185e-07, "loss": -0.1496, "num_tokens": 7484391.0, "reward": 0.28367137908935547, "reward_std": 0.1436767578125, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.22144046425819397, "rewards/logprob_reward/std": 0.2796669006347656, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 718.375, "completions/mean_terminated_length": 686.7586059570312, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.8888888888888888, "grad_norm": 2.3084130069818953, "kl": 0.115234375, "learning_rate": 4.722158036817154e-07, "loss": -0.0224, "num_tokens": 7514167.0, "reward": 0.3251020312309265, "reward_std": 0.13172954320907593, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26400226354599, "rewards/logprob_reward/std": 0.2288665473461151, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 748.3125, "completions/mean_terminated_length": 729.933349609375, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.8919753086419753, "grad_norm": 2.3676136004634407, "kl": 0.12091064453125, "learning_rate": 4.7198630288865304e-07, "loss": -0.1594, "num_tokens": 7545213.0, "reward": 0.4276943504810333, "reward_std": 0.08860848844051361, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.38146597146987915, "rewards/logprob_reward/std": 0.3491162359714508, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 710.125, "completions/mean_terminated_length": 677.6551513671875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.8950617283950617, "grad_norm": 1.9780092258513708, "kl": 0.1328125, "learning_rate": 4.7175591438089646e-07, "loss": -0.2018, "num_tokens": 7574849.0, "reward": 0.34616440534591675, "reward_std": 0.1409263163805008, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2943493723869324, "rewards/logprob_reward/std": 0.31612512469291687, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 690.3125, "completions/mean_terminated_length": 679.54833984375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.8981481481481481, "grad_norm": 1.9459370408708565, "kl": 0.11358642578125, "learning_rate": 4.7152463907976024e-07, "loss": -0.1033, "num_tokens": 7603247.0, "reward": 0.44349992275238037, "reward_std": 0.13985233008861542, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3920832872390747, "rewards/logprob_reward/std": 0.3231857717037201, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 729.5, "completions/mean_terminated_length": 687.4285888671875, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.9012345679012346, "grad_norm": 1.7459681063228274, "kl": 0.151611328125, "learning_rate": 4.7129247791010563e-07, "loss": -0.1025, "num_tokens": 7633691.0, "reward": 0.35797709226608276, "reward_std": 0.033678025007247925, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3040023148059845, "rewards/logprob_reward/std": 0.30642253160476685, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 671.25, "completions/mean_terminated_length": 647.7333374023438, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.904320987654321, "grad_norm": 1.7416547461094307, "kl": 0.12432861328125, "learning_rate": 4.710594318003361e-07, "loss": -0.1443, "num_tokens": 7661111.0, "reward": 0.2736697196960449, "reward_std": 0.06426462531089783, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.20338305830955505, "rewards/logprob_reward/std": 0.27567237615585327, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 688.25, "completions/mean_terminated_length": 677.4193115234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.9074074074074074, "grad_norm": 2.0382329525723026, "kl": 0.11956787109375, "learning_rate": 4.7082550168239423e-07, "loss": -0.0941, "num_tokens": 7689359.0, "reward": 0.45611751079559326, "reward_std": 0.0611182302236557, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3991583585739136, "rewards/logprob_reward/std": 0.24638453125953674, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 705.875, "completions/mean_terminated_length": 672.9655151367188, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.9104938271604939, "grad_norm": 1.6997186800695174, "kl": 0.1123046875, "learning_rate": 4.705906884917573e-07, "loss": -0.1247, "num_tokens": 7718163.0, "reward": 0.36115822196006775, "reward_std": 0.07814788073301315, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.30406469106674194, "rewards/logprob_reward/std": 0.3603592813014984, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 726.9375, "completions/mean_terminated_length": 696.2069091796875, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.9135802469135802, "grad_norm": 2.128888901073959, "kl": 0.140869140625, "learning_rate": 4.703549931674345e-07, "loss": -0.0638, "num_tokens": 7748681.0, "reward": 0.31171292066574097, "reward_std": 0.03939829021692276, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24912545084953308, "rewards/logprob_reward/std": 0.28425532579421997, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 698.03125, "completions/mean_terminated_length": 698.03125, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.9166666666666666, "grad_norm": 1.9763276403942804, "kl": 0.117431640625, "learning_rate": 4.7011841665196227e-07, "loss": 0.0102, "num_tokens": 7777134.0, "reward": 0.48798149824142456, "reward_std": 0.06521964818239212, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.4310905337333679, "rewards/logprob_reward/std": 0.27356192469596863, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 695.5, "completions/mean_terminated_length": 648.5714721679688, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.9197530864197531, "grad_norm": 1.8169000743040138, "kl": 0.16180419921875, "learning_rate": 4.6988095989140096e-07, "loss": -0.1007, "num_tokens": 7805910.0, "reward": 0.32878902554512024, "reward_std": 0.04120553657412529, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2750433683395386, "rewards/logprob_reward/std": 0.3153917193412781, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 729.03125, "completions/mean_terminated_length": 719.51611328125, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.9228395061728395, "grad_norm": 2.073674555529647, "kl": 0.12060546875, "learning_rate": 4.6964262383533114e-07, "loss": 0.0252, "num_tokens": 7835863.0, "reward": 0.3372179865837097, "reward_std": 0.030503489077091217, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2670477628707886, "rewards/logprob_reward/std": 0.31318360567092896, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 776.46875, "completions/mean_terminated_length": 719.34619140625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.9259259259259259, "grad_norm": 1.7220736316361636, "kl": 0.1512451171875, "learning_rate": 4.694034094368495e-07, "loss": -0.0767, "num_tokens": 7867530.0, "reward": 0.26034015417099, "reward_std": 0.11157214641571045, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.20246127247810364, "rewards/logprob_reward/std": 0.29840922355651855, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 686.28125, "completions/mean_terminated_length": 663.7667236328125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9290123456790124, "grad_norm": 1.9757350220952408, "kl": 0.130615234375, "learning_rate": 4.691633176525651e-07, "loss": -0.1141, "num_tokens": 7896011.0, "reward": 0.29782772064208984, "reward_std": 0.1299738734960556, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24064189195632935, "rewards/logprob_reward/std": 0.2755417823791504, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 716.75, "completions/mean_terminated_length": 716.75, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.9320987654320988, "grad_norm": 1.9809728655743788, "kl": 0.1226806640625, "learning_rate": 4.689223494425959e-07, "loss": -0.131, "num_tokens": 7925179.0, "reward": 0.4906806945800781, "reward_std": 0.1080249547958374, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.44103413820266724, "rewards/logprob_reward/std": 0.33539119362831116, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 771.59375, "completions/mean_terminated_length": 724.8518676757812, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.9351851851851852, "grad_norm": 1.781295235393144, "kl": 0.12567138671875, "learning_rate": 4.686805057705645e-07, "loss": -0.1597, "num_tokens": 7956362.0, "reward": 0.40358904004096985, "reward_std": 0.09109903872013092, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3650989532470703, "rewards/logprob_reward/std": 0.32141250371932983, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 735.125, "completions/mean_terminated_length": 715.86669921875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.9382716049382716, "grad_norm": 2.066848233935454, "kl": 0.1314697265625, "learning_rate": 4.684377876035944e-07, "loss": -0.0172, "num_tokens": 7986794.0, "reward": 0.5110626220703125, "reward_std": 0.09560798108577728, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4636806845664978, "rewards/logprob_reward/std": 0.3362780809402466, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 721.21875, "completions/mean_terminated_length": 701.0333862304688, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.941358024691358, "grad_norm": 2.039354206326112, "kl": 0.1279296875, "learning_rate": 4.681941959123063e-07, "loss": -0.1306, "num_tokens": 8015981.0, "reward": 0.3085150718688965, "reward_std": 0.07853810489177704, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2421000748872757, "rewards/logprob_reward/std": 0.29355353116989136, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 722.9375, "completions/mean_terminated_length": 713.2257690429688, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.9444444444444444, "grad_norm": 1.8667147798112715, "kl": 0.134033203125, "learning_rate": 4.6794973167081397e-07, "loss": -0.1184, "num_tokens": 8045419.0, "reward": 0.3769611120223999, "reward_std": 0.1390760838985443, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3250957131385803, "rewards/logprob_reward/std": 0.3121023178100586, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 725.15625, "completions/mean_terminated_length": 694.2413940429688, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.9475308641975309, "grad_norm": 2.0930250578095917, "kl": 0.1412353515625, "learning_rate": 4.6770439585672046e-07, "loss": -0.1245, "num_tokens": 8075028.0, "reward": 0.26745402812957764, "reward_std": 0.1428137570619583, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20342113077640533, "rewards/logprob_reward/std": 0.26819995045661926, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 714.5625, "completions/mean_terminated_length": 693.933349609375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.9506172839506173, "grad_norm": 1.8712907059513373, "kl": 0.12957763671875, "learning_rate": 4.6745818945111426e-07, "loss": -0.08, "num_tokens": 8104590.0, "reward": 0.2642862796783447, "reward_std": 0.06831199675798416, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.19642920792102814, "rewards/logprob_reward/std": 0.3182244300842285, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 698.625, "completions/mean_terminated_length": 688.1290283203125, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.9537037037037037, "grad_norm": 1.6835031736693717, "kl": 0.13104248046875, "learning_rate": 4.6721111343856547e-07, "loss": -0.1737, "num_tokens": 8133370.0, "reward": 0.3306589722633362, "reward_std": 0.17537572979927063, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27712106704711914, "rewards/logprob_reward/std": 0.32108616828918457, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 683.28125, "completions/mean_terminated_length": 672.290283203125, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.9567901234567902, "grad_norm": 1.9747688110779889, "kl": 0.12689208984375, "learning_rate": 4.669631688071214e-07, "loss": 0.0718, "num_tokens": 8161363.0, "reward": 0.37768062949180603, "reward_std": 0.09226556867361069, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.31200623512268066, "rewards/logprob_reward/std": 0.28680598735809326, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 774.75, "completions/mean_terminated_length": 717.2307739257812, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.9598765432098766, "grad_norm": 1.9413084485588443, "kl": 0.12567138671875, "learning_rate": 4.667143565483032e-07, "loss": 0.0074, "num_tokens": 8192767.0, "reward": 0.356092244386673, "reward_std": 0.1613890528678894, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30538028478622437, "rewards/logprob_reward/std": 0.3577762544155121, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 785.34375, "completions/mean_terminated_length": 718.5199584960938, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.9629629629629629, "grad_norm": 1.8844171399778213, "kl": 0.1419677734375, "learning_rate": 4.664646776571015e-07, "loss": -0.2315, "num_tokens": 8224598.0, "reward": 0.24053294956684113, "reward_std": 0.1699310541152954, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.18739773333072662, "rewards/logprob_reward/std": 0.2319127321243286, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 756.28125, "completions/mean_terminated_length": 747.6451416015625, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.9660493827160493, "grad_norm": 1.6507203118851788, "kl": 0.13739013671875, "learning_rate": 4.662141331319726e-07, "loss": -0.0723, "num_tokens": 8255867.0, "reward": 0.21402481198310852, "reward_std": 0.028427444398403168, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.1371109038591385, "rewards/logprob_reward/std": 0.3138679265975952, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 719.0, "completions/mean_terminated_length": 698.6666870117188, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.9691358024691358, "grad_norm": 1.9618433857187108, "kl": 0.1502685546875, "learning_rate": 4.6596272397483445e-07, "loss": -0.2105, "num_tokens": 8285375.0, "reward": 0.3375122845172882, "reward_std": 0.16897845268249512, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.28473588824272156, "rewards/logprob_reward/std": 0.30175507068634033, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 717.1875, "completions/mean_terminated_length": 717.1875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.9722222222222222, "grad_norm": 1.8249678996016583, "kl": 0.1307373046875, "learning_rate": 4.657104511910626e-07, "loss": -0.0646, "num_tokens": 8314581.0, "reward": 0.4501153826713562, "reward_std": 0.08264179527759552, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.39943376183509827, "rewards/logprob_reward/std": 0.3258548378944397, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 716.15625, "completions/mean_terminated_length": 695.6333618164062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.9753086419753086, "grad_norm": 1.5563293400202056, "kl": 0.15167236328125, "learning_rate": 4.654573157894861e-07, "loss": -0.1675, "num_tokens": 8343934.0, "reward": 0.37637853622436523, "reward_std": 0.20345339179039001, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3244483768939972, "rewards/logprob_reward/std": 0.29473960399627686, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 694.03125, "completions/mean_terminated_length": 683.3870849609375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.9783950617283951, "grad_norm": 1.7930505786195439, "kl": 0.14288330078125, "learning_rate": 4.652033187823838e-07, "loss": -0.0894, "num_tokens": 8372615.0, "reward": 0.29526612162590027, "reward_std": 0.08803264796733856, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23432348668575287, "rewards/logprob_reward/std": 0.3486010730266571, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 715.71875, "completions/mean_terminated_length": 695.1666870117188, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.9814814814814815, "grad_norm": 1.7476454960594117, "kl": 0.143310546875, "learning_rate": 4.6494846118548e-07, "loss": -0.0786, "num_tokens": 8401646.0, "reward": 0.30523213744163513, "reward_std": 0.09354106336832047, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24192458391189575, "rewards/logprob_reward/std": 0.26693663001060486, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 715.34375, "completions/mean_terminated_length": 694.7667236328125, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.9845679012345679, "grad_norm": 2.0188412631321846, "kl": 0.1619873046875, "learning_rate": 4.6469274401794044e-07, "loss": -0.0313, "num_tokens": 8430941.0, "reward": 0.3138546347618103, "reward_std": 0.10602065920829773, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2515051066875458, "rewards/logprob_reward/std": 0.33907023072242737, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 740.71875, "completions/mean_terminated_length": 711.413818359375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.9876543209876543, "grad_norm": 1.8178209498763729, "kl": 0.1690673828125, "learning_rate": 4.6443616830236823e-07, "loss": -0.0981, "num_tokens": 8461696.0, "reward": 0.3523213863372803, "reward_std": 0.09807136654853821, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29424601793289185, "rewards/logprob_reward/std": 0.3174029290676117, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 751.34375, "completions/mean_terminated_length": 712.3928833007812, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.9907407407407407, "grad_norm": 1.6941964188309575, "kl": 0.15374755859375, "learning_rate": 4.641787350647997e-07, "loss": -0.186, "num_tokens": 8492747.0, "reward": 0.4111780524253845, "reward_std": 0.14883726835250854, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.37005895376205444, "rewards/logprob_reward/std": 0.3187277317047119, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 771.4375, "completions/mean_terminated_length": 754.6000366210938, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.9938271604938271, "grad_norm": 1.66971819588201, "kl": 0.141357421875, "learning_rate": 4.6392044533470053e-07, "loss": -0.1522, "num_tokens": 8524373.0, "reward": 0.3567969799041748, "reward_std": 0.1542615443468094, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2992188632488251, "rewards/logprob_reward/std": 0.2843568027019501, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 793.5625, "completions/mean_terminated_length": 769.72412109375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.9969135802469136, "grad_norm": 1.8096214005562896, "kl": 0.15576171875, "learning_rate": 4.636613001449615e-07, "loss": 0.0458, "num_tokens": 8556567.0, "reward": 0.38746386766433716, "reward_std": 0.08819326758384705, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33329319953918457, "rewards/logprob_reward/std": 0.31957486271858215, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 683.75, "completions/mean_terminated_length": 672.774169921875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.0, "grad_norm": 1.8037360522082957, "kl": 0.14910888671875, "learning_rate": 4.6340130053189417e-07, "loss": -0.064, "num_tokens": 8585099.0, "reward": 0.2172153741121292, "reward_std": 0.10121525079011917, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.14065596461296082, "rewards/logprob_reward/std": 0.24197712540626526, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 707.09375, "completions/mean_terminated_length": 696.8709716796875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 1.0030864197530864, "grad_norm": 1.5600153227909415, "kl": 0.16265869140625, "learning_rate": 4.6314044753522703e-07, "loss": -0.1907, "num_tokens": 8613970.0, "reward": 0.38201913237571716, "reward_std": 0.14706704020500183, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.32724347710609436, "rewards/logprob_reward/std": 0.2946895658969879, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 634.15625, "completions/mean_terminated_length": 634.15625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.0061728395061729, "grad_norm": 1.839129429019458, "kl": 0.16448974609375, "learning_rate": 4.6287874219810117e-07, "loss": -0.1229, "num_tokens": 8640415.0, "reward": 0.41193997859954834, "reward_std": 0.07707273215055466, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3535444140434265, "rewards/logprob_reward/std": 0.3378883898258209, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 695.40625, "completions/mean_terminated_length": 684.8064575195312, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.0092592592592593, "grad_norm": 1.7689225377267597, "kl": 0.15240478515625, "learning_rate": 4.626161855670663e-07, "loss": -0.0873, "num_tokens": 8668912.0, "reward": 0.34593120217323303, "reward_std": 0.0575147345662117, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.28367358446121216, "rewards/logprob_reward/std": 0.30742818117141724, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 771.28125, "completions/mean_terminated_length": 745.137939453125, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 1.0123456790123457, "grad_norm": 1.5296042545199942, "kl": 0.16119384765625, "learning_rate": 4.623527786920761e-07, "loss": -0.3191, "num_tokens": 8700349.0, "reward": 0.3128499984741211, "reward_std": 0.19204246997833252, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.26080557703971863, "rewards/logprob_reward/std": 0.30975356698036194, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 681.59375, "completions/mean_terminated_length": 670.54833984375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.0154320987654322, "grad_norm": 1.7405126459085807, "kl": 0.1654052734375, "learning_rate": 4.620885226264847e-07, "loss": -0.1985, "num_tokens": 8728256.0, "reward": 0.4890057444572449, "reward_std": 0.20618030428886414, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.44611746072769165, "rewards/logprob_reward/std": 0.35616156458854675, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 742.625, "completions/mean_terminated_length": 702.4285888671875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.0185185185185186, "grad_norm": 2.044178040000131, "kl": 0.15966796875, "learning_rate": 4.6182341842704177e-07, "loss": -0.2793, "num_tokens": 8759044.0, "reward": 0.2933819890022278, "reward_std": 0.1604938805103302, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.24264663457870483, "rewards/logprob_reward/std": 0.2976343631744385, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 684.03125, "completions/mean_terminated_length": 673.0645141601562, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.021604938271605, "grad_norm": 1.6997353638750023, "kl": 0.1563720703125, "learning_rate": 4.6155746715388903e-07, "loss": -0.044, "num_tokens": 8787113.0, "reward": 0.40067258477211, "reward_std": 0.1611134111881256, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.34796953201293945, "rewards/logprob_reward/std": 0.34093475341796875, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 674.125, "completions/mean_terminated_length": 674.125, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 1.0246913580246915, "grad_norm": 1.6287015835686807, "kl": 0.1541748046875, "learning_rate": 4.6129066987055533e-07, "loss": -0.169, "num_tokens": 8814689.0, "reward": 0.2877604365348816, "reward_std": 0.08794932812452316, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.21556717157363892, "rewards/logprob_reward/std": 0.26832330226898193, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 727.15625, "completions/mean_terminated_length": 696.4483032226562, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.0277777777777777, "grad_norm": 2.3934086349553234, "kl": 0.15423583984375, "learning_rate": 4.610230276439526e-07, "loss": -0.221, "num_tokens": 8844294.0, "reward": 0.37226366996765137, "reward_std": 0.2540280818939209, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3268207311630249, "rewards/logprob_reward/std": 0.35140475630760193, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 772.03125, "completions/mean_terminated_length": 736.0357666015625, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 1.0308641975308641, "grad_norm": 1.7276905323943375, "kl": 0.1561279296875, "learning_rate": 4.607545415443721e-07, "loss": -0.0786, "num_tokens": 8875699.0, "reward": 0.4880432188510895, "reward_std": 0.18752965331077576, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4450480341911316, "rewards/logprob_reward/std": 0.34920734167099, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 699.8125, "completions/mean_terminated_length": 689.3547973632812, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.0339506172839505, "grad_norm": 1.8100396954895857, "kl": 0.148193359375, "learning_rate": 4.604852126454792e-07, "loss": -0.3319, "num_tokens": 8904553.0, "reward": 0.2966664731502533, "reward_std": 0.10275300592184067, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24282385408878326, "rewards/logprob_reward/std": 0.30878373980522156, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 708.46875, "completions/mean_terminated_length": 698.290283203125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.037037037037037, "grad_norm": 1.9717530465643918, "kl": 0.164306640625, "learning_rate": 4.6021504202430983e-07, "loss": -0.0965, "num_tokens": 8933976.0, "reward": 0.49608516693115234, "reward_std": 0.17462338507175446, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.45051127672195435, "rewards/logprob_reward/std": 0.33869361877441406, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 740.3125, "completions/mean_terminated_length": 699.7857666015625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.0401234567901234, "grad_norm": 1.778557945269197, "kl": 0.16015625, "learning_rate": 4.599440307612661e-07, "loss": -0.0592, "num_tokens": 8964098.0, "reward": 0.3353271782398224, "reward_std": 0.12842261791229248, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2823079824447632, "rewards/logprob_reward/std": 0.3195965886116028, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 761.9375, "completions/mean_terminated_length": 744.4667358398438, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 1.0432098765432098, "grad_norm": 1.5822177745297374, "kl": 0.1688232421875, "learning_rate": 4.5967217994011144e-07, "loss": -0.1174, "num_tokens": 8994824.0, "reward": 0.3083050549030304, "reward_std": 0.1396174132823944, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24533894658088684, "rewards/logprob_reward/std": 0.26990100741386414, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 685.53125, "completions/mean_terminated_length": 674.6128540039062, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 1.0462962962962963, "grad_norm": 1.7477556978817692, "kl": 0.1734619140625, "learning_rate": 4.593994906479669e-07, "loss": -0.0962, "num_tokens": 9023001.0, "reward": 0.3859248161315918, "reward_std": 0.09894312918186188, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.328110933303833, "rewards/logprob_reward/std": 0.31080329418182373, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 705.59375, "completions/mean_terminated_length": 695.3225708007812, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.0493827160493827, "grad_norm": 1.784312445784458, "kl": 0.17315673828125, "learning_rate": 4.591259639753066e-07, "loss": -0.2183, "num_tokens": 9052272.0, "reward": 0.3951360583305359, "reward_std": 0.19134345650672913, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.34529003500938416, "rewards/logprob_reward/std": 0.3040313422679901, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 674.84375, "completions/mean_terminated_length": 674.84375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.0524691358024691, "grad_norm": 1.7443746080524123, "kl": 0.1614990234375, "learning_rate": 4.588516010159529e-07, "loss": -0.1277, "num_tokens": 9080087.0, "reward": 0.27620768547058105, "reward_std": 0.04202444106340408, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20273073017597198, "rewards/logprob_reward/std": 0.2939564287662506, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 733.5625, "completions/mean_terminated_length": 714.2000122070312, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 1.0555555555555556, "grad_norm": 1.803712185567785, "kl": 0.1761474609375, "learning_rate": 4.58576402867073e-07, "loss": -0.0299, "num_tokens": 9110229.0, "reward": 0.3048999309539795, "reward_std": 0.10018420219421387, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.23461106419563293, "rewards/logprob_reward/std": 0.2689569890499115, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 675.8125, "completions/mean_terminated_length": 664.5806274414062, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 1.058641975308642, "grad_norm": 1.8237094101309665, "kl": 0.1666259765625, "learning_rate": 4.5830037062917373e-07, "loss": -0.0985, "num_tokens": 9138411.0, "reward": 0.4562307596206665, "reward_std": 0.10543191432952881, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.40622860193252563, "rewards/logprob_reward/std": 0.3304784595966339, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 616.5625, "completions/mean_terminated_length": 616.5625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.0617283950617284, "grad_norm": 1.8023259592439844, "kl": 0.18603515625, "learning_rate": 4.580235054060971e-07, "loss": -0.1532, "num_tokens": 9163993.0, "reward": 0.28082212805747986, "reward_std": 0.05924757570028305, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20785793662071228, "rewards/logprob_reward/std": 0.25364646315574646, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 683.8125, "completions/mean_terminated_length": 648.6206665039062, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 1.0648148148148149, "grad_norm": 1.7814867950195465, "kl": 0.17529296875, "learning_rate": 4.5774580830501685e-07, "loss": -0.0271, "num_tokens": 9192195.0, "reward": 0.3262143135070801, "reward_std": 0.0773783028125763, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2721825838088989, "rewards/logprob_reward/std": 0.31620585918426514, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 728.34375, "completions/mean_terminated_length": 718.8064575195312, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 1.0679012345679013, "grad_norm": 1.6811717966844153, "kl": 0.150634765625, "learning_rate": 4.574672804364329e-07, "loss": -0.021, "num_tokens": 9222210.0, "reward": 0.5070068836212158, "reward_std": 0.10863804072141647, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4591743052005768, "rewards/logprob_reward/std": 0.3085536062717438, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 771.65625, "completions/mean_terminated_length": 724.9259033203125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 1.0709876543209877, "grad_norm": 1.9670042361018614, "kl": 0.193115234375, "learning_rate": 4.571879229141674e-07, "loss": -0.0263, "num_tokens": 9253459.0, "reward": 0.35388824343681335, "reward_std": 0.07171864807605743, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.31682026386260986, "rewards/logprob_reward/std": 0.3356803357601166, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 756.03125, "completions/mean_terminated_length": 706.4074096679688, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 1.074074074074074, "grad_norm": 1.7555748467283874, "kl": 0.197998046875, "learning_rate": 4.5690773685536037e-07, "loss": -0.2268, "num_tokens": 9284432.0, "reward": 0.22919271886348724, "reward_std": 0.16237503290176392, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.16785302758216858, "rewards/logprob_reward/std": 0.25098544359207153, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 734.933349609375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.0771604938271604, "grad_norm": 1.7765626394434795, "kl": 0.1834716796875, "learning_rate": 4.5662672338046513e-07, "loss": -0.1996, "num_tokens": 9315596.0, "reward": 0.3539670407772064, "reward_std": 0.2097143679857254, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3030189275741577, "rewards/logprob_reward/std": 0.29757142066955566, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 707.71875, "completions/mean_terminated_length": 675.0, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 1.0802469135802468, "grad_norm": 1.742152111635428, "kl": 0.181396484375, "learning_rate": 4.5634488361324386e-07, "loss": -0.0948, "num_tokens": 9344483.0, "reward": 0.3031151592731476, "reward_std": 0.12978127598762512, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23957239091396332, "rewards/logprob_reward/std": 0.3078653812408447, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 786.5, "completions/mean_terminated_length": 752.5714721679688, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 1.0833333333333333, "grad_norm": 1.7232889022380038, "kl": 0.166259765625, "learning_rate": 4.560622186807628e-07, "loss": -0.0882, "num_tokens": 9376907.0, "reward": 0.22259816527366638, "reward_std": 0.06441903114318848, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.16052573919296265, "rewards/logprob_reward/std": 0.29343971610069275, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 621.46875, "completions/mean_terminated_length": 621.46875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.0864197530864197, "grad_norm": 1.8095209574544433, "kl": 0.17333984375, "learning_rate": 4.5577872971338826e-07, "loss": -0.1238, "num_tokens": 9402894.0, "reward": 0.4814028739929199, "reward_std": 0.11612526327371597, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4307253956794739, "rewards/logprob_reward/std": 0.3606871962547302, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 708.65625, "completions/mean_terminated_length": 687.6333618164062, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 1.0895061728395061, "grad_norm": 1.7416670846667817, "kl": 0.2139892578125, "learning_rate": 4.554944178447816e-07, "loss": -0.1644, "num_tokens": 9432503.0, "reward": 0.3205365538597107, "reward_std": 0.09633137285709381, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2554572820663452, "rewards/logprob_reward/std": 0.2847333550453186, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 703.4375, "completions/mean_terminated_length": 693.0967407226562, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 1.0925925925925926, "grad_norm": 1.9409341209289452, "kl": 0.1678466796875, "learning_rate": 4.552092842118952e-07, "loss": -0.0478, "num_tokens": 9461621.0, "reward": 0.3566076159477234, "reward_std": 0.07810933887958527, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29553622007369995, "rewards/logprob_reward/std": 0.3342590928077698, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 707.1875, "completions/mean_terminated_length": 686.0667114257812, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 1.095679012345679, "grad_norm": 1.785174549100891, "kl": 0.15869140625, "learning_rate": 4.549233299549674e-07, "loss": -0.0766, "num_tokens": 9490611.0, "reward": 0.4348280727863312, "reward_std": 0.11788354068994522, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.38244783878326416, "rewards/logprob_reward/std": 0.3105940520763397, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 719.90625, "completions/mean_terminated_length": 676.4642944335938, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.0987654320987654, "grad_norm": 1.9109089780337054, "kl": 0.1805419921875, "learning_rate": 4.546365562175184e-07, "loss": 0.0044, "num_tokens": 9520240.0, "reward": 0.24984115362167358, "reward_std": 0.11191071569919586, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.18385128676891327, "rewards/logprob_reward/std": 0.23025715351104736, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 695.59375, "completions/mean_terminated_length": 685.0, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 1.1018518518518519, "grad_norm": 1.8457675901295947, "kl": 0.1864013671875, "learning_rate": 4.543489641463452e-07, "loss": -0.1839, "num_tokens": 9549151.0, "reward": 0.42485958337783813, "reward_std": 0.1277141273021698, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3713717460632324, "rewards/logprob_reward/std": 0.36805737018585205, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 668.84375, "completions/mean_terminated_length": 632.1034545898438, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 1.1049382716049383, "grad_norm": 1.7444704805027338, "kl": 0.1669921875, "learning_rate": 4.540605548915175e-07, "loss": -0.0777, "num_tokens": 9576698.0, "reward": 0.3078099191188812, "reward_std": 0.10854010283946991, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24826101958751678, "rewards/logprob_reward/std": 0.29448699951171875, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 700.875, "completions/mean_terminated_length": 667.4483032226562, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.1080246913580247, "grad_norm": 1.942694179536497, "kl": 0.18212890625, "learning_rate": 4.537713296063729e-07, "loss": -0.0558, "num_tokens": 9605322.0, "reward": 0.3992810845375061, "reward_std": 0.13575471937656403, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.342951238155365, "rewards/logprob_reward/std": 0.28589510917663574, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 767.5, "completions/mean_terminated_length": 708.3077392578125, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 1.1111111111111112, "grad_norm": 1.6969759192147023, "kl": 0.154541015625, "learning_rate": 4.534812894475122e-07, "loss": 0.0404, "num_tokens": 9636298.0, "reward": 0.274058997631073, "reward_std": 0.17136448621749878, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2211766242980957, "rewards/logprob_reward/std": 0.2847890853881836, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 717.25, "completions/mean_terminated_length": 707.3547973632812, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.1141975308641976, "grad_norm": 1.7575166390599173, "kl": 0.1671142578125, "learning_rate": 4.5319043557479474e-07, "loss": 0.0323, "num_tokens": 9666250.0, "reward": 0.2980307936668396, "reward_std": 0.061921969056129456, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.22697865962982178, "rewards/logprob_reward/std": 0.3177005648612976, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 641.625, "completions/mean_terminated_length": 629.290283203125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 1.117283950617284, "grad_norm": 1.7808752212667782, "kl": 0.2083740234375, "learning_rate": 4.5289876915133394e-07, "loss": -0.1642, "num_tokens": 9693302.0, "reward": 0.4484062194824219, "reward_std": 0.15394183993339539, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.39753466844558716, "rewards/logprob_reward/std": 0.3276720345020294, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 726.375, "completions/mean_terminated_length": 695.586181640625, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.1203703703703705, "grad_norm": 1.839388600009079, "kl": 0.1690673828125, "learning_rate": 4.5260629134349284e-07, "loss": -0.0969, "num_tokens": 9723174.0, "reward": 0.39015138149261475, "reward_std": 0.1565711498260498, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3362793028354645, "rewards/logprob_reward/std": 0.29638880491256714, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 685.3125, "completions/mean_terminated_length": 636.9285888671875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.123456790123457, "grad_norm": 2.115824536202263, "kl": 0.18798828125, "learning_rate": 4.523130033208788e-07, "loss": -0.2175, "num_tokens": 9751672.0, "reward": 0.37128084897994995, "reward_std": 0.2709712088108063, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32572871446609497, "rewards/logprob_reward/std": 0.3476737141609192, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 698.6875, "completions/mean_terminated_length": 677.0000610351562, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 1.126543209876543, "grad_norm": 1.774737044748608, "kl": 0.1597900390625, "learning_rate": 4.520189062563393e-07, "loss": -0.0561, "num_tokens": 9780418.0, "reward": 0.41968581080436707, "reward_std": 0.13961078226566315, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3656231164932251, "rewards/logprob_reward/std": 0.32677170634269714, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 665.59375, "completions/mean_terminated_length": 641.7000122070312, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 1.1296296296296295, "grad_norm": 1.9089455759989087, "kl": 0.1748046875, "learning_rate": 4.5172400132595737e-07, "loss": -0.143, "num_tokens": 9808165.0, "reward": 0.44666704535484314, "reward_std": 0.0963614284992218, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.395602285861969, "rewards/logprob_reward/std": 0.2937835454940796, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 734.90625, "completions/mean_terminated_length": 693.607177734375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 1.132716049382716, "grad_norm": 1.9062309332942429, "kl": 0.1790771484375, "learning_rate": 4.514282897090464e-07, "loss": 0.0246, "num_tokens": 9838662.0, "reward": 0.4043995141983032, "reward_std": 0.12931513786315918, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3521105647087097, "rewards/logprob_reward/std": 0.37220168113708496, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 747.40625, "completions/mean_terminated_length": 707.8928833007812, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 1.1358024691358024, "grad_norm": 1.94395173720028, "kl": 0.201416015625, "learning_rate": 4.511317725881457e-07, "loss": -0.0314, "num_tokens": 9869547.0, "reward": 0.25423258543014526, "reward_std": 0.08513497561216354, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1887306571006775, "rewards/logprob_reward/std": 0.2540509104728699, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 697.9375, "completions/mean_terminated_length": 651.357177734375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 1.1388888888888888, "grad_norm": 1.8955059573828663, "kl": 0.1800537109375, "learning_rate": 4.50834451149016e-07, "loss": 0.0211, "num_tokens": 9898533.0, "reward": 0.39969688653945923, "reward_std": 0.12748803198337555, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.35035765171051025, "rewards/logprob_reward/std": 0.31814640760421753, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 644.78125, "completions/mean_terminated_length": 644.78125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.1419753086419753, "grad_norm": 1.7530028921361416, "kl": 0.1810302734375, "learning_rate": 4.505363265806342e-07, "loss": -0.1708, "num_tokens": 9925410.0, "reward": 0.2889525294303894, "reward_std": 0.0775633379817009, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22036395967006683, "rewards/logprob_reward/std": 0.2845820486545563, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 685.4375, "completions/mean_terminated_length": 650.413818359375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.1450617283950617, "grad_norm": 1.8231182041468141, "kl": 0.181640625, "learning_rate": 4.502374000751891e-07, "loss": -0.0958, "num_tokens": 9953984.0, "reward": 0.2698981761932373, "reward_std": 0.11757264286279678, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20266462862491608, "rewards/logprob_reward/std": 0.2318088412284851, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 680.09375, "completions/mean_terminated_length": 657.1666870117188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.1481481481481481, "grad_norm": 1.9192411115436785, "kl": 0.1776123046875, "learning_rate": 4.49937672828076e-07, "loss": -0.1749, "num_tokens": 9982191.0, "reward": 0.28294429183006287, "reward_std": 0.12822963297367096, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22410476207733154, "rewards/logprob_reward/std": 0.28940549492836, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 704.25, "completions/mean_terminated_length": 682.933349609375, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 1.1512345679012346, "grad_norm": 1.9288893527293391, "kl": 0.1846923828125, "learning_rate": 4.4963714603789315e-07, "loss": -0.13, "num_tokens": 10011671.0, "reward": 0.48150351643562317, "reward_std": 0.13163623213768005, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4377816915512085, "rewards/logprob_reward/std": 0.3106361925601959, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 657.0, "completions/mean_terminated_length": 632.5333862304688, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 1.154320987654321, "grad_norm": 2.0198418585369176, "kl": 0.17724609375, "learning_rate": 4.4933582090643516e-07, "loss": -0.1356, "num_tokens": 10039183.0, "reward": 0.4289727210998535, "reward_std": 0.1568451076745987, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3863585591316223, "rewards/logprob_reward/std": 0.3272345960140228, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 692.3125, "completions/mean_terminated_length": 644.9285888671875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.1574074074074074, "grad_norm": 1.9449171130405607, "kl": 0.20166015625, "learning_rate": 4.4903369863869e-07, "loss": -0.1224, "num_tokens": 10068141.0, "reward": 0.3509941101074219, "reward_std": 0.17378085851669312, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.30318793654441833, "rewards/logprob_reward/std": 0.3854600191116333, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 658.4375, "completions/mean_terminated_length": 646.6451416015625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.1604938271604939, "grad_norm": 2.281663296302417, "kl": 0.195068359375, "learning_rate": 4.4873078044283273e-07, "loss": -0.2036, "num_tokens": 10095891.0, "reward": 0.37850576639175415, "reward_std": 0.11467234790325165, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3198675513267517, "rewards/logprob_reward/std": 0.24902574717998505, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 664.46875, "completions/mean_terminated_length": 640.5000610351562, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 1.1635802469135803, "grad_norm": 2.0778642143376467, "kl": 0.1981201171875, "learning_rate": 4.484270675302218e-07, "loss": -0.0499, "num_tokens": 10123962.0, "reward": 0.498789519071579, "reward_std": 0.13224981725215912, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4535160958766937, "rewards/logprob_reward/std": 0.3516581952571869, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 678.09375, "completions/mean_terminated_length": 666.9354858398438, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.1666666666666667, "grad_norm": 1.8475166169349337, "kl": 0.1820068359375, "learning_rate": 4.481225611153933e-07, "loss": -0.0842, "num_tokens": 10152425.0, "reward": 0.5553348660469055, "reward_std": 0.1258741170167923, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5128719806671143, "rewards/logprob_reward/std": 0.32009485363960266, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 717.96875, "completions/mean_terminated_length": 661.2963256835938, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.1697530864197532, "grad_norm": 2.8419692831906525, "kl": 0.2025146484375, "learning_rate": 4.4781726241605683e-07, "loss": -0.4449, "num_tokens": 10182408.0, "reward": 0.27363550662994385, "reward_std": 0.17848703265190125, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.23806722462177277, "rewards/logprob_reward/std": 0.32219284772872925, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 672.8125, "completions/mean_terminated_length": 661.4838256835938, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 1.1728395061728394, "grad_norm": 1.8432592674642745, "kl": 0.1824951171875, "learning_rate": 4.4751117265309e-07, "loss": -0.2853, "num_tokens": 10210390.0, "reward": 0.26492011547088623, "reward_std": 0.0770343616604805, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.19713342189788818, "rewards/logprob_reward/std": 0.2790571451187134, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 695.375, "completions/mean_terminated_length": 648.4285888671875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.175925925925926, "grad_norm": 1.8925532635265268, "kl": 0.2276611328125, "learning_rate": 4.472042930505342e-07, "loss": 0.0614, "num_tokens": 10239018.0, "reward": 0.4651971459388733, "reward_std": 0.1915726214647293, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4196634888648987, "rewards/logprob_reward/std": 0.3236381709575653, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 721.34375, "completions/mean_terminated_length": 678.107177734375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.1790123456790123, "grad_norm": 2.3443044450277464, "kl": 0.201171875, "learning_rate": 4.46896624835589e-07, "loss": 0.0589, "num_tokens": 10269125.0, "reward": 0.41626065969467163, "reward_std": 0.17843464016914368, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3687618672847748, "rewards/logprob_reward/std": 0.3181452453136444, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 609.125, "completions/mean_terminated_length": 609.125, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.1820987654320987, "grad_norm": 1.771441655415213, "kl": 0.2030029296875, "learning_rate": 4.465881692386078e-07, "loss": -0.1203, "num_tokens": 10294537.0, "reward": 0.3583890199661255, "reward_std": 0.027536917477846146, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.29057109355926514, "rewards/logprob_reward/std": 0.2985168397426605, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 633.25, "completions/mean_terminated_length": 633.25, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.1851851851851851, "grad_norm": 1.9818800238050571, "kl": 0.1905517578125, "learning_rate": 4.4627892749309273e-07, "loss": -0.0591, "num_tokens": 10321001.0, "reward": 0.4245106279850006, "reward_std": 0.0699777826666832, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3675118088722229, "rewards/logprob_reward/std": 0.30467489361763, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 708.65625, "completions/mean_terminated_length": 687.6333618164062, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.1882716049382716, "grad_norm": 2.9009822069344007, "kl": 0.19873046875, "learning_rate": 4.459689008356896e-07, "loss": -0.3449, "num_tokens": 10350826.0, "reward": 0.3208976984024048, "reward_std": 0.14450488984584808, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.27669188380241394, "rewards/logprob_reward/std": 0.3419397175312042, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 673.53125, "completions/mean_terminated_length": 673.53125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 1.191358024691358, "grad_norm": 1.899353174899307, "kl": 0.1844482421875, "learning_rate": 4.4565809050618317e-07, "loss": -0.2333, "num_tokens": 10378819.0, "reward": 0.3638414740562439, "reward_std": 0.15561725199222565, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.307046115398407, "rewards/logprob_reward/std": 0.2958175837993622, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 626.9375, "completions/mean_terminated_length": 585.862060546875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 1.1944444444444444, "grad_norm": 2.4491150979065885, "kl": 0.2093505859375, "learning_rate": 4.45346497747492e-07, "loss": -0.1114, "num_tokens": 10405005.0, "reward": 0.37417009472846985, "reward_std": 0.1424696147441864, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3219945430755615, "rewards/logprob_reward/std": 0.2858918607234955, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 698.15625, "completions/mean_terminated_length": 676.433349609375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.1975308641975309, "grad_norm": 1.9098804847951727, "kl": 0.2005615234375, "learning_rate": 4.450341238056634e-07, "loss": -0.0078, "num_tokens": 10433926.0, "reward": 0.38260602951049805, "reward_std": 0.06737158447504044, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3244233727455139, "rewards/logprob_reward/std": 0.32282161712646484, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 672.34375, "completions/mean_terminated_length": 661.0, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.2006172839506173, "grad_norm": 1.8876225266101572, "kl": 0.2037353515625, "learning_rate": 4.4472096992986895e-07, "loss": -0.1186, "num_tokens": 10461273.0, "reward": 0.43250441551208496, "reward_std": 0.08439714461565018, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37986600399017334, "rewards/logprob_reward/std": 0.28391388058662415, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 711.4375, "completions/mean_terminated_length": 690.6000366210938, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 1.2037037037037037, "grad_norm": 1.7760892613242423, "kl": 0.181396484375, "learning_rate": 4.444070373723989e-07, "loss": -0.0526, "num_tokens": 10490515.0, "reward": 0.46583858132362366, "reward_std": 0.11341547220945358, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4169039726257324, "rewards/logprob_reward/std": 0.3366798162460327, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 724.0, "completions/mean_terminated_length": 692.9655151367188, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 1.2067901234567902, "grad_norm": 1.7569039769820898, "kl": 0.2022705078125, "learning_rate": 4.4409232738865744e-07, "loss": -0.0517, "num_tokens": 10520583.0, "reward": 0.30083006620407104, "reward_std": 0.16141238808631897, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24050560593605042, "rewards/logprob_reward/std": 0.3009737730026245, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 684.90625, "completions/mean_terminated_length": 662.300048828125, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 1.2098765432098766, "grad_norm": 1.7065903373399869, "kl": 0.1998291015625, "learning_rate": 4.4377684123715763e-07, "loss": -0.3056, "num_tokens": 10549200.0, "reward": 0.26473569869995117, "reward_std": 0.15362651646137238, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20387297868728638, "rewards/logprob_reward/std": 0.2488246113061905, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 673.28125, "completions/mean_terminated_length": 661.9677124023438, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.212962962962963, "grad_norm": 1.777431516640144, "kl": 0.2037353515625, "learning_rate": 4.434605801795167e-07, "loss": -0.0399, "num_tokens": 10577401.0, "reward": 0.4544137418270111, "reward_std": 0.10484503954648972, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4007374942302704, "rewards/logprob_reward/std": 0.3108270764350891, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 682.78125, "completions/mean_terminated_length": 671.774169921875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 1.2160493827160495, "grad_norm": 1.6259877834798262, "kl": 0.208740234375, "learning_rate": 4.431435454804503e-07, "loss": -0.1775, "num_tokens": 10605702.0, "reward": 0.3046150803565979, "reward_std": 0.11134239286184311, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24123899638652802, "rewards/logprob_reward/std": 0.2965165972709656, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 740.59375, "completions/mean_terminated_length": 721.7000122070312, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.2191358024691359, "grad_norm": 1.7067222532887, "kl": 0.212158203125, "learning_rate": 4.42825738407768e-07, "loss": -0.0262, "num_tokens": 10636509.0, "reward": 0.2922634482383728, "reward_std": 0.050021443516016006, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22404274344444275, "rewards/logprob_reward/std": 0.3021162450313568, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 721.21875, "completions/mean_terminated_length": 711.4515991210938, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 1.2222222222222223, "grad_norm": 1.8744250474486976, "kl": 0.192138671875, "learning_rate": 4.425071602323681e-07, "loss": 0.0036, "num_tokens": 10666336.0, "reward": 0.43557479977607727, "reward_std": 0.020043041557073593, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.3728609085083008, "rewards/logprob_reward/std": 0.3177824020385742, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 688.25, "completions/mean_terminated_length": 665.86669921875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 1.2253086419753085, "grad_norm": 1.8995812572372421, "kl": 0.2060546875, "learning_rate": 4.421878122282325e-07, "loss": -0.0618, "num_tokens": 10694392.0, "reward": 0.479648619890213, "reward_std": 0.16129814088344574, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4357207119464874, "rewards/logprob_reward/std": 0.324872761964798, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 613.96875, "completions/mean_terminated_length": 613.96875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.228395061728395, "grad_norm": 1.8704730804921612, "kl": 0.211181640625, "learning_rate": 4.4186769567242163e-07, "loss": -0.182, "num_tokens": 10720239.0, "reward": 0.394355446100235, "reward_std": 0.14043927192687988, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.34095048904418945, "rewards/logprob_reward/std": 0.341088205575943, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 704.84375, "completions/mean_terminated_length": 704.84375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 1.2314814814814814, "grad_norm": 1.8129328182097417, "kl": 0.193603515625, "learning_rate": 4.4154681184506927e-07, "loss": -0.0602, "num_tokens": 10749214.0, "reward": 0.4249587059020996, "reward_std": 0.060396406799554825, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.36453741788864136, "rewards/logprob_reward/std": 0.31362006068229675, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 713.78125, "completions/mean_terminated_length": 703.774169921875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 1.2345679012345678, "grad_norm": 1.9467172533773898, "kl": 0.2332763671875, "learning_rate": 4.4122516202937745e-07, "loss": -0.1665, "num_tokens": 10778687.0, "reward": 0.3848200738430023, "reward_std": 0.10175098478794098, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3303556442260742, "rewards/logprob_reward/std": 0.26722925901412964, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 649.1875, "completions/mean_terminated_length": 649.1875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 1.2376543209876543, "grad_norm": 1.8583183177522469, "kl": 0.1983642578125, "learning_rate": 4.4090274751161144e-07, "loss": -0.2323, "num_tokens": 10806109.0, "reward": 0.3918599486351013, "reward_std": 0.1365358829498291, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33817771077156067, "rewards/logprob_reward/std": 0.2772637903690338, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 683.96875, "completions/mean_terminated_length": 683.96875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 1.2407407407407407, "grad_norm": 1.5936545383904313, "kl": 0.2208251953125, "learning_rate": 4.4057956958109453e-07, "loss": -0.2322, "num_tokens": 10834912.0, "reward": 0.4339386820793152, "reward_std": 0.1126788929104805, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.38145965337753296, "rewards/logprob_reward/std": 0.31864434480667114, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 692.34375, "completions/mean_terminated_length": 692.34375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.2438271604938271, "grad_norm": 1.7611536145496853, "kl": 0.2037353515625, "learning_rate": 4.402556295302029e-07, "loss": -0.018, "num_tokens": 10863555.0, "reward": 0.46414098143577576, "reward_std": 0.08751709014177322, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4080733060836792, "rewards/logprob_reward/std": 0.31995660066604614, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 691.8125, "completions/mean_terminated_length": 681.0967407226562, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.2469135802469136, "grad_norm": 1.6968859617671568, "kl": 0.189208984375, "learning_rate": 4.3993092865436035e-07, "loss": -0.0184, "num_tokens": 10891857.0, "reward": 0.3808833062648773, "reward_std": 0.15741512179374695, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.322509229183197, "rewards/logprob_reward/std": 0.3390623927116394, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 672.25, "completions/mean_terminated_length": 660.9031982421875, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.25, "grad_norm": 1.8517745948271715, "kl": 0.2349853515625, "learning_rate": 4.3960546825203304e-07, "loss": -0.1622, "num_tokens": 10919973.0, "reward": 0.3947577178478241, "reward_std": 0.14303703606128693, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3413974642753601, "rewards/logprob_reward/std": 0.314985990524292, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 667.5, "completions/mean_terminated_length": 656.0, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.2530864197530864, "grad_norm": 1.628849545181259, "kl": 0.2232666015625, "learning_rate": 4.392792496247248e-07, "loss": -0.1882, "num_tokens": 10947725.0, "reward": 0.353115051984787, "reward_std": 0.1257696896791458, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29165560007095337, "rewards/logprob_reward/std": 0.2920438051223755, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 645.625, "completions/mean_terminated_length": 645.625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.2561728395061729, "grad_norm": 1.863828073669658, "kl": 0.2237548828125, "learning_rate": 4.3895227407697135e-07, "loss": -0.081, "num_tokens": 10974993.0, "reward": 0.3625939190387726, "reward_std": 0.07430053502321243, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2987154424190521, "rewards/logprob_reward/std": 0.298009991645813, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 680.03125, "completions/mean_terminated_length": 657.1000366210938, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 1.2592592592592593, "grad_norm": 1.7041759338795905, "kl": 0.2293701171875, "learning_rate": 4.3862454291633523e-07, "loss": -0.1839, "num_tokens": 11002958.0, "reward": 0.4448365569114685, "reward_std": 0.20915868878364563, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.40745729207992554, "rewards/logprob_reward/std": 0.31327465176582336, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 728.25, "completions/mean_terminated_length": 686.0000610351562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.2623456790123457, "grad_norm": 1.9326938940071368, "kl": 0.2568359375, "learning_rate": 4.382960574534009e-07, "loss": 0.0303, "num_tokens": 11033878.0, "reward": 0.26945725083351135, "reward_std": 0.13883638381958008, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20564697682857513, "rewards/logprob_reward/std": 0.2669181525707245, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 664.4375, "completions/mean_terminated_length": 652.8386840820312, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.2654320987654322, "grad_norm": 1.7315250416905614, "kl": 0.212158203125, "learning_rate": 4.3796681900176903e-07, "loss": -0.0883, "num_tokens": 11062104.0, "reward": 0.232651948928833, "reward_std": 0.08430661261081696, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.15780772268772125, "rewards/logprob_reward/std": 0.2765156328678131, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 732.5625, "completions/mean_terminated_length": 678.5925903320312, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 1.2685185185185186, "grad_norm": 2.1920256686772697, "kl": 0.2099609375, "learning_rate": 4.3763682887805153e-07, "loss": -0.3636, "num_tokens": 11092398.0, "reward": 0.41544368863105774, "reward_std": 0.2628130316734314, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.3852151930332184, "rewards/logprob_reward/std": 0.3716643750667572, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 707.6875, "completions/mean_terminated_length": 697.4838256835938, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 1.2716049382716048, "grad_norm": 1.782355357807194, "kl": 0.2095947265625, "learning_rate": 4.3730608840186625e-07, "loss": 0.0397, "num_tokens": 11121472.0, "reward": 0.38699018955230713, "reward_std": 0.049748074263334274, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3223502039909363, "rewards/logprob_reward/std": 0.30200719833374023, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 641.375, "completions/mean_terminated_length": 629.0322265625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 1.2746913580246915, "grad_norm": 1.7802268552191287, "kl": 0.2294921875, "learning_rate": 4.3697459889583166e-07, "loss": -0.2236, "num_tokens": 11148600.0, "reward": 0.30410224199295044, "reward_std": 0.13497327268123627, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24414139986038208, "rewards/logprob_reward/std": 0.28533270955085754, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 649.3125, "completions/mean_terminated_length": 649.3125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.2777777777777777, "grad_norm": 1.9540352173079205, "kl": 0.2078857421875, "learning_rate": 4.366423616855615e-07, "loss": -0.1546, "num_tokens": 11175562.0, "reward": 0.3013601005077362, "reward_std": 0.053075261414051056, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23762232065200806, "rewards/logprob_reward/std": 0.2833193838596344, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 666.357177734375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 1.2808641975308643, "grad_norm": 1.8186840980301837, "kl": 0.224365234375, "learning_rate": 4.363093780996596e-07, "loss": 0.0144, "num_tokens": 11204640.0, "reward": 0.38616931438446045, "reward_std": 0.09226133674383163, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.33532702922821045, "rewards/logprob_reward/std": 0.2958979606628418, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 632.5, "completions/mean_terminated_length": 632.5, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.2839506172839505, "grad_norm": 1.8996005727820868, "kl": 0.23974609375, "learning_rate": 4.359756494697146e-07, "loss": -0.1131, "num_tokens": 11231284.0, "reward": 0.41181010007858276, "reward_std": 0.050463706254959106, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3534001410007477, "rewards/logprob_reward/std": 0.290108323097229, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 659.6875, "completions/mean_terminated_length": 635.4000244140625, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 1.287037037037037, "grad_norm": 1.8158782189092906, "kl": 0.232666015625, "learning_rate": 4.356411771302944e-07, "loss": -0.0954, "num_tokens": 11258982.0, "reward": 0.32881060242652893, "reward_std": 0.11027109622955322, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26812291145324707, "rewards/logprob_reward/std": 0.28130772709846497, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 649.9375, "completions/mean_terminated_length": 637.8709716796875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.2901234567901234, "grad_norm": 1.7777585570202132, "kl": 0.2291259765625, "learning_rate": 4.353059624189411e-07, "loss": -0.1077, "num_tokens": 11286432.0, "reward": 0.3244953155517578, "reward_std": 0.07711391150951385, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2668003439903259, "rewards/logprob_reward/std": 0.3004566729068756, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 641.96875, "completions/mean_terminated_length": 641.96875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 1.2932098765432098, "grad_norm": 1.6993472859857148, "kl": 0.2158203125, "learning_rate": 4.3497000667616534e-07, "loss": -0.1115, "num_tokens": 11313635.0, "reward": 0.3319767117500305, "reward_std": 0.10741353780031204, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.268168568611145, "rewards/logprob_reward/std": 0.2863391637802124, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 689.90625, "completions/mean_terminated_length": 667.6333618164062, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 1.2962962962962963, "grad_norm": 1.8147846811795532, "kl": 0.2276611328125, "learning_rate": 4.346333112454413e-07, "loss": -0.0922, "num_tokens": 11342008.0, "reward": 0.3092659115791321, "reward_std": 0.12654417753219604, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24987879395484924, "rewards/logprob_reward/std": 0.28420397639274597, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 611.78125, "completions/mean_terminated_length": 611.78125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.2993827160493827, "grad_norm": 1.5726857002291987, "kl": 0.2120361328125, "learning_rate": 4.342958774732011e-07, "loss": -0.2052, "num_tokens": 11367961.0, "reward": 0.4570673704147339, "reward_std": 0.17360137403011322, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.41757482290267944, "rewards/logprob_reward/std": 0.327872633934021, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 663.71875, "completions/mean_terminated_length": 652.0967407226562, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.3024691358024691, "grad_norm": 1.7004925627613858, "kl": 0.2344970703125, "learning_rate": 4.3395770670882935e-07, "loss": -0.1059, "num_tokens": 11395640.0, "reward": 0.3118894398212433, "reward_std": 0.08785378932952881, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2458493858575821, "rewards/logprob_reward/std": 0.2987278997898102, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 643.15625, "completions/mean_terminated_length": 603.7586059570312, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.3055555555555556, "grad_norm": 2.0488605787434118, "kl": 0.2364501953125, "learning_rate": 4.3361880030465803e-07, "loss": 0.0016, "num_tokens": 11422365.0, "reward": 0.31915712356567383, "reward_std": 0.09576151520013809, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.25739678740501404, "rewards/logprob_reward/std": 0.2789553701877594, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 666.84375, "completions/mean_terminated_length": 666.84375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.308641975308642, "grad_norm": 1.829407137709928, "kl": 0.234619140625, "learning_rate": 4.3327915961596066e-07, "loss": -0.1152, "num_tokens": 11449800.0, "reward": 0.4870791435241699, "reward_std": 0.10329632461071014, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4370323419570923, "rewards/logprob_reward/std": 0.2747824192047119, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 722.46875, "completions/mean_terminated_length": 722.46875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.3117283950617284, "grad_norm": 1.6668568629620062, "kl": 0.245361328125, "learning_rate": 4.3293878600094746e-07, "loss": -0.1263, "num_tokens": 11479991.0, "reward": 0.36308202147483826, "reward_std": 0.053532831370830536, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2992578148841858, "rewards/logprob_reward/std": 0.32172027230262756, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 611.0, "completions/mean_terminated_length": 611.0, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.3148148148148149, "grad_norm": 2.4597197933900903, "kl": 0.23876953125, "learning_rate": 4.325976808207594e-07, "loss": -0.1897, "num_tokens": 11506191.0, "reward": 0.30910032987594604, "reward_std": 0.06165868043899536, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24969482421875, "rewards/logprob_reward/std": 0.3103349804878235, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 734.4375, "completions/mean_terminated_length": 704.4827270507812, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.3179012345679013, "grad_norm": 1.5408544184239952, "kl": 0.2364501953125, "learning_rate": 4.3225584543946303e-07, "loss": -0.1712, "num_tokens": 11536977.0, "reward": 0.3227326273918152, "reward_std": 0.14196579158306122, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.27178627252578735, "rewards/logprob_reward/std": 0.36609771847724915, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 674.75, "completions/mean_terminated_length": 663.4838256835938, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 1.3209876543209877, "grad_norm": 1.8071744343482854, "kl": 0.2255859375, "learning_rate": 4.319132812240448e-07, "loss": 0.0064, "num_tokens": 11565129.0, "reward": 0.24986402690410614, "reward_std": 0.020610220730304718, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.16998781263828278, "rewards/logprob_reward/std": 0.29376277327537537, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 651.46875, "completions/mean_terminated_length": 639.4515991210938, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.324074074074074, "grad_norm": 1.7529437375201444, "kl": 0.2513427734375, "learning_rate": 4.3156998954440587e-07, "loss": -0.1313, "num_tokens": 11592700.0, "reward": 0.2823496460914612, "reward_std": 0.10343953967094421, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.21302737295627594, "rewards/logprob_reward/std": 0.2671968638896942, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 600.0, "completions/mean_terminated_length": 600.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 1.3271604938271606, "grad_norm": 1.8938786192898898, "kl": 0.2520751953125, "learning_rate": 4.312259717733565e-07, "loss": -0.0513, "num_tokens": 11618164.0, "reward": 0.4317903518676758, "reward_std": 0.07958275079727173, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3721281886100769, "rewards/logprob_reward/std": 0.2777947783470154, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 704.0, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.3302469135802468, "grad_norm": 1.7974271972623401, "kl": 0.23779296875, "learning_rate": 4.308812292866105e-07, "loss": -0.0233, "num_tokens": 11647124.0, "reward": 0.39808037877082825, "reward_std": 0.030062925070524216, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.33120042085647583, "rewards/logprob_reward/std": 0.3527129292488098, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 699.21875, "completions/mean_terminated_length": 677.5667114257812, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.3333333333333333, "grad_norm": 1.8306924631643946, "kl": 0.230224609375, "learning_rate": 4.3053576346277997e-07, "loss": -0.1398, "num_tokens": 11675975.0, "reward": 0.2862873673439026, "reward_std": 0.1391184777021408, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2243470996618271, "rewards/logprob_reward/std": 0.21020878851413727, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 670.40625, "completions/mean_terminated_length": 646.8333740234375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 1.3364197530864197, "grad_norm": 2.0865250050528936, "kl": 0.230712890625, "learning_rate": 4.301895756833692e-07, "loss": -0.1011, "num_tokens": 11703940.0, "reward": 0.3959459662437439, "reward_std": 0.18350961804389954, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3461899757385254, "rewards/logprob_reward/std": 0.3270353078842163, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 668.40625, "completions/mean_terminated_length": 668.40625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 1.3395061728395061, "grad_norm": 1.7428421850387181, "kl": 0.2276611328125, "learning_rate": 4.298426673327701e-07, "loss": -0.1901, "num_tokens": 11731717.0, "reward": 0.47772669792175293, "reward_std": 0.06522883474826813, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42664074897766113, "rewards/logprob_reward/std": 0.3209468722343445, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 683.59375, "completions/mean_terminated_length": 660.9000244140625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.3425925925925926, "grad_norm": 1.7556947960357638, "kl": 0.2318115234375, "learning_rate": 4.2949503979825563e-07, "loss": -0.0243, "num_tokens": 11760008.0, "reward": 0.4469088315963745, "reward_std": 0.18695521354675293, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3958709239959717, "rewards/logprob_reward/std": 0.3618847131729126, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 695.375, "completions/mean_terminated_length": 695.375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.345679012345679, "grad_norm": 1.509183611935194, "kl": 0.2266845703125, "learning_rate": 4.2914669446997504e-07, "loss": -0.1917, "num_tokens": 11789112.0, "reward": 0.5762996673583984, "reward_std": 0.14322608709335327, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5396385192871094, "rewards/logprob_reward/std": 0.3687451481819153, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 629.65625, "completions/mean_terminated_length": 629.65625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.3487654320987654, "grad_norm": 1.9193409329678235, "kl": 0.2501220703125, "learning_rate": 4.287976327409478e-07, "loss": 0.015, "num_tokens": 11815393.0, "reward": 0.32551538944244385, "reward_std": 0.03944399207830429, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.25751712918281555, "rewards/logprob_reward/std": 0.29502660036087036, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 692.21875, "completions/mean_terminated_length": 692.21875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 1.3518518518518519, "grad_norm": 1.7027911970777934, "kl": 0.2340087890625, "learning_rate": 4.284478560070585e-07, "loss": -0.054, "num_tokens": 11844288.0, "reward": 0.5426849126815796, "reward_std": 0.052159011363983154, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.495344340801239, "rewards/logprob_reward/std": 0.34464961290359497, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 640.6451416015625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.3549382716049383, "grad_norm": 1.840891916005011, "kl": 0.2420654296875, "learning_rate": 4.280973656670508e-07, "loss": -0.1462, "num_tokens": 11871112.0, "reward": 0.39393919706344604, "reward_std": 0.08086130023002625, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3404880166053772, "rewards/logprob_reward/std": 0.30242791771888733, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 601.6875, "completions/mean_terminated_length": 601.6875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.3580246913580247, "grad_norm": 1.8257421358268964, "kl": 0.250732421875, "learning_rate": 4.277461631225221e-07, "loss": -0.0332, "num_tokens": 11896418.0, "reward": 0.4316021800041199, "reward_std": 0.12745192646980286, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.37191909551620483, "rewards/logprob_reward/std": 0.2930411696434021, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 678.625, "completions/mean_terminated_length": 655.6000366210938, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 1.3611111111111112, "grad_norm": 1.9205620700950574, "kl": 0.2489013671875, "learning_rate": 4.2739424977791784e-07, "loss": -0.0209, "num_tokens": 11925062.0, "reward": 0.2340576946735382, "reward_std": 0.0891726091504097, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.15936966240406036, "rewards/logprob_reward/std": 0.2742205858230591, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 698.9375, "completions/mean_terminated_length": 677.2667236328125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 1.3641975308641976, "grad_norm": 2.1527224411889176, "kl": 0.2293701171875, "learning_rate": 4.2704162704052594e-07, "loss": -0.0618, "num_tokens": 11953900.0, "reward": 0.22152839601039886, "reward_std": 0.11285325884819031, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1523926556110382, "rewards/logprob_reward/std": 0.2706919014453888, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 660.34375, "completions/mean_terminated_length": 660.34375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.367283950617284, "grad_norm": 1.5867903908643584, "kl": 0.23779296875, "learning_rate": 4.2668829632047124e-07, "loss": -0.1115, "num_tokens": 11981275.0, "reward": 0.40765953063964844, "reward_std": 0.0801214873790741, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3487883508205414, "rewards/logprob_reward/std": 0.26389750838279724, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 665.25, "completions/mean_terminated_length": 665.25, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 1.3703703703703702, "grad_norm": 1.7087261570830543, "kl": 0.2532958984375, "learning_rate": 4.2633425903070973e-07, "loss": -0.1115, "num_tokens": 12009047.0, "reward": 0.384908527135849, "reward_std": 0.10631655156612396, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.32350945472717285, "rewards/logprob_reward/std": 0.2891896963119507, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 658.03125, "completions/mean_terminated_length": 646.2257690429688, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.373456790123457, "grad_norm": 1.694982689344678, "kl": 0.2374267578125, "learning_rate": 4.259795165870229e-07, "loss": -0.0879, "num_tokens": 12037264.0, "reward": 0.2926807999610901, "reward_std": 0.10861394554376602, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2245064675807953, "rewards/logprob_reward/std": 0.2800906300544739, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 712.8125, "completions/mean_terminated_length": 680.6206665039062, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 1.376543209876543, "grad_norm": 1.817903305068257, "kl": 0.2476806640625, "learning_rate": 4.256240704080121e-07, "loss": -0.0993, "num_tokens": 12066398.0, "reward": 0.3412991762161255, "reward_std": 0.15560884773731232, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2889435291290283, "rewards/logprob_reward/std": 0.2789413034915924, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 636.15625, "completions/mean_terminated_length": 610.300048828125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.3796296296296298, "grad_norm": 1.679636796501421, "kl": 0.265625, "learning_rate": 4.2526792191509297e-07, "loss": -0.0305, "num_tokens": 12092959.0, "reward": 0.3824636936187744, "reward_std": 0.08300714194774628, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3207929730415344, "rewards/logprob_reward/std": 0.30403515696525574, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 703.96875, "completions/mean_terminated_length": 644.7037353515625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 1.382716049382716, "grad_norm": 1.4559490449314987, "kl": 0.2880859375, "learning_rate": 4.249110725324897e-07, "loss": -0.2289, "num_tokens": 12122426.0, "reward": 0.37153974175453186, "reward_std": 0.1705636978149414, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3329608142375946, "rewards/logprob_reward/std": 0.38416314125061035, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 745.0, "completions/mean_terminated_length": 726.4000244140625, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 1.3858024691358024, "grad_norm": 1.6084966743951474, "kl": 0.2315673828125, "learning_rate": 4.2455352368722916e-07, "loss": -0.0509, "num_tokens": 12152886.0, "reward": 0.4715339243412018, "reward_std": 0.06669139862060547, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.42323213815689087, "rewards/logprob_reward/std": 0.3124513626098633, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 712.28125, "completions/mean_terminated_length": 691.5000610351562, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 1.3888888888888888, "grad_norm": 1.7748771338380662, "kl": 0.2381591796875, "learning_rate": 4.2419527680913554e-07, "loss": -0.1264, "num_tokens": 12182567.0, "reward": 0.2789851427078247, "reward_std": 0.12647826969623566, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2127612680196762, "rewards/logprob_reward/std": 0.26926350593566895, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 619.21875, "completions/mean_terminated_length": 619.21875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 1.3919753086419753, "grad_norm": 1.7225349792099427, "kl": 0.302978515625, "learning_rate": 4.2383633333082423e-07, "loss": -0.2504, "num_tokens": 12208638.0, "reward": 0.3069344460964203, "reward_std": 0.13005352020263672, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2438160479068756, "rewards/logprob_reward/std": 0.2806828022003174, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 645.28125, "completions/mean_terminated_length": 645.28125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 1.3950617283950617, "grad_norm": 1.7421468900955437, "kl": 0.2884521484375, "learning_rate": 4.234766946876965e-07, "loss": -0.1319, "num_tokens": 12235719.0, "reward": 0.32916927337646484, "reward_std": 0.04509817808866501, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.25810474157333374, "rewards/logprob_reward/std": 0.27571651339530945, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 619.5, "completions/mean_terminated_length": 606.4515991210938, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 1.3981481481481481, "grad_norm": 1.8565168219557857, "kl": 0.2828369140625, "learning_rate": 4.231163623179335e-07, "loss": -0.0976, "num_tokens": 12261911.0, "reward": 0.3649769127368927, "reward_std": 0.1030060350894928, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3013632297515869, "rewards/logprob_reward/std": 0.30950435996055603, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 702.21875, "completions/mean_terminated_length": 691.8386840820312, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 1.4012345679012346, "grad_norm": 1.8249479236742987, "kl": 0.265625, "learning_rate": 4.227553376624904e-07, "loss": 0.0315, "num_tokens": 12290814.0, "reward": 0.45419785380363464, "reward_std": 0.08522972464561462, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3970254063606262, "rewards/logprob_reward/std": 0.29944905638694763, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 667.15625, "completions/mean_terminated_length": 667.15625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.404320987654321, "grad_norm": 1.7165628326941342, "kl": 0.2611083984375, "learning_rate": 4.22393622165091e-07, "loss": -0.1062, "num_tokens": 12318751.0, "reward": 0.4371485114097595, "reward_std": 0.1147489994764328, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.38155388832092285, "rewards/logprob_reward/std": 0.31830334663391113, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 678.59375, "completions/mean_terminated_length": 678.59375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.4074074074074074, "grad_norm": 1.841675398563942, "kl": 0.272216796875, "learning_rate": 4.220312172722216e-07, "loss": -0.0899, "num_tokens": 12347058.0, "reward": 0.5687949061393738, "reward_std": 0.10135474056005478, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5278276801109314, "rewards/logprob_reward/std": 0.35609227418899536, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 686.9375, "completions/mean_terminated_length": 676.0645141601562, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 1.4104938271604939, "grad_norm": 1.9309761364999138, "kl": 0.25146484375, "learning_rate": 4.216681244331256e-07, "loss": -0.1145, "num_tokens": 12376028.0, "reward": 0.3074977695941925, "reward_std": 0.11579814553260803, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.244441956281662, "rewards/logprob_reward/std": 0.2673291563987732, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 624.09375, "completions/mean_terminated_length": 611.1935424804688, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.4135802469135803, "grad_norm": 1.7709283323979133, "kl": 0.2728271484375, "learning_rate": 4.2130434509979714e-07, "loss": -0.1819, "num_tokens": 12402603.0, "reward": 0.3058101534843445, "reward_std": 0.16067659854888916, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24951128661632538, "rewards/logprob_reward/std": 0.27765336632728577, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 699.65625, "completions/mean_terminated_length": 678.0333862304688, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.4166666666666667, "grad_norm": 1.5708970596382321, "kl": 0.2674560546875, "learning_rate": 4.209398807269758e-07, "loss": -0.1681, "num_tokens": 12431376.0, "reward": 0.2743222415447235, "reward_std": 0.12449018657207489, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.21452471613883972, "rewards/logprob_reward/std": 0.27530333399772644, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 656.75, "completions/mean_terminated_length": 656.75, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 1.4197530864197532, "grad_norm": 1.6442196441348378, "kl": 0.2681884765625, "learning_rate": 4.205747327721407e-07, "loss": -0.2139, "num_tokens": 12458792.0, "reward": 0.4269120693206787, "reward_std": 0.0731905996799469, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37365227937698364, "rewards/logprob_reward/std": 0.3116844892501831, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 666.9375, "completions/mean_terminated_length": 655.4193115234375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 1.4228395061728394, "grad_norm": 1.7423564930575814, "kl": 0.29248046875, "learning_rate": 4.2020890269550454e-07, "loss": -0.0057, "num_tokens": 12486370.0, "reward": 0.5228009223937988, "reward_std": 0.06369026750326157, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4732511043548584, "rewards/logprob_reward/std": 0.34157904982566833, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 656.1875, "completions/mean_terminated_length": 644.3225708007812, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.425925925925926, "grad_norm": 1.6498108309995225, "kl": 0.2833251953125, "learning_rate": 4.198423919600076e-07, "loss": -0.1474, "num_tokens": 12513820.0, "reward": 0.4474060535430908, "reward_std": 0.11143474280834198, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3964233994483948, "rewards/logprob_reward/std": 0.31516239047050476, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 646.53125, "completions/mean_terminated_length": 646.53125, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 1.4290123456790123, "grad_norm": 1.8138417949897414, "kl": 0.3167724609375, "learning_rate": 4.1947520203131217e-07, "loss": -0.0725, "num_tokens": 12540805.0, "reward": 0.5504790544509888, "reward_std": 0.08624166250228882, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.5040044784545898, "rewards/logprob_reward/std": 0.36580199003219604, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 669.6875, "completions/mean_terminated_length": 669.6875, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.4320987654320987, "grad_norm": 1.6678822705940968, "kl": 0.307373046875, "learning_rate": 4.191073343777968e-07, "loss": -0.169, "num_tokens": 12569363.0, "reward": 0.2993100583553314, "reward_std": 0.05184660851955414, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.22840005159378052, "rewards/logprob_reward/std": 0.31269511580467224, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 663.71875, "completions/mean_terminated_length": 639.7000122070312, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 1.4351851851851851, "grad_norm": 1.588850632658499, "kl": 0.296630859375, "learning_rate": 4.1873879047055005e-07, "loss": -0.159, "num_tokens": 12597182.0, "reward": 0.2417197823524475, "reward_std": 0.12020526826381683, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.17135532200336456, "rewards/logprob_reward/std": 0.27957800030708313, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 678.65625, "completions/mean_terminated_length": 655.6333618164062, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.4382716049382716, "grad_norm": 1.6407745799975963, "kl": 0.28466796875, "learning_rate": 4.183695717833649e-07, "loss": -0.0078, "num_tokens": 12625419.0, "reward": 0.25879836082458496, "reward_std": 0.09372325241565704, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.18685927987098694, "rewards/logprob_reward/std": 0.22162769734859467, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 664.46875, "completions/mean_terminated_length": 664.46875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 1.441358024691358, "grad_norm": 1.9780290787705557, "kl": 0.2960205078125, "learning_rate": 4.179996797927326e-07, "loss": -0.1364, "num_tokens": 12652942.0, "reward": 0.3983832597732544, "reward_std": 0.0513664186000824, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3384813666343689, "rewards/logprob_reward/std": 0.3200857639312744, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 639.25, "completions/mean_terminated_length": 639.25, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.4444444444444444, "grad_norm": 1.7729840405924318, "kl": 0.2916259765625, "learning_rate": 4.17629115977837e-07, "loss": -0.0646, "num_tokens": 12680258.0, "reward": 0.319660484790802, "reward_std": 0.05290937423706055, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2544838786125183, "rewards/logprob_reward/std": 0.3000459372997284, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 685.46875, "completions/mean_terminated_length": 685.46875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.4475308641975309, "grad_norm": 1.7408547513682184, "kl": 0.299560546875, "learning_rate": 4.1725788182054867e-07, "loss": -0.1353, "num_tokens": 12709073.0, "reward": 0.40663546323776245, "reward_std": 0.05858787149190903, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.35112273693084717, "rewards/logprob_reward/std": 0.3067576289176941, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 647.71875, "completions/mean_terminated_length": 635.5806274414062, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.4506172839506173, "grad_norm": 1.654203065970209, "kl": 0.298828125, "learning_rate": 4.1688597880541863e-07, "loss": -0.2158, "num_tokens": 12736320.0, "reward": 0.3193134069442749, "reward_std": 0.15462279319763184, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.26451486349105835, "rewards/logprob_reward/std": 0.2710249423980713, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 652.1875, "completions/mean_terminated_length": 652.1875, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.4537037037037037, "grad_norm": 1.7394871711319955, "kl": 0.2982177734375, "learning_rate": 4.1651340841967284e-07, "loss": -0.2201, "num_tokens": 12763494.0, "reward": 0.3173232078552246, "reward_std": 0.07067559659481049, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.25535911321640015, "rewards/logprob_reward/std": 0.3024485111236572, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 725.9375, "completions/mean_terminated_length": 683.357177734375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.4567901234567902, "grad_norm": 1.539979896925199, "kl": 0.2813720703125, "learning_rate": 4.161401721532059e-07, "loss": -0.0313, "num_tokens": 12793516.0, "reward": 0.3675277829170227, "reward_std": 0.16590730845928192, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3146141767501831, "rewards/logprob_reward/std": 0.27219313383102417, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 651.84375, "completions/mean_terminated_length": 627.0333862304688, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.4598765432098766, "grad_norm": 1.7233478757972764, "kl": 0.30126953125, "learning_rate": 4.1576627149857513e-07, "loss": -0.1177, "num_tokens": 12820615.0, "reward": 0.2186134159564972, "reward_std": 0.14007329940795898, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.15609824657440186, "rewards/logprob_reward/std": 0.24942095577716827, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 702.71875, "completions/mean_terminated_length": 681.300048828125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 1.462962962962963, "grad_norm": 1.6261578350125432, "kl": 0.3060302734375, "learning_rate": 4.153917079509952e-07, "loss": 0.003, "num_tokens": 12849286.0, "reward": 0.4375837445259094, "reward_std": 0.06572870910167694, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.38203755021095276, "rewards/logprob_reward/std": 0.31925711035728455, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 613.78125, "completions/mean_terminated_length": 600.54833984375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 1.4660493827160495, "grad_norm": 1.7270665539243555, "kl": 0.328857421875, "learning_rate": 4.150164830083311e-07, "loss": -0.0418, "num_tokens": 12874831.0, "reward": 0.2839992046356201, "reward_std": 0.08301331102848053, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2113879919052124, "rewards/logprob_reward/std": 0.2617226541042328, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 637.25, "completions/mean_terminated_length": 637.25, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.4691358024691357, "grad_norm": 1.9498047944665835, "kl": 0.29638671875, "learning_rate": 4.146405981710931e-07, "loss": -0.2383, "num_tokens": 12902103.0, "reward": 0.35477036237716675, "reward_std": 0.1580616533756256, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3039115071296692, "rewards/logprob_reward/std": 0.31024959683418274, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 687.7037353515625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.4722222222222223, "grad_norm": 1.8104589412001122, "kl": 0.306640625, "learning_rate": 4.142640549424302e-07, "loss": 0.0842, "num_tokens": 12932595.0, "reward": 0.3414173722267151, "reward_std": 0.08399326354265213, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2856026291847229, "rewards/logprob_reward/std": 0.303835928440094, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 712.90625, "completions/mean_terminated_length": 680.72412109375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.4753086419753085, "grad_norm": 1.717016321500636, "kl": 0.2825927734375, "learning_rate": 4.1388685482812413e-07, "loss": -0.102, "num_tokens": 12962136.0, "reward": 0.4072793126106262, "reward_std": 0.12144917249679565, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.358782559633255, "rewards/logprob_reward/std": 0.35088032484054565, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 645.28125, "completions/mean_terminated_length": 645.28125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.4783950617283952, "grad_norm": 1.7610270894290763, "kl": 0.32861328125, "learning_rate": 4.135089993365839e-07, "loss": 0.0351, "num_tokens": 12988933.0, "reward": 0.40260133147239685, "reward_std": 0.027284763753414154, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.33622369170188904, "rewards/logprob_reward/std": 0.3135136663913727, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 693.0625, "completions/mean_terminated_length": 671.0000610351562, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 1.4814814814814814, "grad_norm": 1.8050476697780888, "kl": 0.2879638671875, "learning_rate": 4.131304899788389e-07, "loss": -0.0575, "num_tokens": 13017459.0, "reward": 0.2166840136051178, "reward_std": 0.02420724555850029, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.14006556570529938, "rewards/logprob_reward/std": 0.3111254870891571, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 702.125, "completions/mean_terminated_length": 656.1428833007812, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 1.4845679012345678, "grad_norm": 1.899073484161513, "kl": 0.326171875, "learning_rate": 4.127513282685336e-07, "loss": -0.0505, "num_tokens": 13046627.0, "reward": 0.3510780334472656, "reward_std": 0.10593295097351074, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29286450147628784, "rewards/logprob_reward/std": 0.3164975345134735, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 633.9375, "completions/mean_terminated_length": 633.9375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 1.4876543209876543, "grad_norm": 1.6612344360865268, "kl": 0.30078125, "learning_rate": 4.123715157219211e-07, "loss": -0.2384, "num_tokens": 13072901.0, "reward": 0.4000457227230072, "reward_std": 0.1173812597990036, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3472730219364166, "rewards/logprob_reward/std": 0.3256080448627472, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 647.28125, "completions/mean_terminated_length": 635.1290283203125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 1.4907407407407407, "grad_norm": 1.7435896124458152, "kl": 0.314697265625, "learning_rate": 4.1199105385785727e-07, "loss": -0.049, "num_tokens": 13099902.0, "reward": 0.23226365447044373, "reward_std": 0.022250061854720116, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.15043184161186218, "rewards/logprob_reward/std": 0.2844749689102173, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 680.1875, "completions/mean_terminated_length": 680.1875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.4938271604938271, "grad_norm": 1.6277762713948583, "kl": 0.2999267578125, "learning_rate": 4.116099441977943e-07, "loss": -0.0246, "num_tokens": 13128336.0, "reward": 0.45115000009536743, "reward_std": 0.0666617676615715, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3936389088630676, "rewards/logprob_reward/std": 0.32673826813697815, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 659.5625, "completions/mean_terminated_length": 659.5625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.4969135802469136, "grad_norm": 1.669341256909508, "kl": 0.34033203125, "learning_rate": 4.112281882657751e-07, "loss": -0.1168, "num_tokens": 13156114.0, "reward": 0.3881649076938629, "reward_std": 0.11646661162376404, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33059990406036377, "rewards/logprob_reward/std": 0.3713279366493225, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 641.6875, "completions/mean_terminated_length": 641.6875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 1.5, "grad_norm": 1.6810633037756597, "kl": 0.331298828125, "learning_rate": 4.1084578758842714e-07, "loss": -0.2342, "num_tokens": 13183188.0, "reward": 0.3575547933578491, "reward_std": 0.05817033350467682, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.30006086826324463, "rewards/logprob_reward/std": 0.3089773654937744, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 644.75, "completions/mean_terminated_length": 644.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.5030864197530864, "grad_norm": 1.5738377480084533, "kl": 0.33984375, "learning_rate": 4.104627436949559e-07, "loss": -0.2078, "num_tokens": 13210056.0, "reward": 0.4084114730358124, "reward_std": 0.1359805166721344, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3530960977077484, "rewards/logprob_reward/std": 0.3314626216888428, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 708.3125, "completions/mean_terminated_length": 687.2667236328125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.5061728395061729, "grad_norm": 1.6665736732077019, "kl": 0.308349609375, "learning_rate": 4.1007905811713915e-07, "loss": -0.05, "num_tokens": 13239018.0, "reward": 0.30789151787757874, "reward_std": 0.10876470804214478, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.25182390213012695, "rewards/logprob_reward/std": 0.33362630009651184, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 628.4375, "completions/mean_terminated_length": 615.6774291992188, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.5092592592592593, "grad_norm": 1.6611003431428804, "kl": 0.343505859375, "learning_rate": 4.096947323893209e-07, "loss": -0.0601, "num_tokens": 13265168.0, "reward": 0.40426886081695557, "reward_std": 0.035667628049850464, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.34154874086380005, "rewards/logprob_reward/std": 0.32039040327072144, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 710.125, "completions/mean_terminated_length": 677.6551513671875, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.5123456790123457, "grad_norm": 1.7197402780375335, "kl": 0.33837890625, "learning_rate": 4.0930976804840487e-07, "loss": -0.0217, "num_tokens": 13294232.0, "reward": 0.3448511064052582, "reward_std": 0.10858896374702454, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.28941789269447327, "rewards/logprob_reward/std": 0.2727254629135132, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 622.03125, "completions/mean_terminated_length": 622.03125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.515432098765432, "grad_norm": 1.742918076560471, "kl": 0.345703125, "learning_rate": 4.0892416663384874e-07, "loss": -0.1184, "num_tokens": 13320261.0, "reward": 0.3053366541862488, "reward_std": 0.058555878698825836, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.23509632050991058, "rewards/logprob_reward/std": 0.27831724286079407, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 662.4375, "completions/mean_terminated_length": 638.3333740234375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 1.5185185185185186, "grad_norm": 1.7024203603138879, "kl": 0.3271484375, "learning_rate": 4.0853792968765765e-07, "loss": -0.0645, "num_tokens": 13347963.0, "reward": 0.28549724817276, "reward_std": 0.11276431381702423, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.21999698877334595, "rewards/logprob_reward/std": 0.28163352608680725, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 652.03125, "completions/mean_terminated_length": 652.03125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.5216049382716048, "grad_norm": 2.6534547187956288, "kl": 0.343017578125, "learning_rate": 4.081510587543784e-07, "loss": -0.2368, "num_tokens": 13375280.0, "reward": 0.2914389371871948, "reward_std": 0.10471706092357635, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2231266349554062, "rewards/logprob_reward/std": 0.2874836325645447, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 714.375, "completions/mean_terminated_length": 693.7333984375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.5246913580246915, "grad_norm": 1.7661428850915075, "kl": 0.328369140625, "learning_rate": 4.0776355538109285e-07, "loss": -0.2306, "num_tokens": 13405180.0, "reward": 0.3396759629249573, "reward_std": 0.17052994668483734, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2871399521827698, "rewards/logprob_reward/std": 0.32741817831993103, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 661.28125, "completions/mean_terminated_length": 637.1000366210938, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 1.5277777777777777, "grad_norm": 1.9457265662963257, "kl": 0.325439453125, "learning_rate": 4.073754211174123e-07, "loss": -0.0898, "num_tokens": 13432809.0, "reward": 0.40113365650177, "reward_std": 0.08360134065151215, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.34848183393478394, "rewards/logprob_reward/std": 0.315945029258728, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 677.8125, "completions/mean_terminated_length": 654.7333374023438, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.5308641975308643, "grad_norm": 1.607906836908459, "kl": 0.3289794921875, "learning_rate": 4.069866575154706e-07, "loss": -0.0655, "num_tokens": 13460847.0, "reward": 0.2800005078315735, "reward_std": 0.09354648739099503, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2138894647359848, "rewards/logprob_reward/std": 0.31120461225509644, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 699.1875, "completions/mean_terminated_length": 688.7096557617188, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.5339506172839505, "grad_norm": 1.4560712840811845, "kl": 0.32958984375, "learning_rate": 4.0659726612991853e-07, "loss": -0.1978, "num_tokens": 13489929.0, "reward": 0.2921089828014374, "reward_std": 0.06781557947397232, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22734330594539642, "rewards/logprob_reward/std": 0.3027639091014862, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 629.09375, "completions/mean_terminated_length": 616.3547973632812, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 1.5370370370370372, "grad_norm": 1.5262139013635476, "kl": 0.3150634765625, "learning_rate": 4.062072485179172e-07, "loss": -0.0948, "num_tokens": 13516188.0, "reward": 0.24819517135620117, "reward_std": 0.08755959570407867, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.18896688520908356, "rewards/logprob_reward/std": 0.2810383141040802, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 726.78125, "completions/mean_terminated_length": 684.3214721679688, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 1.5401234567901234, "grad_norm": 1.4925708814961112, "kl": 0.3408203125, "learning_rate": 4.0581660623913216e-07, "loss": 0.0155, "num_tokens": 13546465.0, "reward": 0.3662879467010498, "reward_std": 0.06421985477209091, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3201810419559479, "rewards/logprob_reward/std": 0.3515361249446869, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 656.34375, "completions/mean_terminated_length": 656.34375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 1.5432098765432098, "grad_norm": 1.8595894832149937, "kl": 0.341796875, "learning_rate": 4.0542534085572677e-07, "loss": -0.1224, "num_tokens": 13573616.0, "reward": 0.32948625087738037, "reward_std": 0.03549141809344292, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2584569454193115, "rewards/logprob_reward/std": 0.31909671425819397, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 712.28125, "completions/mean_terminated_length": 691.5000610351562, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 1.5462962962962963, "grad_norm": 1.5310766413891215, "kl": 0.30908203125, "learning_rate": 4.050334539323563e-07, "loss": -0.1116, "num_tokens": 13602629.0, "reward": 0.3345943093299866, "reward_std": 0.10310542583465576, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2745492458343506, "rewards/logprob_reward/std": 0.314729779958725, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 650.25, "completions/mean_terminated_length": 650.25, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.5493827160493827, "grad_norm": 1.9538270450666084, "kl": 0.356689453125, "learning_rate": 4.046409470361615e-07, "loss": -0.0559, "num_tokens": 13629889.0, "reward": 0.2929022014141083, "reward_std": 0.04634756222367287, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.22128021717071533, "rewards/logprob_reward/std": 0.27578675746917725, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 705.4375, "completions/mean_terminated_length": 684.2000122070312, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.5524691358024691, "grad_norm": 1.7120784926279886, "kl": 0.33984375, "learning_rate": 4.0424782173676235e-07, "loss": -0.067, "num_tokens": 13659235.0, "reward": 0.5592725872993469, "reward_std": 0.14062070846557617, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5207195281982422, "rewards/logprob_reward/std": 0.33387675881385803, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 660.5625, "completions/mean_terminated_length": 648.8386840820312, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 1.5555555555555556, "grad_norm": 1.6606948676057802, "kl": 0.343994140625, "learning_rate": 4.0385407960625185e-07, "loss": -0.0523, "num_tokens": 13686685.0, "reward": 0.5368808507919312, "reward_std": 0.06722936034202576, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.49236762523651123, "rewards/logprob_reward/std": 0.29157769680023193, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 683.5, "completions/mean_terminated_length": 660.800048828125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 1.558641975308642, "grad_norm": 1.6811227040533339, "kl": 0.34228515625, "learning_rate": 4.034597222191896e-07, "loss": -0.1028, "num_tokens": 13714781.0, "reward": 0.4721260070800781, "reward_std": 0.13844622671604156, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.42736223340034485, "rewards/logprob_reward/std": 0.3467288911342621, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 682.53125, "completions/mean_terminated_length": 647.2069091796875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.5617283950617284, "grad_norm": 1.880528510186601, "kl": 0.36376953125, "learning_rate": 4.030647511525956e-07, "loss": -0.1281, "num_tokens": 13743182.0, "reward": 0.24451838433742523, "reward_std": 0.1296471655368805, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1814093142747879, "rewards/logprob_reward/std": 0.2609933912754059, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 712.0, "completions/mean_terminated_length": 701.9354858398438, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.5648148148148149, "grad_norm": 1.5629142299510113, "kl": 0.326171875, "learning_rate": 4.0266916798594417e-07, "loss": -0.0174, "num_tokens": 13772450.0, "reward": 0.40720510482788086, "reward_std": 0.038407742977142334, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.34481120109558105, "rewards/logprob_reward/std": 0.35448914766311646, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 612.21875, "completions/mean_terminated_length": 612.21875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 1.567901234567901, "grad_norm": 1.9224789368464252, "kl": 0.340576171875, "learning_rate": 4.02272974301157e-07, "loss": -0.0219, "num_tokens": 13798297.0, "reward": 0.3401462435722351, "reward_std": 0.02498357743024826, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.26682913303375244, "rewards/logprob_reward/std": 0.29686465859413147, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 659.9375, "completions/mean_terminated_length": 648.1935424804688, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 1.5709876543209877, "grad_norm": 1.63998178825964, "kl": 0.34033203125, "learning_rate": 4.018761716825974e-07, "loss": -0.0992, "num_tokens": 13825823.0, "reward": 0.40080416202545166, "reward_std": 0.10780981183052063, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3446435332298279, "rewards/logprob_reward/std": 0.26984983682632446, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 684.5625, "completions/mean_terminated_length": 684.5625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 1.574074074074074, "grad_norm": 1.9176512086047477, "kl": 0.331298828125, "learning_rate": 4.014787617170639e-07, "loss": -0.1169, "num_tokens": 13853801.0, "reward": 0.4687584638595581, "reward_std": 0.07805125415325165, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4201483130455017, "rewards/logprob_reward/std": 0.339819997549057, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 644.46875, "completions/mean_terminated_length": 619.1666870117188, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 1.5771604938271606, "grad_norm": 1.6512666482290077, "kl": 0.344482421875, "learning_rate": 4.010807459937836e-07, "loss": -0.1461, "num_tokens": 13880716.0, "reward": 0.39378565549850464, "reward_std": 0.088468998670578, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.34378960728645325, "rewards/logprob_reward/std": 0.3410727083683014, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 653.625, "completions/mean_terminated_length": 653.625, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.5802469135802468, "grad_norm": 1.671809174462636, "kl": 0.35205078125, "learning_rate": 4.006821261044061e-07, "loss": -0.0878, "num_tokens": 13908600.0, "reward": 0.39071688055992126, "reward_std": 0.056234851479530334, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.32649099826812744, "rewards/logprob_reward/std": 0.3350942134857178, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 637.53125, "completions/mean_terminated_length": 625.0645141601562, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.5833333333333335, "grad_norm": 1.654330362130557, "kl": 0.3544921875, "learning_rate": 4.002829036429971e-07, "loss": -0.0632, "num_tokens": 13935505.0, "reward": 0.43745487928390503, "reward_std": 0.0654434859752655, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3818943202495575, "rewards/logprob_reward/std": 0.3222728371620178, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 711.09375, "completions/mean_terminated_length": 711.09375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 1.5864197530864197, "grad_norm": 1.59436931967049, "kl": 0.326416015625, "learning_rate": 3.998830802060317e-07, "loss": -0.1089, "num_tokens": 13964592.0, "reward": 0.5349079370498657, "reward_std": 0.044535908848047256, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4867032766342163, "rewards/logprob_reward/std": 0.28785204887390137, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 700.5, "completions/mean_terminated_length": 678.933349609375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 1.5895061728395061, "grad_norm": 1.9830912238131653, "kl": 0.3427734375, "learning_rate": 3.994826573923886e-07, "loss": -0.0112, "num_tokens": 13994404.0, "reward": 0.4081534743309021, "reward_std": 0.09322597086429596, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3562816381454468, "rewards/logprob_reward/std": 0.31925255060195923, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 596.71875, "completions/mean_terminated_length": 596.71875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 1.5925925925925926, "grad_norm": 1.689035865386424, "kl": 0.353759765625, "learning_rate": 3.9908163680334326e-07, "loss": -0.1287, "num_tokens": 14019495.0, "reward": 0.26731565594673157, "reward_std": 0.0294056236743927, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.1928507387638092, "rewards/logprob_reward/std": 0.30968308448791504, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 693.9375, "completions/mean_terminated_length": 683.290283203125, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 1.595679012345679, "grad_norm": 1.7802795651264582, "kl": 0.31494140625, "learning_rate": 3.9868002004256165e-07, "loss": 0.0107, "num_tokens": 14048577.0, "reward": 0.3223717510700226, "reward_std": 0.04483116790652275, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2505519390106201, "rewards/logprob_reward/std": 0.3064448833465576, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 632.90625, "completions/mean_terminated_length": 606.8333740234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 1.5987654320987654, "grad_norm": 2.7507523015216906, "kl": 0.319091796875, "learning_rate": 3.982778087160935e-07, "loss": -0.2782, "num_tokens": 14075154.0, "reward": 0.3087996244430542, "reward_std": 0.11147301644086838, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.25977736711502075, "rewards/logprob_reward/std": 0.3210987448692322, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 717.53125, "completions/mean_terminated_length": 707.6451416015625, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 1.6018518518518519, "grad_norm": 1.6619311999763997, "kl": 0.334716796875, "learning_rate": 3.9787500443236664e-07, "loss": -0.0536, "num_tokens": 14104927.0, "reward": 0.42025119066238403, "reward_std": 0.08252840489149094, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.36277908086776733, "rewards/logprob_reward/std": 0.37111157178878784, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 671.09375, "completions/mean_terminated_length": 647.5667114257812, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.6049382716049383, "grad_norm": 1.9089003764359953, "kl": 0.382568359375, "learning_rate": 3.9747160880217994e-07, "loss": 0.067, "num_tokens": 14133314.0, "reward": 0.35402050614356995, "reward_std": 0.06606175005435944, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2891894578933716, "rewards/logprob_reward/std": 0.3357047140598297, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 608.6875, "completions/mean_terminated_length": 608.6875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.6080246913580247, "grad_norm": 1.7975649058875252, "kl": 0.387939453125, "learning_rate": 3.9706762343869705e-07, "loss": -0.0922, "num_tokens": 14159268.0, "reward": 0.495649129152298, "reward_std": 0.13956648111343384, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4500267803668976, "rewards/logprob_reward/std": 0.3299814462661743, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 673.25, "completions/mean_terminated_length": 673.25, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.6111111111111112, "grad_norm": 1.5489321758739614, "kl": 0.349853515625, "learning_rate": 3.966630499574397e-07, "loss": -0.0981, "num_tokens": 14187604.0, "reward": 0.5752013921737671, "reward_std": 0.04745848476886749, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5384182333946228, "rewards/logprob_reward/std": 0.33066999912261963, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 676.125, "completions/mean_terminated_length": 676.125, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 1.6141975308641974, "grad_norm": 1.8401942496067614, "kl": 0.362548828125, "learning_rate": 3.9625788997628196e-07, "loss": -0.0354, "num_tokens": 14216048.0, "reward": 0.4444113075733185, "reward_std": 0.010376403108239174, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.3826792538166046, "rewards/logprob_reward/std": 0.3483096659183502, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 640.5, "completions/mean_terminated_length": 640.5, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 1.617283950617284, "grad_norm": 1.6924934372607516, "kl": 0.353271484375, "learning_rate": 3.958521451154428e-07, "loss": -0.0581, "num_tokens": 14243516.0, "reward": 0.4543353319168091, "reward_std": 0.06058476120233536, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.39717814326286316, "rewards/logprob_reward/std": 0.31436458230018616, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 688.375, "completions/mean_terminated_length": 666.0000610351562, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 1.6203703703703702, "grad_norm": 1.610660406692888, "kl": 0.34130859375, "learning_rate": 3.954458169974805e-07, "loss": -0.0259, "num_tokens": 14272112.0, "reward": 0.4688533842563629, "reward_std": 0.13230594992637634, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4202537536621094, "rewards/logprob_reward/std": 0.3025515079498291, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 634.625, "completions/mean_terminated_length": 634.625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 1.623456790123457, "grad_norm": 1.8297120842863874, "kl": 0.36376953125, "learning_rate": 3.950389072472855e-07, "loss": -0.0451, "num_tokens": 14298376.0, "reward": 0.5358814001083374, "reward_std": 0.04500151425600052, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.48431265354156494, "rewards/logprob_reward/std": 0.3575548529624939, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 611.875, "completions/mean_terminated_length": 611.875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 1.626543209876543, "grad_norm": 1.7258362505254745, "kl": 0.369873046875, "learning_rate": 3.9463141749207425e-07, "loss": -0.1286, "num_tokens": 14324480.0, "reward": 0.3218584358692169, "reward_std": 0.08617682754993439, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2534538209438324, "rewards/logprob_reward/std": 0.3151490092277527, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 649.125, "completions/mean_terminated_length": 637.0322265625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.6296296296296298, "grad_norm": 1.870207164039645, "kl": 0.33056640625, "learning_rate": 3.9422334936138255e-07, "loss": -0.0223, "num_tokens": 14351848.0, "reward": 0.4252345860004425, "reward_std": 0.09438787400722504, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3683162033557892, "rewards/logprob_reward/std": 0.3274937570095062, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 723.3125, "completions/mean_terminated_length": 692.2069091796875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 1.632716049382716, "grad_norm": 1.7992652556820592, "kl": 0.34814453125, "learning_rate": 3.938147044870594e-07, "loss": -0.1024, "num_tokens": 14381646.0, "reward": 0.2941952347755432, "reward_std": 0.08251428604125977, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22966134548187256, "rewards/logprob_reward/std": 0.32134273648262024, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 680.84375, "completions/mean_terminated_length": 669.774169921875, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 1.6358024691358026, "grad_norm": 1.7570665979665783, "kl": 0.33642578125, "learning_rate": 3.934054845032598e-07, "loss": -0.0838, "num_tokens": 14410037.0, "reward": 0.5867519974708557, "reward_std": 0.14281043410301208, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5512522459030151, "rewards/logprob_reward/std": 0.3413895070552826, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 647.625, "completions/mean_terminated_length": 647.625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.6388888888888888, "grad_norm": 1.752559160624634, "kl": 0.362548828125, "learning_rate": 3.9299569104643876e-07, "loss": -0.082, "num_tokens": 14437205.0, "reward": 0.3480065166950226, "reward_std": 0.017046008259058, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2755628228187561, "rewards/logprob_reward/std": 0.2897040843963623, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 639.4375, "completions/mean_terminated_length": 639.4375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.6419753086419753, "grad_norm": 1.6734941842161508, "kl": 0.373291015625, "learning_rate": 3.925853257553445e-07, "loss": 0.0101, "num_tokens": 14463599.0, "reward": 0.41276389360427856, "reward_std": 0.04166366159915924, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.35098767280578613, "rewards/logprob_reward/std": 0.3164430856704712, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 731.1875, "completions/mean_terminated_length": 711.6666870117188, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 1.6450617283950617, "grad_norm": 1.5566273862404882, "kl": 0.330322265625, "learning_rate": 3.921743902710122e-07, "loss": -0.1546, "num_tokens": 14493521.0, "reward": 0.246620774269104, "reward_std": 0.10049859434366226, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.17680084705352783, "rewards/logprob_reward/std": 0.26032939553260803, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 666.6875, "completions/mean_terminated_length": 642.86669921875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.6481481481481481, "grad_norm": 1.5911904650039235, "kl": 0.348388671875, "learning_rate": 3.917628862367569e-07, "loss": -0.1869, "num_tokens": 14521831.0, "reward": 0.25571227073669434, "reward_std": 0.12316013127565384, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19037476181983948, "rewards/logprob_reward/std": 0.28642648458480835, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 668.6875, "completions/mean_terminated_length": 657.2257690429688, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 1.6512345679012346, "grad_norm": 1.6923053624063342, "kl": 0.34619140625, "learning_rate": 3.913508152981674e-07, "loss": -0.0234, "num_tokens": 14549849.0, "reward": 0.4714509844779968, "reward_std": 0.13059422373771667, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4196677803993225, "rewards/logprob_reward/std": 0.31816789507865906, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 667.9375, "completions/mean_terminated_length": 644.2000122070312, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.654320987654321, "grad_norm": 1.808947174255735, "kl": 0.33154296875, "learning_rate": 3.909381791030998e-07, "loss": 0.0379, "num_tokens": 14578087.0, "reward": 0.42673569917678833, "reward_std": 0.08904215693473816, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.36998414993286133, "rewards/logprob_reward/std": 0.34248775243759155, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 639.625, "completions/mean_terminated_length": 627.2257690429688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.6574074074074074, "grad_norm": 1.508744154886012, "kl": 0.3720703125, "learning_rate": 3.905249793016702e-07, "loss": -0.0681, "num_tokens": 14604503.0, "reward": 0.32515716552734375, "reward_std": 0.028417132794857025, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2536468207836151, "rewards/logprob_reward/std": 0.30186712741851807, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 622.90625, "completions/mean_terminated_length": 622.90625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.6604938271604939, "grad_norm": 1.8083565370327086, "kl": 0.3642578125, "learning_rate": 3.9011121754624865e-07, "loss": 0.0601, "num_tokens": 14630816.0, "reward": 0.47574323415756226, "reward_std": 0.0656055212020874, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.42096471786499023, "rewards/logprob_reward/std": 0.2819656431674957, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 675.65625, "completions/mean_terminated_length": 652.433349609375, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 1.6635802469135803, "grad_norm": 1.745943725789985, "kl": 0.34228515625, "learning_rate": 3.8969689549145266e-07, "loss": 0.0031, "num_tokens": 14658313.0, "reward": 0.3900541365146637, "reward_std": 0.06733787059783936, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3292268216609955, "rewards/logprob_reward/std": 0.2828224301338196, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 635.03125, "completions/mean_terminated_length": 635.03125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.6666666666666665, "grad_norm": 1.7905638259672367, "kl": 0.353271484375, "learning_rate": 3.8928201479414024e-07, "loss": -0.0789, "num_tokens": 14684646.0, "reward": 0.4041874408721924, "reward_std": 0.028623167425394058, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.3379860818386078, "rewards/logprob_reward/std": 0.2746003270149231, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 642.46875, "completions/mean_terminated_length": 642.46875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 1.6697530864197532, "grad_norm": 1.8530364909594073, "kl": 0.352783203125, "learning_rate": 3.888665771134032e-07, "loss": -0.1937, "num_tokens": 14711745.0, "reward": 0.3126922845840454, "reward_std": 0.0742364227771759, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24674147367477417, "rewards/logprob_reward/std": 0.31494566798210144, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 691.6875, "completions/mean_terminated_length": 680.9677124023438, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 1.6728395061728394, "grad_norm": 1.8119400861753523, "kl": 0.37158203125, "learning_rate": 3.8845058411056095e-07, "loss": -0.0698, "num_tokens": 14741059.0, "reward": 0.514247715473175, "reward_std": 0.04992896318435669, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.46721965074539185, "rewards/logprob_reward/std": 0.32162925601005554, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 666.71875, "completions/mean_terminated_length": 642.9000244140625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.675925925925926, "grad_norm": 1.6531431711939002, "kl": 0.352783203125, "learning_rate": 3.880340374491535e-07, "loss": -0.1039, "num_tokens": 14768858.0, "reward": 0.3029060959815979, "reward_std": 0.09292587637901306, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24281233549118042, "rewards/logprob_reward/std": 0.2804667353630066, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 724.15625, "completions/mean_terminated_length": 724.15625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.6790123456790123, "grad_norm": 1.7759730905222038, "kl": 0.346923828125, "learning_rate": 3.8761693879493495e-07, "loss": -0.0339, "num_tokens": 14798911.0, "reward": 0.30212029814720154, "reward_std": 0.07162807881832123, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2245781123638153, "rewards/logprob_reward/std": 0.26613637804985046, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 619.21875, "completions/mean_terminated_length": 606.1612548828125, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 1.682098765432099, "grad_norm": 1.7930900673914212, "kl": 0.35791015625, "learning_rate": 3.871992898158667e-07, "loss": -0.0923, "num_tokens": 14824710.0, "reward": 0.4632956385612488, "reward_std": 0.16647809743881226, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4175507128238678, "rewards/logprob_reward/std": 0.35847988724708557, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 595.3125, "completions/mean_terminated_length": 595.3125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 1.6851851851851851, "grad_norm": 1.7456421229531904, "kl": 0.357666015625, "learning_rate": 3.867810921821112e-07, "loss": -0.0311, "num_tokens": 14849832.0, "reward": 0.3658713698387146, "reward_std": 0.022498883306980133, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2954126000404358, "rewards/logprob_reward/std": 0.2749062180519104, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 695.46875, "completions/mean_terminated_length": 673.5667114257812, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.6882716049382716, "grad_norm": 1.6571827890018376, "kl": 0.33203125, "learning_rate": 3.863623475660245e-07, "loss": 0.0252, "num_tokens": 14878891.0, "reward": 0.19036829471588135, "reward_std": 0.06843934208154678, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.10735367238521576, "rewards/logprob_reward/std": 0.2825632095336914, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 675.96875, "completions/mean_terminated_length": 664.741943359375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.691358024691358, "grad_norm": 1.756745527288289, "kl": 0.551025390625, "learning_rate": 3.859430576421503e-07, "loss": -0.0722, "num_tokens": 14907102.0, "reward": 0.42507806420326233, "reward_std": 0.15121257305145264, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3750867545604706, "rewards/logprob_reward/std": 0.3356475830078125, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 690.28125, "completions/mean_terminated_length": 668.0333862304688, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.6944444444444444, "grad_norm": 1.7381387514379045, "kl": 0.3408203125, "learning_rate": 3.855232240872128e-07, "loss": 0.029, "num_tokens": 14935443.0, "reward": 0.35399746894836426, "reward_std": 0.11043843626976013, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.28916388750076294, "rewards/logprob_reward/std": 0.3232763111591339, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 650.28125, "completions/mean_terminated_length": 638.2257690429688, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 1.6975308641975309, "grad_norm": 1.8465955316099198, "kl": 0.365966796875, "learning_rate": 3.851028485801105e-07, "loss": -0.1415, "num_tokens": 14962336.0, "reward": 0.3472544550895691, "reward_std": 0.09711381047964096, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2955605387687683, "rewards/logprob_reward/std": 0.2936035096645355, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 690.34375, "completions/mean_terminated_length": 628.5555419921875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 1.7006172839506173, "grad_norm": 1.7108365763521578, "kl": 0.38720703125, "learning_rate": 3.8468193280190864e-07, "loss": -0.0163, "num_tokens": 14990927.0, "reward": 0.26889538764953613, "reward_std": 0.13430963456630707, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2154393047094345, "rewards/logprob_reward/std": 0.30202263593673706, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 630.40625, "completions/mean_terminated_length": 630.40625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.7037037037037037, "grad_norm": 1.622651875014227, "kl": 0.3720703125, "learning_rate": 3.842604784358333e-07, "loss": -0.1725, "num_tokens": 15017456.0, "reward": 0.3989924490451813, "reward_std": 0.09176602214574814, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.339158296585083, "rewards/logprob_reward/std": 0.3244415819644928, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 661.375, "completions/mean_terminated_length": 649.6774291992188, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.7067901234567902, "grad_norm": 1.8298669997424495, "kl": 0.372314453125, "learning_rate": 3.8383848716726444e-07, "loss": -0.1475, "num_tokens": 15045280.0, "reward": 0.5385907888412476, "reward_std": 0.10829242318868637, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.501211941242218, "rewards/logprob_reward/std": 0.3262372314929962, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 640.25, "completions/mean_terminated_length": 627.8709716796875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.7098765432098766, "grad_norm": 1.7233471378789158, "kl": 0.400390625, "learning_rate": 3.8341596068372874e-07, "loss": -0.0124, "num_tokens": 15072148.0, "reward": 0.32031872868537903, "reward_std": 0.026581529527902603, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.24827080965042114, "rewards/logprob_reward/std": 0.3367535471916199, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 688.375, "completions/mean_terminated_length": 688.375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 1.7129629629629628, "grad_norm": 1.5257605335267712, "kl": 0.354736328125, "learning_rate": 3.829929006748934e-07, "loss": -0.1426, "num_tokens": 15100476.0, "reward": 0.3698934018611908, "reward_std": 0.053136810660362244, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.30682602524757385, "rewards/logprob_reward/std": 0.302482008934021, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 675.75, "completions/mean_terminated_length": 664.51611328125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.7160493827160495, "grad_norm": 1.6470560351151577, "kl": 0.38232421875, "learning_rate": 3.8256930883255927e-07, "loss": -0.1231, "num_tokens": 15128624.0, "reward": 0.33092033863067627, "reward_std": 0.05144982784986496, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27046701312065125, "rewards/logprob_reward/std": 0.3257130980491638, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 679.21875, "completions/mean_terminated_length": 668.0967407226562, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.7191358024691357, "grad_norm": 1.7351801280386998, "kl": 0.387939453125, "learning_rate": 3.8214518685065377e-07, "loss": -0.0775, "num_tokens": 15156975.0, "reward": 0.4172821044921875, "reward_std": 0.0829072892665863, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3594801127910614, "rewards/logprob_reward/std": 0.3225708603858948, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 677.34375, "completions/mean_terminated_length": 677.34375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.7222222222222223, "grad_norm": 1.7294683075061303, "kl": 0.36669921875, "learning_rate": 3.817205364252244e-07, "loss": -0.1594, "num_tokens": 15184890.0, "reward": 0.4971083104610443, "reward_std": 0.14131881296634674, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.45164811611175537, "rewards/logprob_reward/std": 0.3644844889640808, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 683.40625, "completions/mean_terminated_length": 634.75, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 1.7253086419753085, "grad_norm": 1.84311197089849, "kl": 0.394775390625, "learning_rate": 3.8129535925443187e-07, "loss": -0.0187, "num_tokens": 15213171.0, "reward": 0.33320316672325134, "reward_std": 0.1602332890033722, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27994799613952637, "rewards/logprob_reward/std": 0.32700470089912415, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 694.1875, "completions/mean_terminated_length": 694.1875, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 1.7283950617283952, "grad_norm": 1.4685063120330277, "kl": 0.35693359375, "learning_rate": 3.8086965703854336e-07, "loss": -0.0838, "num_tokens": 15241597.0, "reward": 0.5073024034500122, "reward_std": 0.04984048753976822, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4560304284095764, "rewards/logprob_reward/std": 0.2675994336605072, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 664.21875, "completions/mean_terminated_length": 664.21875, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.7314814814814814, "grad_norm": 1.5157350462269765, "kl": 0.3697509765625, "learning_rate": 3.8044343147992563e-07, "loss": -0.0611, "num_tokens": 15269468.0, "reward": 0.2525057792663574, "reward_std": 0.01685785874724388, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.17292310297489166, "rewards/logprob_reward/std": 0.30887579917907715, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 727.4375, "completions/mean_terminated_length": 696.7586059570312, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 1.734567901234568, "grad_norm": 1.4200840565592359, "kl": 0.380615234375, "learning_rate": 3.8001668428303847e-07, "loss": 0.0177, "num_tokens": 15299566.0, "reward": 0.326347291469574, "reward_std": 0.059085913002491, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2619136571884155, "rewards/logprob_reward/std": 0.31335076689720154, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 726.46875, "completions/mean_terminated_length": 695.6896362304688, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.7376543209876543, "grad_norm": 1.5614435426687499, "kl": 0.353271484375, "learning_rate": 3.7958941715442726e-07, "loss": -0.0413, "num_tokens": 15329837.0, "reward": 0.4401216506958008, "reward_std": 0.030299311503767967, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3918018341064453, "rewards/logprob_reward/std": 0.36950358748435974, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 697.21875, "completions/mean_terminated_length": 686.6774291992188, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 1.7407407407407407, "grad_norm": 1.58971531887673, "kl": 0.35302734375, "learning_rate": 3.791616318027171e-07, "loss": 0.0657, "num_tokens": 15358936.0, "reward": 0.3132648468017578, "reward_std": 0.03542498126626015, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.24043317139148712, "rewards/logprob_reward/std": 0.30018025636672974, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 732.21875, "completions/mean_terminated_length": 690.5357666015625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 1.7438271604938271, "grad_norm": 1.656633667314653, "kl": 0.365234375, "learning_rate": 3.78733329938605e-07, "loss": -0.0057, "num_tokens": 15389035.0, "reward": 0.34566572308540344, "reward_std": 0.14165744185447693, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.29032301902770996, "rewards/logprob_reward/std": 0.30284354090690613, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 626.28125, "completions/mean_terminated_length": 626.28125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.7469135802469136, "grad_norm": 1.5543439865666007, "kl": 0.37548828125, "learning_rate": 3.7830451327485367e-07, "loss": -0.2076, "num_tokens": 15414888.0, "reward": 0.4929235279560089, "reward_std": 0.13939961791038513, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.44352617859840393, "rewards/logprob_reward/std": 0.3277125060558319, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 710.71875, "completions/mean_terminated_length": 700.6128540039062, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.75, "grad_norm": 1.7804695786315095, "kl": 0.39453125, "learning_rate": 3.778751835262847e-07, "loss": -0.0494, "num_tokens": 15444435.0, "reward": 0.4575570821762085, "reward_std": 0.1351914405822754, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.40423011779785156, "rewards/logprob_reward/std": 0.29609009623527527, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 654.5, "completions/mean_terminated_length": 654.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 1.7530864197530864, "grad_norm": 1.6088491062115924, "kl": 0.38037109375, "learning_rate": 3.7744534240977085e-07, "loss": 0.0257, "num_tokens": 15471659.0, "reward": 0.4808681011199951, "reward_std": 0.08881210535764694, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4266590476036072, "rewards/logprob_reward/std": 0.32791951298713684, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 692.65625, "completions/mean_terminated_length": 645.3214721679688, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.7561728395061729, "grad_norm": 1.8433567651286928, "kl": 0.375244140625, "learning_rate": 3.7701499164423045e-07, "loss": -0.1023, "num_tokens": 15500412.0, "reward": 0.3531523048877716, "reward_std": 0.12531909346580505, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3021136522293091, "rewards/logprob_reward/std": 0.284778356552124, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 685.90625, "completions/mean_terminated_length": 685.90625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 1.7592592592592593, "grad_norm": 1.4967692218881388, "kl": 0.364501953125, "learning_rate": 3.7658413295061974e-07, "loss": -0.0665, "num_tokens": 15529113.0, "reward": 0.24972134828567505, "reward_std": 0.06304517388343811, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.16982927918434143, "rewards/logprob_reward/std": 0.25125589966773987, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 705.6875, "completions/mean_terminated_length": 684.4666748046875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.7623456790123457, "grad_norm": 1.604656087147012, "kl": 0.373046875, "learning_rate": 3.7615276805192595e-07, "loss": 0.0217, "num_tokens": 15558015.0, "reward": 0.3893260359764099, "reward_std": 0.1260727345943451, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3318900465965271, "rewards/logprob_reward/std": 0.2655092775821686, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 657.75, "completions/mean_terminated_length": 657.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 1.765432098765432, "grad_norm": 1.3644098405753282, "kl": 0.3837890625, "learning_rate": 3.7572089867316075e-07, "loss": -0.1725, "num_tokens": 15585215.0, "reward": 0.3458081781864166, "reward_std": 0.0818120539188385, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2835369110107422, "rewards/logprob_reward/std": 0.28677284717559814, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 693.96875, "completions/mean_terminated_length": 671.9666748046875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 1.7685185185185186, "grad_norm": 1.4633557422323784, "kl": 0.371337890625, "learning_rate": 3.7528852654135323e-07, "loss": -0.165, "num_tokens": 15614690.0, "reward": 0.46047812700271606, "reward_std": 0.1828877031803131, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.41789233684539795, "rewards/logprob_reward/std": 0.3763064444065094, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 716.65625, "completions/mean_terminated_length": 706.741943359375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 1.7716049382716048, "grad_norm": 1.5986771806121496, "kl": 0.384521484375, "learning_rate": 3.7485565338554294e-07, "loss": -0.0306, "num_tokens": 15644259.0, "reward": 0.3438904881477356, "reward_std": 0.05636099725961685, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.27446165680885315, "rewards/logprob_reward/std": 0.2866028845310211, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 669.71875, "completions/mean_terminated_length": 658.290283203125, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 1.7746913580246915, "grad_norm": 1.793094788913407, "kl": 0.5390625, "learning_rate": 3.7442228093677296e-07, "loss": -0.0618, "num_tokens": 15672118.0, "reward": 0.42978358268737793, "reward_std": 0.13361231982707977, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3837873041629791, "rewards/logprob_reward/std": 0.34469282627105713, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 710.4375, "completions/mean_terminated_length": 689.5333862304688, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 1.7777777777777777, "grad_norm": 1.533075817697161, "kl": 0.45458984375, "learning_rate": 3.7398841092808307e-07, "loss": -0.0584, "num_tokens": 15701436.0, "reward": 0.32136979699134827, "reward_std": 0.0514095202088356, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.26332753896713257, "rewards/logprob_reward/std": 0.3078490197658539, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 729.71875, "completions/mean_terminated_length": 710.1000366210938, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.7808641975308643, "grad_norm": 1.5306421509299435, "kl": 0.384033203125, "learning_rate": 3.735540450945028e-07, "loss": -0.0122, "num_tokens": 15731167.0, "reward": 0.44718992710113525, "reward_std": 0.13933010399341583, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3961832523345947, "rewards/logprob_reward/std": 0.33702927827835083, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 660.1875, "completions/mean_terminated_length": 648.4515991210938, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.7839506172839505, "grad_norm": 1.6453300164170337, "kl": 0.40380859375, "learning_rate": 3.731191851730443e-07, "loss": -0.136, "num_tokens": 15758377.0, "reward": 0.3259640336036682, "reward_std": 0.05456719547510147, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26496002078056335, "rewards/logprob_reward/std": 0.3232046365737915, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 697.7037353515625, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.7870370370370372, "grad_norm": 1.6631271097562075, "kl": 0.364501953125, "learning_rate": 3.7268383290269583e-07, "loss": -0.0545, "num_tokens": 15789143.0, "reward": 0.36026930809020996, "reward_std": 0.12670348584651947, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.31349366903305054, "rewards/logprob_reward/std": 0.3339720666408539, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 673.90625, "completions/mean_terminated_length": 673.90625, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.7901234567901234, "grad_norm": 1.703975315580601, "kl": 0.426025390625, "learning_rate": 3.7224799002441427e-07, "loss": -0.2326, "num_tokens": 15817544.0, "reward": 0.37692970037460327, "reward_std": 0.11854599416255951, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.32158854603767395, "rewards/logprob_reward/std": 0.29043349623680115, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 687.21875, "completions/mean_terminated_length": 687.21875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 1.7932098765432098, "grad_norm": 1.7261124313290757, "kl": 0.390380859375, "learning_rate": 3.718116582811186e-07, "loss": -0.0674, "num_tokens": 15846115.0, "reward": 0.34583932161331177, "reward_std": 0.11445088684558868, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2870436906814575, "rewards/logprob_reward/std": 0.35239043831825256, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 660.53125, "completions/mean_terminated_length": 648.8064575195312, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 1.7962962962962963, "grad_norm": 1.5940955362878022, "kl": 0.40087890625, "learning_rate": 3.713748394176827e-07, "loss": -0.2038, "num_tokens": 15873496.0, "reward": 0.22363172471523285, "reward_std": 0.07886569201946259, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1547296941280365, "rewards/logprob_reward/std": 0.26627713441848755, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 683.78125, "completions/mean_terminated_length": 648.586181640625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 1.7993827160493827, "grad_norm": 1.605826056158922, "kl": 0.407470703125, "learning_rate": 3.7093753518092853e-07, "loss": -0.1314, "num_tokens": 15901733.0, "reward": 0.2686939239501953, "reward_std": 0.07776576280593872, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20479878783226013, "rewards/logprob_reward/std": 0.25549501180648804, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 668.3125, "completions/mean_terminated_length": 656.8386840820312, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.8024691358024691, "grad_norm": 1.842817898127346, "kl": 0.41650390625, "learning_rate": 3.704997473196187e-07, "loss": -0.1874, "num_tokens": 15929939.0, "reward": 0.4359135627746582, "reward_std": 0.14617206156253815, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3905984163284302, "rewards/logprob_reward/std": 0.32774922251701355, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 651.03125, "completions/mean_terminated_length": 626.1666870117188, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.8055555555555556, "grad_norm": 1.7091695146824697, "kl": 0.442138671875, "learning_rate": 3.7006147758445017e-07, "loss": -0.0942, "num_tokens": 15957444.0, "reward": 0.2950804829597473, "reward_std": 0.11794188618659973, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22717274725437164, "rewards/logprob_reward/std": 0.2964441180229187, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 678.875, "completions/mean_terminated_length": 629.5714721679688, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 1.808641975308642, "grad_norm": 1.7291381516657178, "kl": 0.4150390625, "learning_rate": 3.696227277280467e-07, "loss": -0.1788, "num_tokens": 15985624.0, "reward": 0.3000914454460144, "reward_std": 0.1440761536359787, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.25357380509376526, "rewards/logprob_reward/std": 0.35216978192329407, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 654.65625, "completions/mean_terminated_length": 642.741943359375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 1.8117283950617284, "grad_norm": 1.2066538442278163, "kl": 0.37744140625, "learning_rate": 3.691834995049522e-07, "loss": -0.3243, "num_tokens": 16012849.0, "reward": 0.35727205872535706, "reward_std": 0.19804148375988007, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30669116973876953, "rewards/logprob_reward/std": 0.3091610372066498, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 639.8333740234375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.8148148148148149, "grad_norm": 1.8802276469621038, "kl": 0.42822265625, "learning_rate": 3.687437946716234e-07, "loss": -0.2653, "num_tokens": 16040480.0, "reward": 0.29515692591667175, "reward_std": 0.15594741702079773, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24114656448364258, "rewards/logprob_reward/std": 0.3019496202468872, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 668.40625, "completions/mean_terminated_length": 668.40625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.817901234567901, "grad_norm": 2.784345875300513, "kl": 0.369140625, "learning_rate": 3.68303614986423e-07, "loss": -0.5022, "num_tokens": 16068705.0, "reward": 0.36515969038009644, "reward_std": 0.19762031733989716, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.32587188482284546, "rewards/logprob_reward/std": 0.3808615207672119, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 681.21875, "completions/mean_terminated_length": 658.36669921875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 1.8209876543209877, "grad_norm": 1.7118898997008667, "kl": 0.4189453125, "learning_rate": 3.6786296220961277e-07, "loss": 0.0711, "num_tokens": 16096828.0, "reward": 0.2952471375465393, "reward_std": 0.16394685208797455, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2273579239845276, "rewards/logprob_reward/std": 0.321044921875, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 704.1875, "completions/mean_terminated_length": 682.86669921875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 1.824074074074074, "grad_norm": 1.849247795812898, "kl": 0.390625, "learning_rate": 3.6742183810334605e-07, "loss": -0.1278, "num_tokens": 16125906.0, "reward": 0.36467570066452026, "reward_std": 0.11812958121299744, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3149174451828003, "rewards/logprob_reward/std": 0.27399981021881104, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 687.5, "completions/mean_terminated_length": 687.5, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.8271604938271606, "grad_norm": 1.6746138640927068, "kl": 0.452392578125, "learning_rate": 3.6698024443166134e-07, "loss": -0.3047, "num_tokens": 16154890.0, "reward": 0.3672482967376709, "reward_std": 0.1447775512933731, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3143036365509033, "rewards/logprob_reward/std": 0.3351304829120636, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 658.375, "completions/mean_terminated_length": 634.0000610351562, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 1.8302469135802468, "grad_norm": 1.5405531811077184, "kl": 0.40576171875, "learning_rate": 3.6653818296047466e-07, "loss": -0.0804, "num_tokens": 16182426.0, "reward": 0.261323481798172, "reward_std": 0.13447348773479462, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.19313718378543854, "rewards/logprob_reward/std": 0.24218080937862396, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 647.5625, "completions/mean_terminated_length": 622.4666748046875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 1.8333333333333335, "grad_norm": 1.7570781445930013, "kl": 0.402099609375, "learning_rate": 3.660956554575729e-07, "loss": -0.3366, "num_tokens": 16209636.0, "reward": 0.3960472345352173, "reward_std": 0.21257005631923676, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.36366361379623413, "rewards/logprob_reward/std": 0.34580740332603455, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 687.03125, "completions/mean_terminated_length": 676.1612548828125, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 1.8364197530864197, "grad_norm": 1.5249700519780514, "kl": 0.39501953125, "learning_rate": 3.656526636926065e-07, "loss": -0.1362, "num_tokens": 16238353.0, "reward": 0.2718071937561035, "reward_std": 0.11010853946208954, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20478574931621552, "rewards/logprob_reward/std": 0.28518587350845337, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 683.4375, "completions/mean_terminated_length": 672.4515991210938, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.8395061728395061, "grad_norm": 1.612460563817424, "kl": 0.412841796875, "learning_rate": 3.652092094370826e-07, "loss": 0.0586, "num_tokens": 16266879.0, "reward": 0.34757891297340393, "reward_std": 0.019987735897302628, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2785598933696747, "rewards/logprob_reward/std": 0.2837047278881073, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 775.21875, "completions/mean_terminated_length": 717.8077392578125, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.8425925925925926, "grad_norm": 1.5932385132071285, "kl": 0.3984375, "learning_rate": 3.647652944643577e-07, "loss": -0.173, "num_tokens": 16298874.0, "reward": 0.22359932959079742, "reward_std": 0.08102130889892578, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.17205481231212616, "rewards/logprob_reward/std": 0.3204740285873413, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 660.84375, "completions/mean_terminated_length": 636.6333618164062, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 1.845679012345679, "grad_norm": 1.5526282596435892, "kl": 0.436767578125, "learning_rate": 3.6432092054963055e-07, "loss": -0.2527, "num_tokens": 16326457.0, "reward": 0.3753017783164978, "reward_std": 0.14999264478683472, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.32672423124313354, "rewards/logprob_reward/std": 0.3004819452762604, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 691.59375, "completions/mean_terminated_length": 669.433349609375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 1.8487654320987654, "grad_norm": 1.7233072385244923, "kl": 0.4267578125, "learning_rate": 3.638760894699355e-07, "loss": -0.1034, "num_tokens": 16355276.0, "reward": 0.25644612312316895, "reward_std": 0.1267407089471817, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.18771792948246002, "rewards/logprob_reward/std": 0.24069729447364807, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 667.71875, "completions/mean_terminated_length": 643.9666748046875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 1.8518518518518519, "grad_norm": 1.6709510732392059, "kl": 0.391357421875, "learning_rate": 3.6343080300413497e-07, "loss": -0.0407, "num_tokens": 16382579.0, "reward": 0.4641103148460388, "reward_std": 0.11252662539482117, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.41498368978500366, "rewards/logprob_reward/std": 0.2876684069633484, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 689.375, "completions/mean_terminated_length": 667.0667114257812, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.8549382716049383, "grad_norm": 1.7403724361355983, "kl": 0.38671875, "learning_rate": 3.629850629329124e-07, "loss": -0.272, "num_tokens": 16411603.0, "reward": 0.26402801275253296, "reward_std": 0.12539947032928467, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19961445033550262, "rewards/logprob_reward/std": 0.2470732480287552, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 699.3125, "completions/mean_terminated_length": 677.6666870117188, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 1.8580246913580247, "grad_norm": 1.4711702150042372, "kl": 0.402587890625, "learning_rate": 3.625388710387651e-07, "loss": -0.1805, "num_tokens": 16440381.0, "reward": 0.3804614841938019, "reward_std": 0.19540894031524658, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.33245718479156494, "rewards/logprob_reward/std": 0.3370988965034485, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 631.75, "completions/mean_terminated_length": 619.0967407226562, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 1.8611111111111112, "grad_norm": 1.6959910234480935, "kl": 0.408203125, "learning_rate": 3.6209222910599746e-07, "loss": -0.0593, "num_tokens": 16466765.0, "reward": 0.35366272926330566, "reward_std": 0.07661660015583038, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2887919545173645, "rewards/logprob_reward/std": 0.29857319593429565, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 725.5, "completions/mean_terminated_length": 670.2222290039062, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.8641975308641974, "grad_norm": 1.445296833144238, "kl": 0.41845703125, "learning_rate": 3.616451389207133e-07, "loss": -0.0243, "num_tokens": 16496985.0, "reward": 0.24427032470703125, "reward_std": 0.02894832193851471, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1811336874961853, "rewards/logprob_reward/std": 0.31449148058891296, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 667.5, "completions/mean_terminated_length": 643.7333374023438, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.867283950617284, "grad_norm": 1.6791410785324703, "kl": 0.454345703125, "learning_rate": 3.611976022708091e-07, "loss": -0.1866, "num_tokens": 16524901.0, "reward": 0.3625384569168091, "reward_std": 0.06658357381820679, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30907052755355835, "rewards/logprob_reward/std": 0.34040942788124084, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 651.84375, "completions/mean_terminated_length": 627.0333862304688, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 1.8703703703703702, "grad_norm": 1.867436549804989, "kl": 0.439453125, "learning_rate": 3.6074962094596676e-07, "loss": -0.0075, "num_tokens": 16552728.0, "reward": 0.3140837252140045, "reward_std": 0.05286619812250137, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.25523191690444946, "rewards/logprob_reward/std": 0.30942094326019287, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 717.84375, "completions/mean_terminated_length": 661.1481323242188, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.873456790123457, "grad_norm": 1.6318266441145477, "kl": 0.4072265625, "learning_rate": 3.603011967376464e-07, "loss": 0.0081, "num_tokens": 16582243.0, "reward": 0.29781463742256165, "reward_std": 0.21601226925849915, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.25104403495788574, "rewards/logprob_reward/std": 0.308644562959671, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 666.0625, "completions/mean_terminated_length": 642.2000122070312, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 1.876543209876543, "grad_norm": 1.7079646413559832, "kl": 0.4228515625, "learning_rate": 3.598523314390792e-07, "loss": -0.1844, "num_tokens": 16610125.0, "reward": 0.328095018863678, "reward_std": 0.13337793946266174, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27427223324775696, "rewards/logprob_reward/std": 0.3172014653682709, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 768.25, "completions/mean_terminated_length": 720.888916015625, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 1.8796296296296298, "grad_norm": 1.3580904972356276, "kl": 0.409423828125, "learning_rate": 3.594030268452601e-07, "loss": -0.2044, "num_tokens": 16641625.0, "reward": 0.28068241477012634, "reward_std": 0.16129527986049652, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.22853600978851318, "rewards/logprob_reward/std": 0.285929799079895, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 712.9375, "completions/mean_terminated_length": 712.9375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.882716049382716, "grad_norm": 1.5031200053518368, "kl": 0.377685546875, "learning_rate": 3.5895328475294106e-07, "loss": -0.0977, "num_tokens": 16670891.0, "reward": 0.38770195841789246, "reward_std": 0.05023252218961716, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.32661330699920654, "rewards/logprob_reward/std": 0.32525020837783813, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 640.34375, "completions/mean_terminated_length": 627.9677124023438, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.8858024691358026, "grad_norm": 1.509209902954745, "kl": 0.42236328125, "learning_rate": 3.585031069606234e-07, "loss": -0.0827, "num_tokens": 16697782.0, "reward": 0.48670294880867004, "reward_std": 0.11901184916496277, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4400866627693176, "rewards/logprob_reward/std": 0.3879680335521698, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 713.34375, "completions/mean_terminated_length": 681.2069091796875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 1.8888888888888888, "grad_norm": 1.5953935520457498, "kl": 0.402099609375, "learning_rate": 3.5805249526855074e-07, "loss": -0.1111, "num_tokens": 16727189.0, "reward": 0.3634263277053833, "reward_std": 0.18746894598007202, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.31352928280830383, "rewards/logprob_reward/std": 0.3375459909439087, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 747.65625, "completions/mean_terminated_length": 696.4815063476562, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.8919753086419753, "grad_norm": 1.7040316116624081, "kl": 0.419921875, "learning_rate": 3.5760145147870204e-07, "loss": -0.2003, "num_tokens": 16757614.0, "reward": 0.2180907428264618, "reward_std": 0.15260377526283264, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.16246193647384644, "rewards/logprob_reward/std": 0.2562755346298218, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 683.03125, "completions/mean_terminated_length": 683.03125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 1.8950617283950617, "grad_norm": 1.61082123222166, "kl": 0.404541015625, "learning_rate": 3.571499773947839e-07, "loss": -0.0269, "num_tokens": 16786363.0, "reward": 0.4214928448200226, "reward_std": 0.05608215183019638, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3606864809989929, "rewards/logprob_reward/std": 0.3399166166782379, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 622.0625, "completions/mean_terminated_length": 609.0967407226562, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 1.8981481481481481, "grad_norm": 1.7037159731014462, "kl": 0.42138671875, "learning_rate": 3.5669807482222395e-07, "loss": -0.2162, "num_tokens": 16812577.0, "reward": 0.31264930963516235, "reward_std": 0.11790737509727478, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2605825662612915, "rewards/logprob_reward/std": 0.2916097640991211, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 715.09375, "completions/mean_terminated_length": 694.5000610351562, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.9012345679012346, "grad_norm": 1.7567099471263434, "kl": 0.427978515625, "learning_rate": 3.562457455681633e-07, "loss": -0.0661, "num_tokens": 16841652.0, "reward": 0.40816718339920044, "reward_std": 0.08529245853424072, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3528246581554413, "rewards/logprob_reward/std": 0.3609466254711151, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 708.25, "completions/mean_terminated_length": 708.25, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 1.904320987654321, "grad_norm": 1.2552904139699854, "kl": 0.386962890625, "learning_rate": 3.557929914414491e-07, "loss": -0.2956, "num_tokens": 16870676.0, "reward": 0.441328227519989, "reward_std": 0.17677578330039978, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3966147303581238, "rewards/logprob_reward/std": 0.3519150912761688, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 727.34375, "completions/mean_terminated_length": 696.6551513671875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.9074074074074074, "grad_norm": 1.6055692411639337, "kl": 0.40869140625, "learning_rate": 3.553398142526277e-07, "loss": -0.1029, "num_tokens": 16900759.0, "reward": 0.3819138705730438, "reward_std": 0.1139083206653595, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.33407095074653625, "rewards/logprob_reward/std": 0.32453280687332153, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 712.1875, "completions/mean_terminated_length": 679.9310302734375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 1.9104938271604939, "grad_norm": 1.5654909598879456, "kl": 0.435546875, "learning_rate": 3.5488621581393736e-07, "loss": 0.0169, "num_tokens": 16930161.0, "reward": 0.222086101770401, "reward_std": 0.10166365653276443, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.14954011142253876, "rewards/logprob_reward/std": 0.27775970101356506, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 719.53125, "completions/mean_terminated_length": 676.0357666015625, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 1.9135802469135803, "grad_norm": 1.8212461285979684, "kl": 0.39453125, "learning_rate": 3.5443219793930073e-07, "loss": -0.1911, "num_tokens": 16959798.0, "reward": 0.21528016030788422, "reward_std": 0.07165077328681946, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.1628112941980362, "rewards/logprob_reward/std": 0.3085905611515045, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 668.0625, "completions/mean_terminated_length": 656.5806274414062, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 1.9166666666666665, "grad_norm": 1.5665491281191506, "kl": 0.430908203125, "learning_rate": 3.5397776244431794e-07, "loss": -0.1175, "num_tokens": 16987224.0, "reward": 0.42997777462005615, "reward_std": 0.07502011954784393, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37705862522125244, "rewards/logprob_reward/std": 0.32206177711486816, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 687.25, "completions/mean_terminated_length": 676.3870849609375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 1.9197530864197532, "grad_norm": 1.5285161209283842, "kl": 0.410888671875, "learning_rate": 3.535229111462589e-07, "loss": 0.0066, "num_tokens": 17015768.0, "reward": 0.36629563570022583, "reward_std": 0.10823097825050354, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.30282849073410034, "rewards/logprob_reward/std": 0.2638876438140869, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 757.21875, "completions/mean_terminated_length": 729.6206665039062, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 1.9228395061728394, "grad_norm": 1.646945755553067, "kl": 0.43359375, "learning_rate": 3.530676458640567e-07, "loss": -0.1216, "num_tokens": 17047039.0, "reward": 0.308902382850647, "reward_std": 0.14465618133544922, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2529470920562744, "rewards/logprob_reward/std": 0.2899276316165924, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 700.09375, "completions/mean_terminated_length": 640.1111450195312, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 1.925925925925926, "grad_norm": 1.6507273390779098, "kl": 0.42529296875, "learning_rate": 3.5261196841829957e-07, "loss": 0.0443, "num_tokens": 17075698.0, "reward": 0.35649365186691284, "reward_std": 0.0636821761727333, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30235403776168823, "rewards/logprob_reward/std": 0.316956490278244, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 755.75, "completions/mean_terminated_length": 737.86669921875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 1.9290123456790123, "grad_norm": 2.488766788631234, "kl": 0.38818359375, "learning_rate": 3.521558806312241e-07, "loss": -0.2705, "num_tokens": 17106518.0, "reward": 0.3861399292945862, "reward_std": 0.10409753024578094, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3387666344642639, "rewards/logprob_reward/std": 0.31598415970802307, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 714.28125, "completions/mean_terminated_length": 682.2413940429688, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.932098765432099, "grad_norm": 1.2939359683923317, "kl": 0.387451171875, "learning_rate": 3.5169938432670775e-07, "loss": -0.1012, "num_tokens": 17135867.0, "reward": 0.4845353364944458, "reward_std": 0.0558546707034111, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.44809484481811523, "rewards/logprob_reward/std": 0.38385310769081116, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 740.59375, "completions/mean_terminated_length": 721.7000122070312, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.9351851851851851, "grad_norm": 1.5928662751839284, "kl": 0.41064453125, "learning_rate": 3.5124248133026187e-07, "loss": -0.113, "num_tokens": 17166270.0, "reward": 0.4097488522529602, "reward_std": 0.11131132394075394, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3545820116996765, "rewards/logprob_reward/std": 0.2949684262275696, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 730.875, "completions/mean_terminated_length": 663.2307739257812, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 1.9382716049382716, "grad_norm": 1.7411199843830638, "kl": 0.42333984375, "learning_rate": 3.5078517346902384e-07, "loss": -0.1791, "num_tokens": 17196390.0, "reward": 0.15036368370056152, "reward_std": 0.10647177696228027, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.09415409713983536, "rewards/logprob_reward/std": 0.2291426658630371, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 675.25, "completions/mean_terminated_length": 664.0, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.941358024691358, "grad_norm": 1.671629386186832, "kl": 0.4169921875, "learning_rate": 3.503274625717504e-07, "loss": -0.0568, "num_tokens": 17224166.0, "reward": 0.3743082284927368, "reward_std": 0.022616667672991753, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.30825915932655334, "rewards/logprob_reward/std": 0.3393799364566803, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 723.78125, "completions/mean_terminated_length": 692.72412109375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.9444444444444444, "grad_norm": 1.5524210709320578, "kl": 0.47607421875, "learning_rate": 3.498693504688097e-07, "loss": -0.1391, "num_tokens": 17253755.0, "reward": 0.4750348925590515, "reward_std": 0.20986796915531158, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.437538743019104, "rewards/logprob_reward/std": 0.4012451767921448, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 661.34375, "completions/mean_terminated_length": 661.34375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 1.9475308641975309, "grad_norm": 1.469140796228788, "kl": 0.40576171875, "learning_rate": 3.494108389921744e-07, "loss": -0.2431, "num_tokens": 17281594.0, "reward": 0.4284975528717041, "reward_std": 0.11910226196050644, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3788861334323883, "rewards/logprob_reward/std": 0.35377436876296997, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 714.53125, "completions/mean_terminated_length": 682.5172119140625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.9506172839506173, "grad_norm": 1.6742798332834727, "kl": 0.385498046875, "learning_rate": 3.4895192997541436e-07, "loss": -0.0521, "num_tokens": 17311131.0, "reward": 0.37838172912597656, "reward_std": 0.09376947581768036, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3266741633415222, "rewards/logprob_reward/std": 0.31257009506225586, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 739.46875, "completions/mean_terminated_length": 730.290283203125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.9537037037037037, "grad_norm": 1.5913785460078629, "kl": 0.405517578125, "learning_rate": 3.484926252536891e-07, "loss": -0.0377, "num_tokens": 17340842.0, "reward": 0.3858599066734314, "reward_std": 0.09795581549406052, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3245665431022644, "rewards/logprob_reward/std": 0.2939569056034088, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 688.5625, "completions/mean_terminated_length": 688.5625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.9567901234567902, "grad_norm": 1.4847744446078994, "kl": 0.40576171875, "learning_rate": 3.4803292666374047e-07, "loss": -0.1154, "num_tokens": 17369160.0, "reward": 0.3202695846557617, "reward_std": 0.09746810793876648, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.25168848037719727, "rewards/logprob_reward/std": 0.2644346356391907, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 688.59375, "completions/mean_terminated_length": 666.2333984375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.9598765432098766, "grad_norm": 1.5538500927510028, "kl": 0.44189453125, "learning_rate": 3.4757283604388546e-07, "loss": -0.0114, "num_tokens": 17397447.0, "reward": 0.42439788579940796, "reward_std": 0.1275281310081482, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3708587884902954, "rewards/logprob_reward/std": 0.33271247148513794, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 677.65625, "completions/mean_terminated_length": 677.65625, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 1.9629629629629628, "grad_norm": 1.4568973311873756, "kl": 0.43212890625, "learning_rate": 3.47112355234009e-07, "loss": -0.2087, "num_tokens": 17425684.0, "reward": 0.40236568450927734, "reward_std": 0.0919317975640297, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3463785648345947, "rewards/logprob_reward/std": 0.3311493992805481, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 706.34375, "completions/mean_terminated_length": 696.0967407226562, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 1.9660493827160495, "grad_norm": 1.6669922541126043, "kl": 0.419921875, "learning_rate": 3.466514860755559e-07, "loss": -0.0339, "num_tokens": 17454647.0, "reward": 0.40241318941116333, "reward_std": 0.03452123701572418, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.34295913577079773, "rewards/logprob_reward/std": 0.29374033212661743, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 656.71875, "completions/mean_terminated_length": 618.72412109375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 1.9691358024691357, "grad_norm": 1.7307244291886672, "kl": 0.412353515625, "learning_rate": 3.4619023041152433e-07, "loss": -0.0477, "num_tokens": 17482062.0, "reward": 0.38481998443603516, "reward_std": 0.16833657026290894, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33035555481910706, "rewards/logprob_reward/std": 0.2982006072998047, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 692.40625, "completions/mean_terminated_length": 681.7096557617188, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.9722222222222223, "grad_norm": 1.7225608312315646, "kl": 0.401123046875, "learning_rate": 3.4572859008645796e-07, "loss": -0.2253, "num_tokens": 17511095.0, "reward": 0.274787962436676, "reward_std": 0.1444133222103119, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.21156997978687286, "rewards/logprob_reward/std": 0.31099650263786316, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 675.875, "completions/mean_terminated_length": 664.6451416015625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 1.9753086419753085, "grad_norm": 1.5727800985461269, "kl": 0.419921875, "learning_rate": 3.452665669464386e-07, "loss": -0.0515, "num_tokens": 17539947.0, "reward": 0.49371129274368286, "reward_std": 0.1423429399728775, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.45134586095809937, "rewards/logprob_reward/std": 0.32636651396751404, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 621.1875, "completions/mean_terminated_length": 621.1875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.9783950617283952, "grad_norm": 1.6631957692238406, "kl": 0.43798828125, "learning_rate": 3.448041628390791e-07, "loss": -0.1168, "num_tokens": 17565941.0, "reward": 0.47981545329093933, "reward_std": 0.052941858768463135, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4289616346359253, "rewards/logprob_reward/std": 0.3563511073589325, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 696.875, "completions/mean_terminated_length": 675.0667114257812, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.9814814814814814, "grad_norm": 1.6121961564139136, "kl": 0.400390625, "learning_rate": 3.443413796135159e-07, "loss": -0.0543, "num_tokens": 17594561.0, "reward": 0.49061983823776245, "reward_std": 0.07868089526891708, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4444386959075928, "rewards/logprob_reward/std": 0.32546278834342957, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 705.71875, "completions/mean_terminated_length": 684.5000610351562, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 1.984567901234568, "grad_norm": 1.5199746595446675, "kl": 0.381591796875, "learning_rate": 3.4387821912040116e-07, "loss": -0.1474, "num_tokens": 17623668.0, "reward": 0.37347203493118286, "reward_std": 0.15393918752670288, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.32469117641448975, "rewards/logprob_reward/std": 0.28643256425857544, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 751.59375, "completions/mean_terminated_length": 688.7307739257812, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 1.9876543209876543, "grad_norm": 1.5425262138216926, "kl": 0.3857421875, "learning_rate": 3.4341468321189574e-07, "loss": -0.0661, "num_tokens": 17654691.0, "reward": 0.3747953772544861, "reward_std": 0.15010225772857666, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.33657824993133545, "rewards/logprob_reward/std": 0.3051075041294098, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 674.09375, "completions/mean_terminated_length": 674.09375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 1.9907407407407407, "grad_norm": 1.5730058594900933, "kl": 0.3994140625, "learning_rate": 3.4295077374166214e-07, "loss": -0.0657, "num_tokens": 17682402.0, "reward": 0.4000759720802307, "reward_std": 0.028739571571350098, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.33688995242118835, "rewards/logprob_reward/std": 0.29831716418266296, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 704.8125, "completions/mean_terminated_length": 683.5333862304688, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 1.9938271604938271, "grad_norm": 1.6011134932721804, "kl": 0.4130859375, "learning_rate": 3.4248649256485655e-07, "loss": -0.0792, "num_tokens": 17711204.0, "reward": 0.4676671028137207, "reward_std": 0.1272086501121521, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4189356565475464, "rewards/logprob_reward/std": 0.3434039056301117, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 764.1875, "completions/mean_terminated_length": 755.8064575195312, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.9969135802469136, "grad_norm": 1.5862235515375993, "kl": 0.362060546875, "learning_rate": 3.4202184153812135e-07, "loss": -0.1481, "num_tokens": 17742546.0, "reward": 0.4220771789550781, "reward_std": 0.20551294088363647, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.37175241112709045, "rewards/logprob_reward/std": 0.34722012281417847, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 676.625, "completions/mean_terminated_length": 653.4666748046875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 2.0, "grad_norm": 1.7241248737569652, "kl": 0.440185546875, "learning_rate": 3.415568225195783e-07, "loss": -0.0864, "num_tokens": 17770350.0, "reward": 0.4448099136352539, "reward_std": 0.17245569825172424, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.39701104164123535, "rewards/logprob_reward/std": 0.34811583161354065, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 628.59375, "completions/mean_terminated_length": 628.59375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 2.003086419753086, "grad_norm": 1.61662043010161, "kl": 0.434814453125, "learning_rate": 3.410914373688205e-07, "loss": -0.0778, "num_tokens": 17796873.0, "reward": 0.353081613779068, "reward_std": 0.023739302530884743, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.2846740186214447, "rewards/logprob_reward/std": 0.2955014109611511, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 645.3125, "completions/mean_terminated_length": 645.3125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.006172839506173, "grad_norm": 1.4601279371444804, "kl": 0.428955078125, "learning_rate": 3.4062568794690536e-07, "loss": -0.1458, "num_tokens": 17823739.0, "reward": 0.35317370295524597, "reward_std": 0.10584400594234467, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29172080755233765, "rewards/logprob_reward/std": 0.2908954620361328, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 681.65625, "completions/mean_terminated_length": 670.6128540039062, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 2.009259259259259, "grad_norm": 1.4456386183339884, "kl": 0.4091796875, "learning_rate": 3.401595761163468e-07, "loss": -0.1774, "num_tokens": 17851992.0, "reward": 0.2858881950378418, "reward_std": 0.07210725545883179, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2239035665988922, "rewards/logprob_reward/std": 0.310941606760025, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 707.34375, "completions/mean_terminated_length": 674.586181640625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.0123456790123457, "grad_norm": 1.717470719466214, "kl": 0.422119140625, "learning_rate": 3.3969310374110817e-07, "loss": -0.0284, "num_tokens": 17880975.0, "reward": 0.3305037319660187, "reward_std": 0.1170731708407402, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.27347636222839355, "rewards/logprob_reward/std": 0.2927412688732147, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 628.3125, "completions/mean_terminated_length": 615.54833984375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 2.015432098765432, "grad_norm": 1.575208513580022, "kl": 0.42333984375, "learning_rate": 3.3922627268659467e-07, "loss": -0.13, "num_tokens": 17907197.0, "reward": 0.47305139899253845, "reward_std": 0.09180600941181183, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.42491820454597473, "rewards/logprob_reward/std": 0.32549819350242615, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 694.21875, "completions/mean_terminated_length": 672.2333984375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.0185185185185186, "grad_norm": 1.7202092213698568, "kl": 0.4033203125, "learning_rate": 3.387590848196456e-07, "loss": -0.0197, "num_tokens": 17936012.0, "reward": 0.32415512204170227, "reward_std": 0.04510686919093132, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2594779133796692, "rewards/logprob_reward/std": 0.3060656189918518, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 723.9375, "completions/mean_terminated_length": 681.0714721679688, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 2.021604938271605, "grad_norm": 1.7029074024591004, "kl": 0.3955078125, "learning_rate": 3.382915420085274e-07, "loss": -0.1631, "num_tokens": 17966234.0, "reward": 0.4130585491657257, "reward_std": 0.17552123963832855, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.37562060356140137, "rewards/logprob_reward/std": 0.33336058259010315, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 743.65625, "completions/mean_terminated_length": 703.607177734375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 2.0246913580246915, "grad_norm": 1.7389766065522478, "kl": 0.418701171875, "learning_rate": 3.3782364612292574e-07, "loss": 0.0383, "num_tokens": 17997323.0, "reward": 0.39100077748298645, "reward_std": 0.1255885809659958, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.340695321559906, "rewards/logprob_reward/std": 0.28662949800491333, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 680.46875, "completions/mean_terminated_length": 669.3870849609375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.0277777777777777, "grad_norm": 1.6995454710465838, "kl": 0.406982421875, "learning_rate": 3.3735539903393826e-07, "loss": -0.0301, "num_tokens": 18025366.0, "reward": 0.47378528118133545, "reward_std": 0.09603683650493622, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42226144671440125, "rewards/logprob_reward/std": 0.38681167364120483, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 697.90625, "completions/mean_terminated_length": 676.1666870117188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.0308641975308643, "grad_norm": 1.597415360339783, "kl": 0.42041015625, "learning_rate": 3.368868026140672e-07, "loss": -0.0381, "num_tokens": 18054215.0, "reward": 0.32563379406929016, "reward_std": 0.059159185737371445, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26112088561058044, "rewards/logprob_reward/std": 0.29317209124565125, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 701.125, "completions/mean_terminated_length": 679.6000366210938, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 2.0339506172839505, "grad_norm": 1.6953223397397352, "kl": 0.408935546875, "learning_rate": 3.364178587372115e-07, "loss": 0.0171, "num_tokens": 18083439.0, "reward": 0.43783941864967346, "reward_std": 0.04812919348478317, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3823215961456299, "rewards/logprob_reward/std": 0.30860188603401184, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 753.0625, "completions/mean_terminated_length": 725.0344848632812, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.037037037037037, "grad_norm": 1.5856199935998507, "kl": 0.41943359375, "learning_rate": 3.359485692786597e-07, "loss": -0.1399, "num_tokens": 18114469.0, "reward": 0.27857884764671326, "reward_std": 0.06401679664850235, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.22272652387619019, "rewards/logprob_reward/std": 0.30108642578125, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 733.46875, "completions/mean_terminated_length": 724.0967407226562, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 2.0401234567901234, "grad_norm": 1.513572610219854, "kl": 0.413330078125, "learning_rate": 3.354789361150824e-07, "loss": -0.0671, "num_tokens": 18145012.0, "reward": 0.3990068733692169, "reward_std": 0.03926682472229004, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3391742706298828, "rewards/logprob_reward/std": 0.3591441810131073, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 707.0, "completions/mean_terminated_length": 685.86669921875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 2.04320987654321, "grad_norm": 1.7429485221492174, "kl": 0.403564453125, "learning_rate": 3.350089611245246e-07, "loss": 0.017, "num_tokens": 18174144.0, "reward": 0.5321580171585083, "reward_std": 0.07858886569738388, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.48712003231048584, "rewards/logprob_reward/std": 0.3105562925338745, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 622.75, "completions/mean_terminated_length": 622.75, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 2.0462962962962963, "grad_norm": 1.5749655458517173, "kl": 0.498291015625, "learning_rate": 3.345386461863981e-07, "loss": -0.0196, "num_tokens": 18200428.0, "reward": 0.43447086215019226, "reward_std": 0.09284544736146927, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.37857872247695923, "rewards/logprob_reward/std": 0.30828389525413513, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 732.84375, "completions/mean_terminated_length": 651.3200073242188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.049382716049383, "grad_norm": 1.5500777271050838, "kl": 0.393310546875, "learning_rate": 3.340679931814743e-07, "loss": -0.0057, "num_tokens": 18230315.0, "reward": 0.3458241820335388, "reward_std": 0.13237610459327698, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.30091574788093567, "rewards/logprob_reward/std": 0.31251972913742065, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 689.59375, "completions/mean_terminated_length": 667.300048828125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 2.052469135802469, "grad_norm": 1.7777934577474725, "kl": 0.4365234375, "learning_rate": 3.3359700399187654e-07, "loss": -0.2112, "num_tokens": 18258798.0, "reward": 0.2823296785354614, "reward_std": 0.10608969628810883, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2199496328830719, "rewards/logprob_reward/std": 0.2263510823249817, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 643.4375, "completions/mean_terminated_length": 643.4375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.0555555555555554, "grad_norm": 1.9268564358351272, "kl": 0.443603515625, "learning_rate": 3.331256805010724e-07, "loss": -0.2094, "num_tokens": 18285772.0, "reward": 0.2640511095523834, "reward_std": 0.04838243126869202, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.19269567728042603, "rewards/logprob_reward/std": 0.2629052698612213, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 684.3125, "completions/mean_terminated_length": 649.1724243164062, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.058641975308642, "grad_norm": 1.790471330390895, "kl": 0.4375, "learning_rate": 3.326540245938666e-07, "loss": -0.1889, "num_tokens": 18313754.0, "reward": 0.2291223257780075, "reward_std": 0.1321234554052353, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16430258750915527, "rewards/logprob_reward/std": 0.2753356993198395, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 713.03125, "completions/mean_terminated_length": 703.0, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 2.0617283950617282, "grad_norm": 1.8445267778178804, "kl": 0.44287109375, "learning_rate": 3.3218203815639265e-07, "loss": 0.0343, "num_tokens": 18343163.0, "reward": 0.48886197805404663, "reward_std": 0.06746520847082138, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4355410635471344, "rewards/logprob_reward/std": 0.2983049750328064, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 695.15625, "completions/mean_terminated_length": 661.137939453125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.064814814814815, "grad_norm": 1.639295665571653, "kl": 0.435791015625, "learning_rate": 3.3170972307610654e-07, "loss": -0.0425, "num_tokens": 18372316.0, "reward": 0.38681337237358093, "reward_std": 0.1464131772518158, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3325704336166382, "rewards/logprob_reward/std": 0.34786027669906616, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 697.0, "completions/mean_terminated_length": 686.4515991210938, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 2.067901234567901, "grad_norm": 1.7957042282451952, "kl": 0.42626953125, "learning_rate": 3.312370812417779e-07, "loss": -0.1944, "num_tokens": 18400688.0, "reward": 0.28181129693984985, "reward_std": 0.06732695549726486, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2159014642238617, "rewards/logprob_reward/std": 0.2937818765640259, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 652.2257690429688, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 2.0709876543209877, "grad_norm": 1.6117496071808766, "kl": 0.429443359375, "learning_rate": 3.3076411454348336e-07, "loss": -0.0376, "num_tokens": 18428051.0, "reward": 0.5843040347099304, "reward_std": 0.11490219831466675, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5450600385665894, "rewards/logprob_reward/std": 0.31851083040237427, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 651.46875, "completions/mean_terminated_length": 639.4515991210938, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.074074074074074, "grad_norm": 1.5888208272974447, "kl": 0.4189453125, "learning_rate": 3.3029082487259847e-07, "loss": -0.0749, "num_tokens": 18454802.0, "reward": 0.29231715202331543, "reward_std": 0.06631557643413544, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22757463157176971, "rewards/logprob_reward/std": 0.3025350272655487, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 708.5, "completions/mean_terminated_length": 708.5, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.0771604938271606, "grad_norm": 1.515685882192776, "kl": 0.413818359375, "learning_rate": 3.298172141217905e-07, "loss": 0.0389, "num_tokens": 18484302.0, "reward": 0.30090922117233276, "reward_std": 0.006545512471348047, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2232324779033661, "rewards/logprob_reward/std": 0.3084871470928192, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 692.3125, "completions/mean_terminated_length": 670.2000122070312, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 2.080246913580247, "grad_norm": 1.5510080299151163, "kl": 0.411376953125, "learning_rate": 3.2934328418501064e-07, "loss": -0.0251, "num_tokens": 18512852.0, "reward": 0.35732540488243103, "reward_std": 0.11991064995527267, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29633378982543945, "rewards/logprob_reward/std": 0.3101804256439209, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 678.28125, "completions/mean_terminated_length": 667.1290283203125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.0833333333333335, "grad_norm": 2.0438561154435098, "kl": 0.423583984375, "learning_rate": 3.2886903695748647e-07, "loss": -0.1691, "num_tokens": 18541721.0, "reward": 0.32805967330932617, "reward_std": 0.04710330069065094, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26728856563568115, "rewards/logprob_reward/std": 0.31644710898399353, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 700.90625, "completions/mean_terminated_length": 654.75, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 2.0864197530864197, "grad_norm": 1.877137663136837, "kl": 0.47998046875, "learning_rate": 3.2839447433571454e-07, "loss": -0.0537, "num_tokens": 18570490.0, "reward": 0.3565995693206787, "reward_std": 0.15624743700027466, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30247175693511963, "rewards/logprob_reward/std": 0.30927252769470215, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 625.03125, "completions/mean_terminated_length": 598.433349609375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 2.0895061728395063, "grad_norm": 1.7785261270889097, "kl": 0.4423828125, "learning_rate": 3.279195982174524e-07, "loss": 0.0258, "num_tokens": 18596479.0, "reward": 0.4768751859664917, "reward_std": 0.11921582370996475, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42569464445114136, "rewards/logprob_reward/std": 0.265025794506073, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 648.09375, "completions/mean_terminated_length": 648.09375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.0925925925925926, "grad_norm": 1.90219772512068, "kl": 0.4248046875, "learning_rate": 3.2744441050171136e-07, "loss": -0.0293, "num_tokens": 18623726.0, "reward": 0.5335157513618469, "reward_std": 0.023804698139429092, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.481684148311615, "rewards/logprob_reward/std": 0.3298651874065399, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 676.1875, "completions/mean_terminated_length": 676.1875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.095679012345679, "grad_norm": 1.7532479494639879, "kl": 0.436279296875, "learning_rate": 3.26968913088749e-07, "loss": -0.0179, "num_tokens": 18651700.0, "reward": 0.4315306842327118, "reward_std": 0.02383863553404808, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.36836743354797363, "rewards/logprob_reward/std": 0.30874764919281006, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 661.03125, "completions/mean_terminated_length": 649.3225708007812, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 2.0987654320987654, "grad_norm": 1.6319624279692144, "kl": 0.429931640625, "learning_rate": 3.264931078800611e-07, "loss": -0.0246, "num_tokens": 18679397.0, "reward": 0.5309042930603027, "reward_std": 0.084602952003479, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4857270121574402, "rewards/logprob_reward/std": 0.32934531569480896, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 670.8125, "completions/mean_terminated_length": 670.8125, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 2.1018518518518516, "grad_norm": 1.6673088182008717, "kl": 0.415771484375, "learning_rate": 3.260169967783744e-07, "loss": 0.0202, "num_tokens": 18707635.0, "reward": 0.6024180054664612, "reward_std": 0.04187729209661484, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.5582422018051147, "rewards/logprob_reward/std": 0.3019714951515198, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 732.34375, "completions/mean_terminated_length": 722.9354858398438, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 2.1049382716049383, "grad_norm": 1.5223196672300547, "kl": 0.403564453125, "learning_rate": 3.255405816876389e-07, "loss": -0.1496, "num_tokens": 18737790.0, "reward": 0.5536549687385559, "reward_std": 0.13116586208343506, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5144777297973633, "rewards/logprob_reward/std": 0.3969856798648834, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 736.8125, "completions/mean_terminated_length": 727.54833984375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 2.1080246913580245, "grad_norm": 1.7268971477216148, "kl": 0.428466796875, "learning_rate": 3.250638645130204e-07, "loss": 0.0117, "num_tokens": 18768196.0, "reward": 0.5080626606941223, "reward_std": 0.04626753181219101, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.456875205039978, "rewards/logprob_reward/std": 0.36775878071784973, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 684.96875, "completions/mean_terminated_length": 622.1851806640625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.111111111111111, "grad_norm": 1.7913106292179288, "kl": 0.411865234375, "learning_rate": 3.2458684716089224e-07, "loss": -0.0347, "num_tokens": 18796659.0, "reward": 0.36706626415252686, "reward_std": 0.17104464769363403, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3245181143283844, "rewards/logprob_reward/std": 0.35194718837738037, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 778.34375, "completions/mean_terminated_length": 743.2500610351562, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.1141975308641974, "grad_norm": 1.721146333334945, "kl": 0.400390625, "learning_rate": 3.241095315388287e-07, "loss": 0.0997, "num_tokens": 18828306.0, "reward": 0.2593773603439331, "reward_std": 0.13359405100345612, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.1909748762845993, "rewards/logprob_reward/std": 0.24035049974918365, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 692.46875, "completions/mean_terminated_length": 670.36669921875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.117283950617284, "grad_norm": 1.652608627421647, "kl": 0.448486328125, "learning_rate": 3.2363191955559656e-07, "loss": -0.0524, "num_tokens": 18856969.0, "reward": 0.30021947622299194, "reward_std": 0.044019218534231186, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23288275301456451, "rewards/logprob_reward/std": 0.30290091037750244, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 667.625, "completions/mean_terminated_length": 630.7586059570312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.1203703703703702, "grad_norm": 1.4983619827419203, "kl": 0.427978515625, "learning_rate": 3.231540131211478e-07, "loss": -0.0349, "num_tokens": 18884849.0, "reward": 0.39059650897979736, "reward_std": 0.14267951250076294, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3402461111545563, "rewards/logprob_reward/std": 0.31639257073402405, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 698.40625, "completions/mean_terminated_length": 676.7000122070312, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 2.123456790123457, "grad_norm": 1.488289577466255, "kl": 0.432861328125, "learning_rate": 3.22675814146612e-07, "loss": -0.1529, "num_tokens": 18913938.0, "reward": 0.49786239862442017, "reward_std": 0.09822002053260803, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4524860084056854, "rewards/logprob_reward/std": 0.3491072952747345, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 653.40625, "completions/mean_terminated_length": 641.4515991210938, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 2.126543209876543, "grad_norm": 1.6089192538050028, "kl": 0.474365234375, "learning_rate": 3.221973245442883e-07, "loss": -0.0789, "num_tokens": 18940971.0, "reward": 0.5680996179580688, "reward_std": 0.14561082422733307, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5270551443099976, "rewards/logprob_reward/std": 0.36380481719970703, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 725.40625, "completions/mean_terminated_length": 705.5000610351562, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 2.1296296296296298, "grad_norm": 1.5415310279225316, "kl": 0.407958984375, "learning_rate": 3.217185462276382e-07, "loss": -0.0968, "num_tokens": 18970116.0, "reward": 0.3887181878089905, "reward_std": 0.0984717309474945, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33121466636657715, "rewards/logprob_reward/std": 0.3047836124897003, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 726.875, "completions/mean_terminated_length": 684.4285888671875, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.132716049382716, "grad_norm": 1.5884639033220118, "kl": 0.40966796875, "learning_rate": 3.2123948111127795e-07, "loss": 0.0245, "num_tokens": 19000020.0, "reward": 0.2681761384010315, "reward_std": 0.12001194804906845, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20422351360321045, "rewards/logprob_reward/std": 0.23547105491161346, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 741.25, "completions/mean_terminated_length": 700.857177734375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 2.1358024691358026, "grad_norm": 1.6334821670790958, "kl": 0.406005859375, "learning_rate": 3.2076013111097055e-07, "loss": 0.0773, "num_tokens": 19030264.0, "reward": 0.3188161253929138, "reward_std": 0.11487194895744324, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.25701791048049927, "rewards/logprob_reward/std": 0.3238793611526489, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 677.6875, "completions/mean_terminated_length": 641.862060546875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 2.138888888888889, "grad_norm": 1.9245198929084661, "kl": 0.430419921875, "learning_rate": 3.20280498143618e-07, "loss": 0.0221, "num_tokens": 19058386.0, "reward": 0.5147184133529663, "reward_std": 0.1694546639919281, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4746870994567871, "rewards/logprob_reward/std": 0.3264382481575012, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 665.9375, "completions/mean_terminated_length": 642.0667114257812, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 2.1419753086419755, "grad_norm": 1.6492470919561222, "kl": 0.45703125, "learning_rate": 3.1980058412725436e-07, "loss": -0.0628, "num_tokens": 19086100.0, "reward": 0.3934106230735779, "reward_std": 0.16372595727443695, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3468451499938965, "rewards/logprob_reward/std": 0.3585425317287445, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 677.65625, "completions/mean_terminated_length": 666.4838256835938, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 2.1450617283950617, "grad_norm": 1.4998664141749625, "kl": 0.431396484375, "learning_rate": 3.1932039098103723e-07, "loss": -0.0425, "num_tokens": 19114173.0, "reward": 0.44240695238113403, "reward_std": 0.071635402739048, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.38739660382270813, "rewards/logprob_reward/std": 0.3239496350288391, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 665.1875, "completions/mean_terminated_length": 653.6128540039062, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 2.148148148148148, "grad_norm": 1.85740157061537, "kl": 0.450439453125, "learning_rate": 3.188399206252406e-07, "loss": -0.0544, "num_tokens": 19141447.0, "reward": 0.2783058285713196, "reward_std": 0.07606051117181778, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.20158982276916504, "rewards/logprob_reward/std": 0.2852879762649536, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 702.125, "completions/mean_terminated_length": 691.741943359375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 2.1512345679012346, "grad_norm": 1.6743956118043763, "kl": 0.429443359375, "learning_rate": 3.183591749812468e-07, "loss": -0.0609, "num_tokens": 19170727.0, "reward": 0.367331862449646, "reward_std": 0.08178030699491501, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3039798140525818, "rewards/logprob_reward/std": 0.32305416464805603, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 661.34375, "completions/mean_terminated_length": 661.34375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 2.154320987654321, "grad_norm": 1.7072377032259252, "kl": 0.461669921875, "learning_rate": 3.1787815597153934e-07, "loss": -0.1026, "num_tokens": 19198542.0, "reward": 0.4060082733631134, "reward_std": 0.04487808048725128, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3434814214706421, "rewards/logprob_reward/std": 0.3525448441505432, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 735.59375, "completions/mean_terminated_length": 716.36669921875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 2.1574074074074074, "grad_norm": 1.8465855217718612, "kl": 0.425537109375, "learning_rate": 3.173968655196947e-07, "loss": 0.0222, "num_tokens": 19229321.0, "reward": 0.6045317649841309, "reward_std": 0.08130021393299103, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.571007490158081, "rewards/logprob_reward/std": 0.37173011898994446, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 686.5, "completions/mean_terminated_length": 651.586181640625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.1604938271604937, "grad_norm": 1.5766548183657474, "kl": 0.447265625, "learning_rate": 3.1691530555037493e-07, "loss": -0.1764, "num_tokens": 19257937.0, "reward": 0.29397067427635193, "reward_std": 0.17047765851020813, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.23635631799697876, "rewards/logprob_reward/std": 0.286420077085495, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 675.125, "completions/mean_terminated_length": 651.86669921875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.1635802469135803, "grad_norm": 1.634907001952234, "kl": 0.43701171875, "learning_rate": 3.164334779893198e-07, "loss": -0.0509, "num_tokens": 19285421.0, "reward": 0.33253270387649536, "reward_std": 0.07230189442634583, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26878637075424194, "rewards/logprob_reward/std": 0.2841111123561859, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 679.4375, "completions/mean_terminated_length": 668.3225708007812, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 2.1666666666666665, "grad_norm": 1.4723767872570386, "kl": 0.41845703125, "learning_rate": 3.159513847633393e-07, "loss": -0.0146, "num_tokens": 19313487.0, "reward": 0.3582479953765869, "reward_std": 0.05051340162754059, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2938866913318634, "rewards/logprob_reward/std": 0.3051302134990692, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 685.09375, "completions/mean_terminated_length": 674.1612548828125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.169753086419753, "grad_norm": 1.6171449511739118, "kl": 0.434326171875, "learning_rate": 3.1546902780030555e-07, "loss": 0.0136, "num_tokens": 19342182.0, "reward": 0.5982824563980103, "reward_std": 0.08891736716032028, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5605915784835815, "rewards/logprob_reward/std": 0.32951870560646057, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 702.59375, "completions/mean_terminated_length": 669.3448486328125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 2.1728395061728394, "grad_norm": 1.5333968149197632, "kl": 0.42333984375, "learning_rate": 3.1498640902914565e-07, "loss": -0.1376, "num_tokens": 19371801.0, "reward": 0.4447559714317322, "reward_std": 0.2036588490009308, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4038954973220825, "rewards/logprob_reward/std": 0.3297498822212219, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 711.46875, "completions/mean_terminated_length": 711.46875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 2.175925925925926, "grad_norm": 1.5649541399946945, "kl": 0.42822265625, "learning_rate": 3.1450353037983346e-07, "loss": -0.0834, "num_tokens": 19401292.0, "reward": 0.5195739269256592, "reward_std": 0.05239851400256157, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4696654975414276, "rewards/logprob_reward/std": 0.31505122780799866, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 725.46875, "completions/mean_terminated_length": 715.8386840820312, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.1790123456790123, "grad_norm": 1.5338510497829594, "kl": 0.42041015625, "learning_rate": 3.140203937833821e-07, "loss": -0.057, "num_tokens": 19431203.0, "reward": 0.43541914224624634, "reward_std": 0.059950731694698334, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3796323835849762, "rewards/logprob_reward/std": 0.34962889552116394, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 708.53125, "completions/mean_terminated_length": 698.3547973632812, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.182098765432099, "grad_norm": 1.574872548137903, "kl": 0.468017578125, "learning_rate": 3.135370011718364e-07, "loss": -0.1214, "num_tokens": 19460520.0, "reward": 0.3524753749370575, "reward_std": 0.1239517405629158, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2909448742866516, "rewards/logprob_reward/std": 0.3338848054409027, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 715.34375, "completions/mean_terminated_length": 705.3870849609375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.185185185185185, "grad_norm": 1.6401974064265137, "kl": 0.4306640625, "learning_rate": 3.1305335447826477e-07, "loss": 0.0066, "num_tokens": 19489983.0, "reward": 0.5632875561714172, "reward_std": 0.07336528599262238, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.5182361602783203, "rewards/logprob_reward/std": 0.2729204297065735, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 653.4285888671875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 2.1882716049382718, "grad_norm": 1.539901308068389, "kl": 0.440673828125, "learning_rate": 3.125694556367517e-07, "loss": -0.0765, "num_tokens": 19518783.0, "reward": 0.2775593400001526, "reward_std": 0.07794313877820969, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2146492898464203, "rewards/logprob_reward/std": 0.30969664454460144, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 712.34375, "completions/mean_terminated_length": 667.8214721679688, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 2.191358024691358, "grad_norm": 1.5596878339666762, "kl": 0.43994140625, "learning_rate": 3.1208530658239e-07, "loss": -0.0044, "num_tokens": 19547630.0, "reward": 0.23448556661605835, "reward_std": 0.062075793743133545, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.16331730782985687, "rewards/logprob_reward/std": 0.2702328860759735, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 705.1875, "completions/mean_terminated_length": 672.2069091796875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 2.1944444444444446, "grad_norm": 1.5391815296789937, "kl": 0.45361328125, "learning_rate": 3.1160090925127325e-07, "loss": -0.1736, "num_tokens": 19576772.0, "reward": 0.28228211402893066, "reward_std": 0.15687409043312073, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22336898744106293, "rewards/logprob_reward/std": 0.23918309807777405, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 645.0625, "completions/mean_terminated_length": 645.0625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.197530864197531, "grad_norm": 1.65998197831879, "kl": 0.470458984375, "learning_rate": 3.1111626558048777e-07, "loss": -0.052, "num_tokens": 19603406.0, "reward": 0.3548557460308075, "reward_std": 0.051680706441402435, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.28664523363113403, "rewards/logprob_reward/std": 0.28751155734062195, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 683.90625, "completions/mean_terminated_length": 683.90625, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 2.200617283950617, "grad_norm": 1.5676272325970158, "kl": 0.537841796875, "learning_rate": 3.1063137750810493e-07, "loss": -0.0873, "num_tokens": 19631459.0, "reward": 0.34679698944091797, "reward_std": 0.03523360937833786, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2811633050441742, "rewards/logprob_reward/std": 0.3385540843009949, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 721.59375, "completions/mean_terminated_length": 711.8386840820312, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 2.2037037037037037, "grad_norm": 1.5316897220426338, "kl": 0.427734375, "learning_rate": 3.101462469731735e-07, "loss": -0.2236, "num_tokens": 19661326.0, "reward": 0.37905171513557434, "reward_std": 0.08628718554973602, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.32394635677337646, "rewards/logprob_reward/std": 0.3218473196029663, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 747.46875, "completions/mean_terminated_length": 729.0333862304688, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 2.20679012345679, "grad_norm": 1.6560043239151687, "kl": 0.41064453125, "learning_rate": 3.0966087591571184e-07, "loss": 0.0036, "num_tokens": 19691885.0, "reward": 0.4851224422454834, "reward_std": 0.12242323160171509, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4383305311203003, "rewards/logprob_reward/std": 0.3416211009025574, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 639.8333740234375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 2.2098765432098766, "grad_norm": 1.6872369467346524, "kl": 0.458740234375, "learning_rate": 3.091752662767001e-07, "loss": -0.1068, "num_tokens": 19719176.0, "reward": 0.4085639715194702, "reward_std": 0.11968888342380524, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35673773288726807, "rewards/logprob_reward/std": 0.3456181585788727, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 783.59375, "completions/mean_terminated_length": 739.0740966796875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 2.212962962962963, "grad_norm": 1.4829269376902834, "kl": 0.42431640625, "learning_rate": 3.0868941999807274e-07, "loss": -0.0582, "num_tokens": 19751271.0, "reward": 0.33875468373298645, "reward_std": 0.13463005423545837, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2861163318157196, "rewards/logprob_reward/std": 0.3661433160305023, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 706.28125, "completions/mean_terminated_length": 685.1000366210938, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 2.2160493827160495, "grad_norm": 1.6183725859205436, "kl": 0.485107421875, "learning_rate": 3.082033390227102e-07, "loss": -0.0081, "num_tokens": 19780280.0, "reward": 0.298360139131546, "reward_std": 0.053863734006881714, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23081684112548828, "rewards/logprob_reward/std": 0.3114362359046936, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 682.25, "completions/mean_terminated_length": 659.4666748046875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 2.2191358024691357, "grad_norm": 1.7659844125840582, "kl": 0.475830078125, "learning_rate": 3.0771702529443163e-07, "loss": 0.0516, "num_tokens": 19808428.0, "reward": 0.3636745810508728, "reward_std": 0.04236737638711929, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.29991614818573, "rewards/logprob_reward/std": 0.31050071120262146, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 689.03125, "completions/mean_terminated_length": 641.1785888671875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 2.2222222222222223, "grad_norm": 1.5011720325900322, "kl": 0.431396484375, "learning_rate": 3.0723048075798694e-07, "loss": -0.0817, "num_tokens": 19837157.0, "reward": 0.34148484468460083, "reward_std": 0.1513044536113739, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.29262202978134155, "rewards/logprob_reward/std": 0.3316202163696289, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 675.40625, "completions/mean_terminated_length": 664.1612548828125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 2.2253086419753085, "grad_norm": 1.6512675106059949, "kl": 0.473388671875, "learning_rate": 3.0674370735904917e-07, "loss": -0.049, "num_tokens": 19865070.0, "reward": 0.2514469027519226, "reward_std": 0.07443380355834961, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.17521876096725464, "rewards/logprob_reward/std": 0.2834149897098541, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 711.375, "completions/mean_terminated_length": 666.7142944335938, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 2.228395061728395, "grad_norm": 1.6224512089573222, "kl": 0.452880859375, "learning_rate": 3.0625670704420634e-07, "loss": 0.0962, "num_tokens": 19893826.0, "reward": 0.2972082793712616, "reward_std": 0.1553495079278946, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2330092191696167, "rewards/logprob_reward/std": 0.3103122115135193, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 602.125, "completions/mean_terminated_length": 588.51611328125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 2.2314814814814814, "grad_norm": 1.7024815059405203, "kl": 0.463623046875, "learning_rate": 3.057694817609539e-07, "loss": -0.0479, "num_tokens": 19919418.0, "reward": 0.4942817986011505, "reward_std": 0.10975522547960281, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.44503530859947205, "rewards/logprob_reward/std": 0.31213757395744324, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 658.28125, "completions/mean_terminated_length": 646.4838256835938, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 2.234567901234568, "grad_norm": 1.8099488456546469, "kl": 0.458984375, "learning_rate": 3.0528203345768717e-07, "loss": -0.101, "num_tokens": 19946507.0, "reward": 0.37496617436408997, "reward_std": 0.08704390376806259, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3194068670272827, "rewards/logprob_reward/std": 0.2583277225494385, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 701.4375, "completions/mean_terminated_length": 679.933349609375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.2376543209876543, "grad_norm": 1.7029204440019559, "kl": 0.452392578125, "learning_rate": 3.047943640836931e-07, "loss": -0.0039, "num_tokens": 19975661.0, "reward": 0.4101158380508423, "reward_std": 0.04188385605812073, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.35151755809783936, "rewards/logprob_reward/std": 0.3307645618915558, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 709.09375, "completions/mean_terminated_length": 676.5172119140625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.240740740740741, "grad_norm": 1.7161213590868698, "kl": 0.480224609375, "learning_rate": 3.0430647558914284e-07, "loss": 0.0438, "num_tokens": 20004412.0, "reward": 0.6048932075500488, "reward_std": 0.19477154314517975, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5714092254638672, "rewards/logprob_reward/std": 0.36336666345596313, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 640.625, "completions/mean_terminated_length": 640.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 2.243827160493827, "grad_norm": 1.733149392414438, "kl": 0.495361328125, "learning_rate": 3.038183699250837e-07, "loss": -0.1142, "num_tokens": 20031096.0, "reward": 0.2511615753173828, "reward_std": 0.03225760906934738, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.17837397754192352, "rewards/logprob_reward/std": 0.3063567578792572, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 617.46875, "completions/mean_terminated_length": 617.46875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 2.246913580246914, "grad_norm": 1.8190223156564884, "kl": 0.5185546875, "learning_rate": 3.0333004904343153e-07, "loss": -0.1661, "num_tokens": 20058019.0, "reward": 0.3800065219402313, "reward_std": 0.06788572669029236, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.32153505086898804, "rewards/logprob_reward/std": 0.31045758724212646, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 717.6875, "completions/mean_terminated_length": 707.8064575195312, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.25, "grad_norm": 1.6859154700365333, "kl": 0.478271484375, "learning_rate": 3.0284151489696264e-07, "loss": 0.051, "num_tokens": 20087497.0, "reward": 0.5922437906265259, "reward_std": 0.07908917963504791, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.5504096746444702, "rewards/logprob_reward/std": 0.28946995735168457, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 664.34375, "completions/mean_terminated_length": 664.34375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 2.253086419753086, "grad_norm": 1.6694044830748376, "kl": 0.459228515625, "learning_rate": 3.023527694393064e-07, "loss": -0.0552, "num_tokens": 20115292.0, "reward": 0.2864789366722107, "reward_std": 0.026300884783267975, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.21067103743553162, "rewards/logprob_reward/std": 0.3031250536441803, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 649.40625, "completions/mean_terminated_length": 649.40625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 2.256172839506173, "grad_norm": 1.8429061742557677, "kl": 0.48583984375, "learning_rate": 3.0186381462493704e-07, "loss": -0.0053, "num_tokens": 20142249.0, "reward": 0.5457585453987122, "reward_std": 0.044836148619651794, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.4952872693538666, "rewards/logprob_reward/std": 0.31420621275901794, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 667.6875, "completions/mean_terminated_length": 616.7857666015625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 2.259259259259259, "grad_norm": 1.7625007412485105, "kl": 0.44970703125, "learning_rate": 3.0137465240916614e-07, "loss": -0.1154, "num_tokens": 20170427.0, "reward": 0.22002951800823212, "reward_std": 0.03662740811705589, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.1611439287662506, "rewards/logprob_reward/std": 0.3154308795928955, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 670.03125, "completions/mean_terminated_length": 670.03125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 2.2623456790123457, "grad_norm": 1.9122340188164593, "kl": 0.478759765625, "learning_rate": 3.008852847481346e-07, "loss": -0.143, "num_tokens": 20198048.0, "reward": 0.3749629855155945, "reward_std": 0.11715307086706161, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3228755295276642, "rewards/logprob_reward/std": 0.31561946868896484, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 693.59375, "completions/mean_terminated_length": 659.413818359375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 2.265432098765432, "grad_norm": 1.8091997662948105, "kl": 0.453369140625, "learning_rate": 3.003957135988049e-07, "loss": 0.0191, "num_tokens": 20226607.0, "reward": 0.4413583278656006, "reward_std": 0.07858394831418991, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3897037208080292, "rewards/logprob_reward/std": 0.28788360953330994, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 666.28125, "completions/mean_terminated_length": 629.27587890625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 2.2685185185185186, "grad_norm": 1.7159220783645421, "kl": 0.497802734375, "learning_rate": 2.999059409189533e-07, "loss": -0.1155, "num_tokens": 20254852.0, "reward": 0.2918258309364319, "reward_std": 0.1298176795244217, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.23397311568260193, "rewards/logprob_reward/std": 0.2819875180721283, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 720.65625, "completions/mean_terminated_length": 689.27587890625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 2.271604938271605, "grad_norm": 1.5254332834496518, "kl": 0.4482421875, "learning_rate": 2.9941596866716174e-07, "loss": 0.0419, "num_tokens": 20285105.0, "reward": 0.40915751457214355, "reward_std": 0.1256667673587799, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35739725828170776, "rewards/logprob_reward/std": 0.3135104179382324, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 661.21875, "completions/mean_terminated_length": 649.51611328125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.2746913580246915, "grad_norm": 1.5301917556965232, "kl": 0.44970703125, "learning_rate": 2.989257988028105e-07, "loss": -0.062, "num_tokens": 20312784.0, "reward": 0.2961769104003906, "reward_std": 0.09869799017906189, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22839099168777466, "rewards/logprob_reward/std": 0.27328935265541077, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 661.65625, "completions/mean_terminated_length": 649.9677124023438, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 2.2777777777777777, "grad_norm": 1.55836751306861, "kl": 0.499267578125, "learning_rate": 2.984354332860702e-07, "loss": -0.1299, "num_tokens": 20340421.0, "reward": 0.23516541719436646, "reward_std": 0.10262149572372437, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.1640726923942566, "rewards/logprob_reward/std": 0.26426735520362854, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 645.15625, "completions/mean_terminated_length": 645.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 2.2808641975308643, "grad_norm": 1.5183547317755983, "kl": 0.45654296875, "learning_rate": 2.979448740778935e-07, "loss": -0.0952, "num_tokens": 20367406.0, "reward": 0.3453006148338318, "reward_std": 0.04703931882977486, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.27602845430374146, "rewards/logprob_reward/std": 0.2904045283794403, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 705.78125, "completions/mean_terminated_length": 684.5667114257812, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.2839506172839505, "grad_norm": 1.4210077898977274, "kl": 0.47607421875, "learning_rate": 2.9745412314000786e-07, "loss": -0.1381, "num_tokens": 20396927.0, "reward": 0.27291643619537354, "reward_std": 0.11545917391777039, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20601823925971985, "rewards/logprob_reward/std": 0.29720136523246765, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 615.2257690429688, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 2.287037037037037, "grad_norm": 1.5604061046226112, "kl": 0.44775390625, "learning_rate": 2.9696318243490746e-07, "loss": -0.2407, "num_tokens": 20423563.0, "reward": 0.36318957805633545, "reward_std": 0.10056814551353455, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.30632179975509644, "rewards/logprob_reward/std": 0.3253307044506073, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 643.5625, "completions/mean_terminated_length": 643.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.2901234567901234, "grad_norm": 1.6167921814620831, "kl": 0.47314453125, "learning_rate": 2.9647205392584533e-07, "loss": -0.1703, "num_tokens": 20450577.0, "reward": 0.30634933710098267, "reward_std": 0.09227588027715683, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23969370126724243, "rewards/logprob_reward/std": 0.30089661478996277, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 665.71875, "completions/mean_terminated_length": 641.8333740234375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 2.29320987654321, "grad_norm": 1.6638971248513061, "kl": 0.440185546875, "learning_rate": 2.959807395768255e-07, "loss": 0.126, "num_tokens": 20478280.0, "reward": 0.5275158882141113, "reward_std": 0.09682638943195343, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.48196205496788025, "rewards/logprob_reward/std": 0.27469196915626526, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 684.75, "completions/mean_terminated_length": 673.8064575195312, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 2.2962962962962963, "grad_norm": 1.6274463721095571, "kl": 0.484130859375, "learning_rate": 2.95489241352595e-07, "loss": -0.0179, "num_tokens": 20506796.0, "reward": 0.33246132731437683, "reward_std": 0.04021794721484184, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2687070369720459, "rewards/logprob_reward/std": 0.2984887361526489, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 654.875, "completions/mean_terminated_length": 630.2667236328125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 2.299382716049383, "grad_norm": 1.5562073409005737, "kl": 0.458251953125, "learning_rate": 2.949975612186366e-07, "loss": -0.1476, "num_tokens": 20534476.0, "reward": 0.29937249422073364, "reward_std": 0.07975244522094727, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23541387915611267, "rewards/logprob_reward/std": 0.30050328373908997, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 703.71875, "completions/mean_terminated_length": 703.71875, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 2.302469135802469, "grad_norm": 1.526323186452502, "kl": 0.453369140625, "learning_rate": 2.9450570114116014e-07, "loss": 0.0029, "num_tokens": 20563199.0, "reward": 0.42175859212875366, "reward_std": 0.04641527310013771, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.3575095534324646, "rewards/logprob_reward/std": 0.2974914014339447, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 643.0625, "completions/mean_terminated_length": 588.6428833007812, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 2.3055555555555554, "grad_norm": 2.1849677660036737, "kl": 0.496826171875, "learning_rate": 2.9401366308709513e-07, "loss": -0.3376, "num_tokens": 20590289.0, "reward": 0.230956569314003, "reward_std": 0.15202507376670837, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.1802295297384262, "rewards/logprob_reward/std": 0.28506940603256226, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 698.96875, "completions/mean_terminated_length": 688.4838256835938, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.308641975308642, "grad_norm": 1.574666273860599, "kl": 0.46875, "learning_rate": 2.9352144902408296e-07, "loss": -0.0264, "num_tokens": 20619268.0, "reward": 0.46434375643730164, "reward_std": 0.08767154812812805, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4082986116409302, "rewards/logprob_reward/std": 0.2668216824531555, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 689.96875, "completions/mean_terminated_length": 628.1111450195312, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 2.3117283950617282, "grad_norm": 1.506985869377965, "kl": 0.46533203125, "learning_rate": 2.930290609204686e-07, "loss": -0.0342, "num_tokens": 20647839.0, "reward": 0.2441512644290924, "reward_std": 0.13996346294879913, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.18100138008594513, "rewards/logprob_reward/std": 0.24731674790382385, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 684.71875, "completions/mean_terminated_length": 684.71875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 2.314814814814815, "grad_norm": 1.7151587198627212, "kl": 0.47998046875, "learning_rate": 2.925365007452933e-07, "loss": 0.0091, "num_tokens": 20676134.0, "reward": 0.348962664604187, "reward_std": 0.01183898001909256, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.27662521600723267, "rewards/logprob_reward/std": 0.2915521562099457, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 668.9375, "completions/mean_terminated_length": 618.2142944335938, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 2.317901234567901, "grad_norm": 1.590400317110339, "kl": 0.469970703125, "learning_rate": 2.920437704682861e-07, "loss": -0.0237, "num_tokens": 20703784.0, "reward": 0.17024844884872437, "reward_std": 0.09864601492881775, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.09541495144367218, "rewards/logprob_reward/std": 0.2429906725883484, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 691.5625, "completions/mean_terminated_length": 680.8386840820312, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 2.3209876543209877, "grad_norm": 1.4583624585015498, "kl": 0.451171875, "learning_rate": 2.915508720598566e-07, "loss": -0.0026, "num_tokens": 20733058.0, "reward": 0.2676784098148346, "reward_std": 0.0272978488355875, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.196725994348526, "rewards/logprob_reward/std": 0.3164057433605194, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 667.46875, "completions/mean_terminated_length": 655.9677124023438, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 2.324074074074074, "grad_norm": 1.7632195854243486, "kl": 0.45263671875, "learning_rate": 2.910578074910865e-07, "loss": 0.0377, "num_tokens": 20761085.0, "reward": 0.32235467433929443, "reward_std": 0.09719831496477127, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.25053298473358154, "rewards/logprob_reward/std": 0.2732374370098114, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 659.59375, "completions/mean_terminated_length": 647.8386840820312, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 2.3271604938271606, "grad_norm": 1.5595336417662529, "kl": 0.4287109375, "learning_rate": 2.9056457873372213e-07, "loss": -0.0621, "num_tokens": 20789092.0, "reward": 0.4815666079521179, "reward_std": 0.09370475262403488, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.43785178661346436, "rewards/logprob_reward/std": 0.35662686824798584, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 654.625, "completions/mean_terminated_length": 616.413818359375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 2.330246913580247, "grad_norm": 1.7999251004826717, "kl": 0.47900390625, "learning_rate": 2.9007118776016635e-07, "loss": 0.0369, "num_tokens": 20817032.0, "reward": 0.3936396837234497, "reward_std": 0.15372705459594727, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3366829752922058, "rewards/logprob_reward/std": 0.3303096890449524, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 720.3125, "completions/mean_terminated_length": 710.51611328125, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 2.3333333333333335, "grad_norm": 1.6841827507063762, "kl": 0.46826171875, "learning_rate": 2.895776365434706e-07, "loss": -0.0887, "num_tokens": 20846678.0, "reward": 0.4823007881641388, "reward_std": 0.16205072402954102, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.43866756558418274, "rewards/logprob_reward/std": 0.34586983919143677, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 659.71875, "completions/mean_terminated_length": 635.433349609375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.3364197530864197, "grad_norm": 1.8654860966028026, "kl": 0.44775390625, "learning_rate": 2.8908392705732724e-07, "loss": -0.3218, "num_tokens": 20874453.0, "reward": 0.4439699053764343, "reward_std": 0.230706125497818, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3995498716831207, "rewards/logprob_reward/std": 0.36121416091918945, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 600.5, "completions/mean_terminated_length": 586.8386840820312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 2.3395061728395063, "grad_norm": 1.8285912980312045, "kl": 0.489501953125, "learning_rate": 2.885900612760616e-07, "loss": -0.2085, "num_tokens": 20899593.0, "reward": 0.43208223581314087, "reward_std": 0.12648561596870422, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.382869154214859, "rewards/logprob_reward/std": 0.3508533537387848, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 681.0, "completions/mean_terminated_length": 658.1333618164062, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 2.3425925925925926, "grad_norm": 1.665767452065642, "kl": 0.493896484375, "learning_rate": 2.8809604117462397e-07, "loss": -0.074, "num_tokens": 20927901.0, "reward": 0.5073033571243286, "reward_std": 0.1381436586380005, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.46644818782806396, "rewards/logprob_reward/std": 0.32971692085266113, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 607.53125, "completions/mean_terminated_length": 607.53125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.3456790123456788, "grad_norm": 1.613673320804034, "kl": 0.4736328125, "learning_rate": 2.876018687285817e-07, "loss": -0.079, "num_tokens": 20953686.0, "reward": 0.42254993319511414, "reward_std": 0.08217039704322815, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3618610203266144, "rewards/logprob_reward/std": 0.3114003837108612, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 665.46875, "completions/mean_terminated_length": 653.9031982421875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 2.3487654320987654, "grad_norm": 1.4974139734095249, "kl": 0.4697265625, "learning_rate": 2.8710754591411147e-07, "loss": -0.1312, "num_tokens": 20981489.0, "reward": 0.47583597898483276, "reward_std": 0.12034577876329422, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4280121922492981, "rewards/logprob_reward/std": 0.29881569743156433, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 646.375, "completions/mean_terminated_length": 634.1935424804688, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.351851851851852, "grad_norm": 1.757159493826228, "kl": 0.478515625, "learning_rate": 2.8661307470799114e-07, "loss": -0.1621, "num_tokens": 21008429.0, "reward": 0.3171681761741638, "reward_std": 0.0553818978369236, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.24824240803718567, "rewards/logprob_reward/std": 0.26653721928596497, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 625.0625, "completions/mean_terminated_length": 625.0625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 2.3549382716049383, "grad_norm": 2.021194603010977, "kl": 0.502685546875, "learning_rate": 2.861184570875921e-07, "loss": -0.2357, "num_tokens": 21035375.0, "reward": 0.31718799471855164, "reward_std": 0.07763171195983887, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2517366409301758, "rewards/logprob_reward/std": 0.3268481194972992, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 613.53125, "completions/mean_terminated_length": 613.53125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.3580246913580245, "grad_norm": 1.8080341193700522, "kl": 0.481689453125, "learning_rate": 2.856236950308711e-07, "loss": -0.0947, "num_tokens": 21061244.0, "reward": 0.3404463529586792, "reward_std": 0.042737558484077454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.27063482999801636, "rewards/logprob_reward/std": 0.26255881786346436, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 680.15625, "completions/mean_terminated_length": 669.0645141601562, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 2.361111111111111, "grad_norm": 1.7722477078771548, "kl": 0.44580078125, "learning_rate": 2.851287905163628e-07, "loss": 0.0189, "num_tokens": 21089757.0, "reward": 0.3283139169216156, "reward_std": 0.030004817992448807, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.25715434551239014, "rewards/logprob_reward/std": 0.32180550694465637, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 668.03125, "completions/mean_terminated_length": 631.2069091796875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.3641975308641974, "grad_norm": 1.751446835254529, "kl": 0.466552734375, "learning_rate": 2.8463374552317123e-07, "loss": 0.006, "num_tokens": 21118018.0, "reward": 0.3956031799316406, "reward_std": 0.07900096476078033, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33886459469795227, "rewards/logprob_reward/std": 0.2969200313091278, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 719.5, "completions/mean_terminated_length": 699.2000122070312, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 2.367283950617284, "grad_norm": 1.5995106666276824, "kl": 0.4326171875, "learning_rate": 2.8413856203096226e-07, "loss": -0.0644, "num_tokens": 21147954.0, "reward": 0.36191868782043457, "reward_std": 0.07143624126911163, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2979651987552643, "rewards/logprob_reward/std": 0.3082290291786194, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 674.125, "completions/mean_terminated_length": 674.125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 2.3703703703703702, "grad_norm": 1.6119173977077033, "kl": 0.477783203125, "learning_rate": 2.836432420199557e-07, "loss": -0.0994, "num_tokens": 21175790.0, "reward": 0.3598131537437439, "reward_std": 0.08605542778968811, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.292153537273407, "rewards/logprob_reward/std": 0.2706261873245239, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 641.5, "completions/mean_terminated_length": 616.0000610351562, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 2.373456790123457, "grad_norm": 1.647587259909585, "kl": 0.47705078125, "learning_rate": 2.831477874709172e-07, "loss": 0.0047, "num_tokens": 21202706.0, "reward": 0.43583619594573975, "reward_std": 0.06834018230438232, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3835679888725281, "rewards/logprob_reward/std": 0.34199875593185425, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 688.34375, "completions/mean_terminated_length": 665.9666748046875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.376543209876543, "grad_norm": 1.5741057518400055, "kl": 0.466796875, "learning_rate": 2.826522003651504e-07, "loss": -0.205, "num_tokens": 21231189.0, "reward": 0.357082337141037, "reward_std": 0.1235959529876709, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30300813913345337, "rewards/logprob_reward/std": 0.27238768339157104, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 713.90625, "completions/mean_terminated_length": 681.8275756835938, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.3796296296296298, "grad_norm": 1.5938919510035199, "kl": 0.4326171875, "learning_rate": 2.8215648268448926e-07, "loss": 0.0412, "num_tokens": 21260698.0, "reward": 0.42854738235473633, "reward_std": 0.15699049830436707, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3754693269729614, "rewards/logprob_reward/std": 0.31307268142700195, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 658.96875, "completions/mean_terminated_length": 647.1935424804688, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 2.382716049382716, "grad_norm": 1.66741573379443, "kl": 0.49072265625, "learning_rate": 2.8166063641128963e-07, "loss": 0.0022, "num_tokens": 21288161.0, "reward": 0.42904025316238403, "reward_std": 0.07219739258289337, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3690725266933441, "rewards/logprob_reward/std": 0.3360343277454376, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 651.09375, "completions/mean_terminated_length": 639.0645141601562, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 2.3858024691358026, "grad_norm": 1.6019351927086767, "kl": 0.48876953125, "learning_rate": 2.8116466352842165e-07, "loss": -0.1125, "num_tokens": 21315692.0, "reward": 0.4852568805217743, "reward_std": 0.055166251957416534, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.43500763177871704, "rewards/logprob_reward/std": 0.34550899267196655, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 706.6875, "completions/mean_terminated_length": 696.4515991210938, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 2.388888888888889, "grad_norm": 1.7170663048805457, "kl": 0.474853515625, "learning_rate": 2.80668566019262e-07, "loss": -0.0374, "num_tokens": 21345186.0, "reward": 0.4603572487831116, "reward_std": 0.03763112798333168, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4073413908481598, "rewards/logprob_reward/std": 0.37808045744895935, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 713.09375, "completions/mean_terminated_length": 692.36669921875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 2.3919753086419755, "grad_norm": 1.707517061235574, "kl": 0.440185546875, "learning_rate": 2.8017234586768534e-07, "loss": -0.0716, "num_tokens": 21374829.0, "reward": 0.2703780233860016, "reward_std": 0.04431023448705673, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.19625332951545715, "rewards/logprob_reward/std": 0.295492559671402, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 697.9375, "completions/mean_terminated_length": 676.2000122070312, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 2.3950617283950617, "grad_norm": 1.9826822888313012, "kl": 0.459228515625, "learning_rate": 2.796760050580571e-07, "loss": -0.0703, "num_tokens": 21403379.0, "reward": 0.41871219873428345, "reward_std": 0.06391759216785431, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.36106911301612854, "rewards/logprob_reward/std": 0.2663119435310364, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 700.375, "completions/mean_terminated_length": 666.8965454101562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.398148148148148, "grad_norm": 1.7503266332618628, "kl": 0.44873046875, "learning_rate": 2.7917954557522503e-07, "loss": 0.0387, "num_tokens": 21432315.0, "reward": 0.3177770972251892, "reward_std": 0.15793344378471375, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2558634579181671, "rewards/logprob_reward/std": 0.33196964859962463, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 686.71875, "completions/mean_terminated_length": 651.8275756835938, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.4012345679012346, "grad_norm": 1.528322662854093, "kl": 0.461181640625, "learning_rate": 2.786829694045116e-07, "loss": 0.0123, "num_tokens": 21460786.0, "reward": 0.38607722520828247, "reward_std": 0.07318758219480515, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3421691060066223, "rewards/logprob_reward/std": 0.2766576111316681, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 689.4375, "completions/mean_terminated_length": 678.6451416015625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 2.4043209876543212, "grad_norm": 1.4691188753055855, "kl": 0.447509765625, "learning_rate": 2.7818627853170585e-07, "loss": -0.0676, "num_tokens": 21489200.0, "reward": 0.24418659508228302, "reward_std": 0.020462067797780037, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.1636795699596405, "rewards/logprob_reward/std": 0.2964684069156647, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 727.375, "completions/mean_terminated_length": 707.6000366210938, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 2.4074074074074074, "grad_norm": 1.6969455082981286, "kl": 0.47119140625, "learning_rate": 2.7768947494305545e-07, "loss": -0.0137, "num_tokens": 21519524.0, "reward": 0.44820350408554077, "reward_std": 0.07451047748327255, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.39383721351623535, "rewards/logprob_reward/std": 0.3779621124267578, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 679.0625, "completions/mean_terminated_length": 656.0667114257812, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 2.4104938271604937, "grad_norm": 1.4690491879406211, "kl": 0.45751953125, "learning_rate": 2.7719256062525884e-07, "loss": -0.1628, "num_tokens": 21547702.0, "reward": 0.2319009006023407, "reward_std": 0.12212012708187103, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.163917675614357, "rewards/logprob_reward/std": 0.28209683299064636, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 731.15625, "completions/mean_terminated_length": 711.6333618164062, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.4135802469135803, "grad_norm": 1.7253139053991688, "kl": 0.439697265625, "learning_rate": 2.766955375654573e-07, "loss": 0.034, "num_tokens": 21577643.0, "reward": 0.3681906461715698, "reward_std": 0.11222346872091293, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3084062337875366, "rewards/logprob_reward/std": 0.30806270241737366, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 731.0625, "completions/mean_terminated_length": 689.2142944335938, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 2.4166666666666665, "grad_norm": 1.681262888501358, "kl": 0.464599609375, "learning_rate": 2.7619840775122695e-07, "loss": -0.0406, "num_tokens": 21607677.0, "reward": 0.3835669755935669, "reward_std": 0.11109921336174011, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3324355483055115, "rewards/logprob_reward/std": 0.28979504108428955, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 660.40625, "completions/mean_terminated_length": 622.7930908203125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 2.419753086419753, "grad_norm": 1.68135858574482, "kl": 0.49072265625, "learning_rate": 2.7570117317057087e-07, "loss": -0.1125, "num_tokens": 21635042.0, "reward": 0.38211822509765625, "reward_std": 0.09188269823789597, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3308258056640625, "rewards/logprob_reward/std": 0.30932605266571045, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 706.96875, "completions/mean_terminated_length": 674.1724243164062, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.4228395061728394, "grad_norm": 1.588731789212885, "kl": 0.467041015625, "learning_rate": 2.7520383581191085e-07, "loss": -0.1326, "num_tokens": 21663985.0, "reward": 0.34853044152259827, "reward_std": 0.07723263651132584, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.29350602626800537, "rewards/logprob_reward/std": 0.3292677402496338, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 630.53125, "completions/mean_terminated_length": 617.8386840820312, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.425925925925926, "grad_norm": 1.7455689332722968, "kl": 0.5126953125, "learning_rate": 2.7470639766408003e-07, "loss": -0.0755, "num_tokens": 21689946.0, "reward": 0.3185606002807617, "reward_std": 0.07213710993528366, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.24978958070278168, "rewards/logprob_reward/std": 0.3083517253398895, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 693.5625, "completions/mean_terminated_length": 632.370361328125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 2.4290123456790123, "grad_norm": 1.6065002830748993, "kl": 0.4521484375, "learning_rate": 2.7420886071631455e-07, "loss": -0.0038, "num_tokens": 21718592.0, "reward": 0.288696825504303, "reward_std": 0.10083666443824768, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.24091316759586334, "rewards/logprob_reward/std": 0.3241763710975647, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 745.53125, "completions/mean_terminated_length": 716.72412109375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.432098765432099, "grad_norm": 1.5794518463263905, "kl": 0.456298828125, "learning_rate": 2.7371122695824534e-07, "loss": -0.0615, "num_tokens": 21749549.0, "reward": 0.26883482933044434, "reward_std": 0.08599147945642471, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.21884429454803467, "rewards/logprob_reward/std": 0.28142860531806946, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 712.125, "completions/mean_terminated_length": 702.0645141601562, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.435185185185185, "grad_norm": 1.652635397138663, "kl": 0.44482421875, "learning_rate": 2.732134983798907e-07, "loss": -0.0307, "num_tokens": 21778781.0, "reward": 0.4192216396331787, "reward_std": 0.08457984775304794, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.36163514852523804, "rewards/logprob_reward/std": 0.30210474133491516, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 671.1875, "completions/mean_terminated_length": 634.6896362304688, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 2.4382716049382718, "grad_norm": 1.8167689300618, "kl": 0.47412109375, "learning_rate": 2.727156769716482e-07, "loss": -0.0783, "num_tokens": 21806643.0, "reward": 0.45189931988716125, "reward_std": 0.17996907234191895, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4118325710296631, "rewards/logprob_reward/std": 0.33738788962364197, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 698.3125, "completions/mean_terminated_length": 664.6206665039062, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.441358024691358, "grad_norm": 1.5791956080972562, "kl": 0.45556640625, "learning_rate": 2.722177647242863e-07, "loss": -0.0439, "num_tokens": 21835121.0, "reward": 0.3335627317428589, "reward_std": 0.11624830961227417, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2734030485153198, "rewards/logprob_reward/std": 0.27783873677253723, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 720.40625, "completions/mean_terminated_length": 700.1666870117188, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.4444444444444446, "grad_norm": 1.6960424937301624, "kl": 0.46923828125, "learning_rate": 2.717197636289373e-07, "loss": -0.0047, "num_tokens": 21864682.0, "reward": 0.353654146194458, "reward_std": 0.03367337957024574, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.28878235816955566, "rewards/logprob_reward/std": 0.3158867359161377, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 645.375, "completions/mean_terminated_length": 645.375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 2.447530864197531, "grad_norm": 1.6987274549920954, "kl": 0.462646484375, "learning_rate": 2.712216756770881e-07, "loss": -0.0352, "num_tokens": 21891502.0, "reward": 0.47273024916648865, "reward_std": 0.07203876227140427, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42108914256095886, "rewards/logprob_reward/std": 0.3339870870113373, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 647.5, "completions/mean_terminated_length": 622.4000244140625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 2.450617283950617, "grad_norm": 1.6361015215817512, "kl": 0.48583984375, "learning_rate": 2.7072350286057354e-07, "loss": -0.0693, "num_tokens": 21918650.0, "reward": 0.39775049686431885, "reward_std": 0.0615319088101387, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.33777832984924316, "rewards/logprob_reward/std": 0.30343782901763916, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 702.4375, "completions/mean_terminated_length": 669.1724243164062, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.4537037037037037, "grad_norm": 1.4447603271305212, "kl": 0.45166015625, "learning_rate": 2.7022524717156734e-07, "loss": -0.1298, "num_tokens": 21947840.0, "reward": 0.30354562401771545, "reward_std": 0.13660451769828796, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24699513614177704, "rewards/logprob_reward/std": 0.3191279470920563, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 769.65625, "completions/mean_terminated_length": 743.3448486328125, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 2.45679012345679, "grad_norm": 1.6231483812218137, "kl": 0.42578125, "learning_rate": 2.6972691060257504e-07, "loss": -0.1024, "num_tokens": 21979261.0, "reward": 0.2865647077560425, "reward_std": 0.053033120930194855, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22118304669857025, "rewards/logprob_reward/std": 0.293706476688385, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 658.46875, "completions/mean_terminated_length": 658.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 2.4598765432098766, "grad_norm": 1.7586865185255538, "kl": 0.510986328125, "learning_rate": 2.6922849514642524e-07, "loss": -0.0984, "num_tokens": 22006732.0, "reward": 0.5659425258636475, "reward_std": 0.024211421608924866, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.5177139043807983, "rewards/logprob_reward/std": 0.2997903525829315, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 646.78125, "completions/mean_terminated_length": 634.6128540039062, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 2.462962962962963, "grad_norm": 1.6941279342364832, "kl": 0.462158203125, "learning_rate": 2.687300027962624e-07, "loss": -0.0586, "num_tokens": 22034025.0, "reward": 0.3576476573944092, "reward_std": 0.05269106477499008, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.29321959614753723, "rewards/logprob_reward/std": 0.3156810700893402, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 664.25, "completions/mean_terminated_length": 652.6451416015625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 2.4660493827160495, "grad_norm": 1.5782410399983742, "kl": 0.451904296875, "learning_rate": 2.682314355455381e-07, "loss": 0.0106, "num_tokens": 22061813.0, "reward": 0.4643767178058624, "reward_std": 0.04420530050992966, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.41180747747421265, "rewards/logprob_reward/std": 0.31941792368888855, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 722.78125, "completions/mean_terminated_length": 691.6206665039062, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 2.4691358024691357, "grad_norm": 1.5958725073134328, "kl": 0.48486328125, "learning_rate": 2.677327953880038e-07, "loss": 0.0186, "num_tokens": 22091738.0, "reward": 0.5024321675300598, "reward_std": 0.04700267314910889, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.457563579082489, "rewards/logprob_reward/std": 0.32538318634033203, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 693.4375, "completions/mean_terminated_length": 671.4000244140625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 2.4722222222222223, "grad_norm": 1.406252013197836, "kl": 0.4619140625, "learning_rate": 2.6723408431770214e-07, "loss": -0.1586, "num_tokens": 22120720.0, "reward": 0.24721704423427582, "reward_std": 0.07463710010051727, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1809355914592743, "rewards/logprob_reward/std": 0.3008284568786621, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 690.75, "completions/mean_terminated_length": 680.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 2.4753086419753085, "grad_norm": 1.5807418663582646, "kl": 0.4423828125, "learning_rate": 2.6673530432895957e-07, "loss": -0.0385, "num_tokens": 22149500.0, "reward": 0.40218865871429443, "reward_std": 0.07978276163339615, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.34270966053009033, "rewards/logprob_reward/std": 0.35028383135795593, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 660.21875, "completions/mean_terminated_length": 648.4838256835938, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.478395061728395, "grad_norm": 1.6979751327737607, "kl": 0.475341796875, "learning_rate": 2.6623645741637815e-07, "loss": -0.1152, "num_tokens": 22176983.0, "reward": 0.4935344457626343, "reward_std": 0.15645596385002136, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4476771354675293, "rewards/logprob_reward/std": 0.3220255970954895, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 676.8125, "completions/mean_terminated_length": 665.6128540039062, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.4814814814814814, "grad_norm": 1.7512152932336211, "kl": 0.479248046875, "learning_rate": 2.6573754557482746e-07, "loss": -0.0132, "num_tokens": 22205017.0, "reward": 0.38634347915649414, "reward_std": 0.069093719124794, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.32163164019584656, "rewards/logprob_reward/std": 0.2911610007286072, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 746.125, "completions/mean_terminated_length": 694.6666870117188, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 2.484567901234568, "grad_norm": 1.6752375481129191, "kl": 0.46435546875, "learning_rate": 2.652385707994369e-07, "loss": -0.0154, "num_tokens": 22235205.0, "reward": 0.25764429569244385, "reward_std": 0.07367034256458282, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.19599363207817078, "rewards/logprob_reward/std": 0.3119756281375885, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 652.96875, "completions/mean_terminated_length": 641.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 2.4876543209876543, "grad_norm": 1.7700143609171248, "kl": 0.50830078125, "learning_rate": 2.6473953508558726e-07, "loss": 0.0206, "num_tokens": 22262564.0, "reward": 0.5549646019935608, "reward_std": 0.07199626415967941, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.5089884400367737, "rewards/logprob_reward/std": 0.31305375695228577, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 647.0, "completions/mean_terminated_length": 621.86669921875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 2.490740740740741, "grad_norm": 1.5986281015517148, "kl": 0.49951171875, "learning_rate": 2.6424044042890334e-07, "loss": 0.0085, "num_tokens": 22289172.0, "reward": 0.5017445087432861, "reward_std": 0.17483356595039368, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4533272385597229, "rewards/logprob_reward/std": 0.3023810386657715, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 690.59375, "completions/mean_terminated_length": 656.1034545898438, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.493827160493827, "grad_norm": 1.515018379336347, "kl": 0.49072265625, "learning_rate": 2.6374128882524527e-07, "loss": 0.0264, "num_tokens": 22317603.0, "reward": 0.3284122347831726, "reward_std": 0.09335949271917343, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26420801877975464, "rewards/logprob_reward/std": 0.3220413029193878, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 645.84375, "completions/mean_terminated_length": 591.8214721679688, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 2.496913580246914, "grad_norm": 1.7779902586216205, "kl": 0.51904296875, "learning_rate": 2.6324208227070136e-07, "loss": -0.047, "num_tokens": 22344922.0, "reward": 0.5048031806945801, "reward_std": 0.2261616736650467, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.46714240312576294, "rewards/logprob_reward/std": 0.41016268730163574, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 696.71875, "completions/mean_terminated_length": 674.9000244140625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 2.5, "grad_norm": 1.4925439070076332, "kl": 0.44873046875, "learning_rate": 2.6274282276157934e-07, "loss": -0.011, "num_tokens": 22373369.0, "reward": 0.47191673517227173, "reward_std": 0.05190592631697655, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42018529772758484, "rewards/logprob_reward/std": 0.3147827982902527, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 705.53125, "completions/mean_terminated_length": 672.586181640625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.503086419753086, "grad_norm": 1.440310778000435, "kl": 0.476806640625, "learning_rate": 2.622435122943987e-07, "loss": 0.0049, "num_tokens": 22402438.0, "reward": 0.3325193226337433, "reward_std": 0.10486244410276413, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2687714695930481, "rewards/logprob_reward/std": 0.2609803378582001, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 685.71875, "completions/mean_terminated_length": 663.1666870117188, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 2.506172839506173, "grad_norm": 1.541147722268763, "kl": 0.517333984375, "learning_rate": 2.61744152865883e-07, "loss": -0.0228, "num_tokens": 22431161.0, "reward": 0.38658440113067627, "reward_std": 0.06495478749275208, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3253715932369232, "rewards/logprob_reward/std": 0.3246171176433563, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 705.25, "completions/mean_terminated_length": 659.7142944335938, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 2.5092592592592595, "grad_norm": 1.81279122587084, "kl": 0.474609375, "learning_rate": 2.6124474647295137e-07, "loss": 0.0228, "num_tokens": 22459953.0, "reward": 0.3754402995109558, "reward_std": 0.1371288299560547, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3303503394126892, "rewards/logprob_reward/std": 0.36141639947891235, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 695.3125, "completions/mean_terminated_length": 673.4000244140625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.5123456790123457, "grad_norm": 1.7487386057971201, "kl": 0.48046875, "learning_rate": 2.607452951127107e-07, "loss": -0.0416, "num_tokens": 22488531.0, "reward": 0.4579329192638397, "reward_std": 0.09922948479652405, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4046476483345032, "rewards/logprob_reward/std": 0.2927602529525757, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 698.1875, "completions/mean_terminated_length": 664.4827270507812, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 2.515432098765432, "grad_norm": 1.7475635958856053, "kl": 0.5107421875, "learning_rate": 2.6024580078244777e-07, "loss": 0.0363, "num_tokens": 22517377.0, "reward": 0.2742580473423004, "reward_std": 0.03600156307220459, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.20403672754764557, "rewards/logprob_reward/std": 0.3146376311779022, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 603.8125, "completions/mean_terminated_length": 603.8125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 2.5185185185185186, "grad_norm": 1.4863154296138088, "kl": 0.508056640625, "learning_rate": 2.5974626547962127e-07, "loss": -0.1453, "num_tokens": 22542475.0, "reward": 0.388713538646698, "reward_std": 0.10129021108150482, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3277372419834137, "rewards/logprob_reward/std": 0.2906053066253662, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 676.4375, "completions/mean_terminated_length": 653.2667236328125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 2.521604938271605, "grad_norm": 1.730874464419731, "kl": 0.47900390625, "learning_rate": 2.5924669120185373e-07, "loss": -0.0055, "num_tokens": 22570421.0, "reward": 0.3788033723831177, "reward_std": 0.13281214237213135, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3201981782913208, "rewards/logprob_reward/std": 0.2801540493965149, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 670.75, "completions/mean_terminated_length": 620.2857666015625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 2.5246913580246915, "grad_norm": 1.8110867648940778, "kl": 0.484130859375, "learning_rate": 2.5874707994692333e-07, "loss": 0.0586, "num_tokens": 22598605.0, "reward": 0.4630173444747925, "reward_std": 0.11789367347955704, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.41724148392677307, "rewards/logprob_reward/std": 0.35140904784202576, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 649.28125, "completions/mean_terminated_length": 649.28125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.5277777777777777, "grad_norm": 1.5791724061340933, "kl": 0.50244140625, "learning_rate": 2.582474337127564e-07, "loss": -0.028, "num_tokens": 22625462.0, "reward": 0.5131837129592896, "reward_std": 0.04158104211091995, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4660375118255615, "rewards/logprob_reward/std": 0.32720914483070374, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 682.34375, "completions/mean_terminated_length": 647.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 2.5308641975308643, "grad_norm": 1.6486119986451242, "kl": 0.48681640625, "learning_rate": 2.5774775449741903e-07, "loss": -0.0405, "num_tokens": 22654241.0, "reward": 0.5108381509780884, "reward_std": 0.10270988941192627, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.47384798526763916, "rewards/logprob_reward/std": 0.30215731263160706, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 710.0, "completions/mean_terminated_length": 651.8518676757812, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 2.5339506172839505, "grad_norm": 1.544751634897681, "kl": 0.514892578125, "learning_rate": 2.572480442991092e-07, "loss": -0.0729, "num_tokens": 22683949.0, "reward": 0.37141939997673035, "reward_std": 0.16021966934204102, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3189382553100586, "rewards/logprob_reward/std": 0.31684011220932007, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 701.5625, "completions/mean_terminated_length": 680.0667114257812, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.537037037037037, "grad_norm": 1.6286194633204285, "kl": 0.533203125, "learning_rate": 2.567483051161487e-07, "loss": 0.0488, "num_tokens": 22713279.0, "reward": 0.31688347458839417, "reward_std": 0.11532354354858398, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.24792610108852386, "rewards/logprob_reward/std": 0.2903689742088318, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 674.375, "completions/mean_terminated_length": 674.375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 2.5401234567901234, "grad_norm": 1.6202767790650563, "kl": 0.470703125, "learning_rate": 2.562485389469754e-07, "loss": -0.0134, "num_tokens": 22741259.0, "reward": 0.2672231197357178, "reward_std": 0.019373975694179535, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.18580347299575806, "rewards/logprob_reward/std": 0.2839396595954895, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 647.21875, "completions/mean_terminated_length": 635.0645141601562, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 2.5432098765432096, "grad_norm": 1.7115536407929939, "kl": 0.52099609375, "learning_rate": 2.5574874779013494e-07, "loss": 0.0081, "num_tokens": 22768090.0, "reward": 0.34650537371635437, "reward_std": 0.07716759294271469, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.27736708521842957, "rewards/logprob_reward/std": 0.32999470829963684, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 631.25, "completions/mean_terminated_length": 605.0667114257812, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.5462962962962963, "grad_norm": 1.570559649504146, "kl": 0.52392578125, "learning_rate": 2.5524893364427307e-07, "loss": -0.1532, "num_tokens": 22794238.0, "reward": 0.46924713253974915, "reward_std": 0.13800698518753052, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.42069125175476074, "rewards/logprob_reward/std": 0.32838478684425354, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 705.21875, "completions/mean_terminated_length": 694.9354858398438, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.549382716049383, "grad_norm": 1.6432330849106913, "kl": 0.487060546875, "learning_rate": 2.547490985081272e-07, "loss": -0.0933, "num_tokens": 22823457.0, "reward": 0.43407464027404785, "reward_std": 0.0960204154253006, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3816107213497162, "rewards/logprob_reward/std": 0.3085387945175171, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 648.6875, "completions/mean_terminated_length": 623.6666870117188, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 2.552469135802469, "grad_norm": 1.6092280501573333, "kl": 0.462646484375, "learning_rate": 2.5424924438051896e-07, "loss": 0.0593, "num_tokens": 22850595.0, "reward": 0.41767969727516174, "reward_std": 0.11000683903694153, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3633941113948822, "rewards/logprob_reward/std": 0.26872149109840393, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 744.78125, "completions/mean_terminated_length": 726.1666870117188, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 2.5555555555555554, "grad_norm": 1.4350042991675251, "kl": 0.48583984375, "learning_rate": 2.5374937326034575e-07, "loss": -0.0721, "num_tokens": 22881628.0, "reward": 0.4250319004058838, "reward_std": 0.0767422765493393, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37156325578689575, "rewards/logprob_reward/std": 0.3062604069709778, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 701.65625, "completions/mean_terminated_length": 641.9629516601562, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 2.558641975308642, "grad_norm": 1.52997866016975, "kl": 0.482177734375, "learning_rate": 2.5324948714657287e-07, "loss": -0.0019, "num_tokens": 22910773.0, "reward": 0.5132285356521606, "reward_std": 0.13423295319080353, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4765039086341858, "rewards/logprob_reward/std": 0.36376363039016724, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 662.5, "completions/mean_terminated_length": 625.1034545898438, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.5617283950617287, "grad_norm": 1.596476352506031, "kl": 0.46875, "learning_rate": 2.527495880382259e-07, "loss": -0.0607, "num_tokens": 22938289.0, "reward": 0.2702171206474304, "reward_std": 0.19363728165626526, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20996347069740295, "rewards/logprob_reward/std": 0.2958320379257202, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 690.125, "completions/mean_terminated_length": 690.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 2.564814814814815, "grad_norm": 1.7168905420569063, "kl": 0.489013671875, "learning_rate": 2.522496779343819e-07, "loss": 0.0507, "num_tokens": 22967013.0, "reward": 0.6483243703842163, "reward_std": 0.08233045041561127, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.6127215623855591, "rewards/logprob_reward/std": 0.31194931268692017, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 742.375, "completions/mean_terminated_length": 723.6000366210938, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.567901234567901, "grad_norm": 1.4545863246983801, "kl": 0.43603515625, "learning_rate": 2.5174975883416237e-07, "loss": 0.0067, "num_tokens": 22997289.0, "reward": 0.3937372863292694, "reward_std": 0.05238419771194458, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.33331921696662903, "rewards/logprob_reward/std": 0.34520795941352844, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 675.21875, "completions/mean_terminated_length": 639.137939453125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 2.5709876543209877, "grad_norm": 1.5016776540539658, "kl": 0.500732421875, "learning_rate": 2.512498327367245e-07, "loss": -0.1546, "num_tokens": 23025492.0, "reward": 0.42243140935897827, "reward_std": 0.19586004316806793, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.37561821937561035, "rewards/logprob_reward/std": 0.3365665376186371, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 645.40625, "completions/mean_terminated_length": 633.1935424804688, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 2.574074074074074, "grad_norm": 1.7531359165911995, "kl": 0.520751953125, "learning_rate": 2.5074990164125355e-07, "loss": -0.0329, "num_tokens": 23052393.0, "reward": 0.4208226203918457, "reward_std": 0.07042469829320908, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3599418103694916, "rewards/logprob_reward/std": 0.3114452064037323, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 714.65625, "completions/mean_terminated_length": 694.0333862304688, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 2.5771604938271606, "grad_norm": 1.6074066172712087, "kl": 0.518310546875, "learning_rate": 2.502499675469547e-07, "loss": -0.0104, "num_tokens": 23082078.0, "reward": 0.4387815296649933, "reward_std": 0.03708649426698685, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3937850594520569, "rewards/logprob_reward/std": 0.30619168281555176, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 618.4375, "completions/mean_terminated_length": 618.4375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 2.580246913580247, "grad_norm": 1.6523571855527242, "kl": 0.54248046875, "learning_rate": 2.497500324530453e-07, "loss": -0.0131, "num_tokens": 23108148.0, "reward": 0.25967714190483093, "reward_std": 0.024906985461711884, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.1808912605047226, "rewards/logprob_reward/std": 0.30395814776420593, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 736.8125, "completions/mean_terminated_length": 727.54833984375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 2.5833333333333335, "grad_norm": 1.7052648363364185, "kl": 0.502197265625, "learning_rate": 2.4925009835874643e-07, "loss": -0.0641, "num_tokens": 23139146.0, "reward": 0.3631102442741394, "reward_std": 0.0686516985297203, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.29928913712501526, "rewards/logprob_reward/std": 0.3324436843395233, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 673.1875, "completions/mean_terminated_length": 623.0714721679688, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 2.5864197530864197, "grad_norm": 1.6400371544002221, "kl": 0.52880859375, "learning_rate": 2.4875016726327555e-07, "loss": -0.0002, "num_tokens": 23167212.0, "reward": 0.35659778118133545, "reward_std": 0.038892023265361786, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29899752140045166, "rewards/logprob_reward/std": 0.3777339458465576, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 689.78125, "completions/mean_terminated_length": 642.0357666015625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 2.5895061728395063, "grad_norm": 1.7089939387900324, "kl": 0.4912109375, "learning_rate": 2.482502411658376e-07, "loss": -0.0944, "num_tokens": 23195889.0, "reward": 0.5296692252159119, "reward_std": 0.1295604556798935, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4982436001300812, "rewards/logprob_reward/std": 0.34356120228767395, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 716.84375, "completions/mean_terminated_length": 685.0689697265625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.5925925925925926, "grad_norm": 1.5480474298383975, "kl": 0.504638671875, "learning_rate": 2.477503220656181e-07, "loss": -0.0116, "num_tokens": 23225096.0, "reward": 0.32407814264297485, "reward_std": 0.10831078886985779, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2663368582725525, "rewards/logprob_reward/std": 0.29278406500816345, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 738.625, "completions/mean_terminated_length": 658.719970703125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 2.5956790123456788, "grad_norm": 1.5352563256943508, "kl": 0.492919921875, "learning_rate": 2.472504119617742e-07, "loss": 0.0602, "num_tokens": 23255736.0, "reward": 0.47931939363479614, "reward_std": 0.2703985869884491, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4457715153694153, "rewards/logprob_reward/std": 0.3891335725784302, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 683.9375, "completions/mean_terminated_length": 661.2667236328125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.5987654320987654, "grad_norm": 1.8242447605980194, "kl": 0.534423828125, "learning_rate": 2.4675051285342716e-07, "loss": 0.0563, "num_tokens": 23283890.0, "reward": 0.45217660069465637, "reward_std": 0.1373641937971115, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4017239809036255, "rewards/logprob_reward/std": 0.38032811880111694, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 644.78125, "completions/mean_terminated_length": 632.54833984375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.601851851851852, "grad_norm": 1.5142130704717909, "kl": 0.521484375, "learning_rate": 2.462506267396543e-07, "loss": 0.0312, "num_tokens": 23310931.0, "reward": 0.3880419135093689, "reward_std": 0.032008636742830276, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3235188126564026, "rewards/logprob_reward/std": 0.2696700096130371, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 714.03125, "completions/mean_terminated_length": 669.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 2.6049382716049383, "grad_norm": 1.5843422454853517, "kl": 0.50732421875, "learning_rate": 2.45750755619481e-07, "loss": -0.0206, "num_tokens": 23340596.0, "reward": 0.36871445178985596, "reward_std": 0.1304899901151657, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3124604821205139, "rewards/logprob_reward/std": 0.326145201921463, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 642.0, "completions/mean_terminated_length": 616.5333862304688, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 2.6080246913580245, "grad_norm": 1.7468668761663477, "kl": 0.523681640625, "learning_rate": 2.452509014918728e-07, "loss": -0.054, "num_tokens": 23367200.0, "reward": 0.49903756380081177, "reward_std": 0.08660802990198135, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4537917673587799, "rewards/logprob_reward/std": 0.3420726954936981, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 661.0625, "completions/mean_terminated_length": 636.86669921875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 2.611111111111111, "grad_norm": 1.4492779331621453, "kl": 0.49169921875, "learning_rate": 2.4475106635572696e-07, "loss": -0.1075, "num_tokens": 23395114.0, "reward": 0.3111758828163147, "reward_std": 0.12752994894981384, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24852874875068665, "rewards/logprob_reward/std": 0.31336984038352966, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 713.125, "completions/mean_terminated_length": 713.125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 2.6141975308641974, "grad_norm": 1.7768380122946759, "kl": 0.53271484375, "learning_rate": 2.4425125220986503e-07, "loss": -0.1375, "num_tokens": 23425138.0, "reward": 0.2993221879005432, "reward_std": 0.0753876268863678, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.22841355204582214, "rewards/logprob_reward/std": 0.24445840716362, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 745.34375, "completions/mean_terminated_length": 716.5172119140625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.617283950617284, "grad_norm": 1.6390007965893845, "kl": 0.53759765625, "learning_rate": 2.437514610530246e-07, "loss": -0.0976, "num_tokens": 23455857.0, "reward": 0.4049127697944641, "reward_std": 0.12876400351524353, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35268083214759827, "rewards/logprob_reward/std": 0.346796452999115, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 695.21875, "completions/mean_terminated_length": 684.6128540039062, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 2.6203703703703702, "grad_norm": 1.6555782504766203, "kl": 0.528564453125, "learning_rate": 2.4325169488385137e-07, "loss": -0.019, "num_tokens": 23484784.0, "reward": 0.46633949875831604, "reward_std": 0.09534019976854324, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4139883518218994, "rewards/logprob_reward/std": 0.3945707678794861, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 693.40625, "completions/mean_terminated_length": 693.40625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 2.623456790123457, "grad_norm": 1.5914744654486705, "kl": 0.51953125, "learning_rate": 2.4275195570089083e-07, "loss": -0.1754, "num_tokens": 23513289.0, "reward": 0.15087765455245972, "reward_std": 0.0910133644938469, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.07736405730247498, "rewards/logprob_reward/std": 0.17590440809726715, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 705.71875, "completions/mean_terminated_length": 646.7777709960938, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.626543209876543, "grad_norm": 1.521846542776652, "kl": 0.5234375, "learning_rate": 2.42252245502581e-07, "loss": -0.0622, "num_tokens": 23542232.0, "reward": 0.3007517457008362, "reward_std": 0.08910328894853592, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24389082193374634, "rewards/logprob_reward/std": 0.3227921426296234, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 611.5, "completions/mean_terminated_length": 584.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 2.6296296296296298, "grad_norm": 1.6050489815794542, "kl": 0.5517578125, "learning_rate": 2.417525662872436e-07, "loss": -0.1494, "num_tokens": 23568364.0, "reward": 0.4339892268180847, "reward_std": 0.21152611076831818, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3884602189064026, "rewards/logprob_reward/std": 0.3285326361656189, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 808.25, "completions/mean_terminated_length": 747.8399658203125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.632716049382716, "grad_norm": 1.3763381213050085, "kl": 0.447509765625, "learning_rate": 2.412529200530767e-07, "loss": -0.0006, "num_tokens": 23601320.0, "reward": 0.283580482006073, "reward_std": 0.10905754566192627, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22481165826320648, "rewards/logprob_reward/std": 0.2478126436471939, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 653.0, "completions/mean_terminated_length": 614.6206665039062, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.6358024691358026, "grad_norm": 1.6142676230389215, "kl": 0.5478515625, "learning_rate": 2.407533087981463e-07, "loss": -0.0964, "num_tokens": 23628328.0, "reward": 0.3926837146282196, "reward_std": 0.10432277619838715, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33909302949905396, "rewards/logprob_reward/std": 0.373881995677948, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 767.5625, "completions/mean_terminated_length": 667.2174072265625, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.638888888888889, "grad_norm": 1.7634029805173885, "kl": 0.510009765625, "learning_rate": 2.4025373452037865e-07, "loss": -0.0347, "num_tokens": 23660242.0, "reward": 0.38286876678466797, "reward_std": 0.22306621074676514, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3455486595630646, "rewards/logprob_reward/std": 0.36604323983192444, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 742.375, "completions/mean_terminated_length": 713.2413940429688, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 2.6419753086419755, "grad_norm": 1.5245553486149792, "kl": 0.489013671875, "learning_rate": 2.3975419921755215e-07, "loss": -0.1216, "num_tokens": 23690750.0, "reward": 0.26703959703445435, "reward_std": 0.09657148271799088, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20643289387226105, "rewards/logprob_reward/std": 0.3106721341609955, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 679.34375, "completions/mean_terminated_length": 643.6896362304688, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 2.6450617283950617, "grad_norm": 1.709241615249849, "kl": 0.539794921875, "learning_rate": 2.3925470488728935e-07, "loss": 0.0329, "num_tokens": 23718677.0, "reward": 0.3068399429321289, "reward_std": 0.10089085251092911, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24023883044719696, "rewards/logprob_reward/std": 0.26413267850875854, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 688.21875, "completions/mean_terminated_length": 640.25, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 2.648148148148148, "grad_norm": 1.2221951238953301, "kl": 0.484130859375, "learning_rate": 2.3875525352704866e-07, "loss": -0.16, "num_tokens": 23747572.0, "reward": 0.2994483709335327, "reward_std": 0.12233484536409378, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23897042870521545, "rewards/logprob_reward/std": 0.2732946276664734, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 735.78125, "completions/mean_terminated_length": 716.5667114257812, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 2.6512345679012346, "grad_norm": 1.8528798461216835, "kl": 0.503662109375, "learning_rate": 2.38255847134117e-07, "loss": -0.1911, "num_tokens": 23777897.0, "reward": 0.32587072253227234, "reward_std": 0.07041546702384949, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.26832857728004456, "rewards/logprob_reward/std": 0.3230299651622772, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 686.625, "completions/mean_terminated_length": 651.72412109375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 2.6543209876543212, "grad_norm": 1.6614378804262724, "kl": 0.52880859375, "learning_rate": 2.3775648770560126e-07, "loss": -0.1658, "num_tokens": 23806505.0, "reward": 0.3948863744735718, "reward_std": 0.1995619833469391, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3554292917251587, "rewards/logprob_reward/std": 0.3642072081565857, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 686.46875, "completions/mean_terminated_length": 651.5516967773438, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 2.6574074074074074, "grad_norm": 1.612183459351502, "kl": 0.5078125, "learning_rate": 2.3725717723842066e-07, "loss": 0.0537, "num_tokens": 23835144.0, "reward": 0.3679739832878113, "reward_std": 0.08624902367591858, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3081655204296112, "rewards/logprob_reward/std": 0.3583782911300659, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 688.3125, "completions/mean_terminated_length": 653.586181640625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 2.6604938271604937, "grad_norm": 1.5857216550315936, "kl": 0.533447265625, "learning_rate": 2.3675791772929862e-07, "loss": -0.0756, "num_tokens": 23863246.0, "reward": 0.4511909782886505, "reward_std": 0.16443760693073273, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4041011333465576, "rewards/logprob_reward/std": 0.3173883557319641, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 659.40625, "completions/mean_terminated_length": 621.6896362304688, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.6635802469135803, "grad_norm": 1.4509807926247562, "kl": 0.54833984375, "learning_rate": 2.3625871117475466e-07, "loss": -0.016, "num_tokens": 23890479.0, "reward": 0.3708074986934662, "reward_std": 0.11877764761447906, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3147861063480377, "rewards/logprob_reward/std": 0.3424849808216095, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 757.28125, "completions/mean_terminated_length": 707.888916015625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.6666666666666665, "grad_norm": 1.582444106336958, "kl": 0.493896484375, "learning_rate": 2.357595595710967e-07, "loss": -0.007, "num_tokens": 23921700.0, "reward": 0.34564170241355896, "reward_std": 0.14804410934448242, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2937685549259186, "rewards/logprob_reward/std": 0.31999310851097107, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 706.0, "completions/mean_terminated_length": 673.1034545898438, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.669753086419753, "grad_norm": 1.5514191894115137, "kl": 0.502685546875, "learning_rate": 2.3526046491441277e-07, "loss": 0.0208, "num_tokens": 23950560.0, "reward": 0.3274352252483368, "reward_std": 0.09244364500045776, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26312246918678284, "rewards/logprob_reward/std": 0.2583106458187103, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 681.375, "completions/mean_terminated_length": 645.9310302734375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 2.6728395061728394, "grad_norm": 1.4006684596056505, "kl": 0.525634765625, "learning_rate": 2.3476142920056315e-07, "loss": -0.1007, "num_tokens": 23978836.0, "reward": 0.31745368242263794, "reward_std": 0.15540055930614471, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2589763402938843, "rewards/logprob_reward/std": 0.33760780096054077, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 772.4375, "completions/mean_terminated_length": 688.5833740234375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 2.675925925925926, "grad_norm": 1.3399987996544778, "kl": 0.492431640625, "learning_rate": 2.3426245442517254e-07, "loss": 0.0169, "num_tokens": 24009870.0, "reward": 0.33422189950942993, "reward_std": 0.1486816257238388, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.28802433609962463, "rewards/logprob_reward/std": 0.30593761801719666, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 703.9375, "completions/mean_terminated_length": 670.8275756835938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.6790123456790123, "grad_norm": 1.4272869218310225, "kl": 0.501953125, "learning_rate": 2.3376354258362185e-07, "loss": -0.053, "num_tokens": 24038428.0, "reward": 0.3577028214931488, "reward_std": 0.1934034526348114, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3071697950363159, "rewards/logprob_reward/std": 0.30327898263931274, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 735.71875, "completions/mean_terminated_length": 669.1923217773438, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 2.682098765432099, "grad_norm": 1.4437672896609954, "kl": 0.536865234375, "learning_rate": 2.3326469567104044e-07, "loss": -0.0478, "num_tokens": 24068083.0, "reward": 0.1920091062784195, "reward_std": 0.11673358082771301, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.13001011312007904, "rewards/logprob_reward/std": 0.22697754204273224, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 684.0, "completions/mean_terminated_length": 661.3333740234375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 2.685185185185185, "grad_norm": 1.5975546011988933, "kl": 0.566162109375, "learning_rate": 2.3276591568229787e-07, "loss": -0.0344, "num_tokens": 24096175.0, "reward": 0.39360639452934265, "reward_std": 0.08800595998764038, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33664602041244507, "rewards/logprob_reward/std": 0.29286283254623413, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 721.625, "completions/mean_terminated_length": 701.4666748046875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 2.6882716049382713, "grad_norm": 1.7571811088280482, "kl": 0.4912109375, "learning_rate": 2.3226720461199626e-07, "loss": 0.0126, "num_tokens": 24125819.0, "reward": 0.5135356783866882, "reward_std": 0.08225229382514954, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4664285182952881, "rewards/logprob_reward/std": 0.3693296015262604, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 721.78125, "completions/mean_terminated_length": 665.8148193359375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 2.691358024691358, "grad_norm": 1.6061338995978724, "kl": 0.51318359375, "learning_rate": 2.3176856445446187e-07, "loss": -0.0421, "num_tokens": 24155468.0, "reward": 0.36311429738998413, "reward_std": 0.18667030334472656, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.313182532787323, "rewards/logprob_reward/std": 0.35316529870033264, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 666.15625, "completions/mean_terminated_length": 654.6128540039062, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 2.6944444444444446, "grad_norm": 1.3938705327572636, "kl": 0.529052734375, "learning_rate": 2.3126999720373757e-07, "loss": -0.1308, "num_tokens": 24183345.0, "reward": 0.22371020913124084, "reward_std": 0.08425696194171906, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.1478724628686905, "rewards/logprob_reward/std": 0.2766367495059967, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 753.5, "completions/mean_terminated_length": 691.0769653320312, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.697530864197531, "grad_norm": 1.4619668703800979, "kl": 0.485107421875, "learning_rate": 2.3077150485357477e-07, "loss": -0.0884, "num_tokens": 24213949.0, "reward": 0.3547605872154236, "reward_std": 0.17821642756462097, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3073728680610657, "rewards/logprob_reward/std": 0.3114413321018219, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 680.09375, "completions/mean_terminated_length": 669.0, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.700617283950617, "grad_norm": 1.6674303347257244, "kl": 0.547119140625, "learning_rate": 2.3027308939742502e-07, "loss": 0.115, "num_tokens": 24242228.0, "reward": 0.3722037076950073, "reward_std": 0.058102577924728394, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3093929886817932, "rewards/logprob_reward/std": 0.2828347980976105, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 742.1875, "completions/mean_terminated_length": 663.2799682617188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.7037037037037037, "grad_norm": 1.5097245276821258, "kl": 0.471923828125, "learning_rate": 2.2977475282843266e-07, "loss": 0.1048, "num_tokens": 24272374.0, "reward": 0.31461119651794434, "reward_std": 0.12244805693626404, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2592902183532715, "rewards/logprob_reward/std": 0.29426249861717224, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 697.03125, "completions/mean_terminated_length": 636.4815063476562, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 2.7067901234567904, "grad_norm": 1.5790069650121372, "kl": 0.5712890625, "learning_rate": 2.292764971394265e-07, "loss": -0.0827, "num_tokens": 24301011.0, "reward": 0.2950444519519806, "reward_std": 0.15265390276908875, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.23754939436912537, "rewards/logprob_reward/std": 0.3079538643360138, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 682.0, "completions/mean_terminated_length": 659.2000122070312, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 2.7098765432098766, "grad_norm": 1.4871108788628502, "kl": 0.519287109375, "learning_rate": 2.2877832432291188e-07, "loss": -0.0392, "num_tokens": 24329215.0, "reward": 0.3071898818016052, "reward_std": 0.08153542876243591, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24062761664390564, "rewards/logprob_reward/std": 0.27667760848999023, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 734.65625, "completions/mean_terminated_length": 681.0740966796875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 2.712962962962963, "grad_norm": 1.552116056655969, "kl": 0.524169921875, "learning_rate": 2.2828023637106273e-07, "loss": -0.0731, "num_tokens": 24359840.0, "reward": 0.3740811347961426, "reward_std": 0.15812793374061584, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32884013652801514, "rewards/logprob_reward/std": 0.320843368768692, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 723.3125, "completions/mean_terminated_length": 653.923095703125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 2.7160493827160495, "grad_norm": 1.4149352594365225, "kl": 0.520263671875, "learning_rate": 2.2778223527571362e-07, "loss": -0.1317, "num_tokens": 24389286.0, "reward": 0.2960160970687866, "reward_std": 0.10719048976898193, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24210122227668762, "rewards/logprob_reward/std": 0.28255414962768555, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 656.78125, "completions/mean_terminated_length": 618.7930908203125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 2.7191358024691357, "grad_norm": 1.4456265429893493, "kl": 0.568603515625, "learning_rate": 2.2728432302835183e-07, "loss": -0.0925, "num_tokens": 24416895.0, "reward": 0.29274171590805054, "reward_std": 0.10805127769708633, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23151856660842896, "rewards/logprob_reward/std": 0.26377037167549133, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 647.28125, "completions/mean_terminated_length": 608.3103637695312, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 2.7222222222222223, "grad_norm": 1.5616213539363208, "kl": 0.545166015625, "learning_rate": 2.2678650162010937e-07, "loss": 0.0015, "num_tokens": 24443896.0, "reward": 0.43333643674850464, "reward_std": 0.11901573836803436, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.38426268100738525, "rewards/logprob_reward/std": 0.3301646113395691, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 735.0625, "completions/mean_terminated_length": 705.1724243164062, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 2.7253086419753085, "grad_norm": 1.6872864054468693, "kl": 0.5400390625, "learning_rate": 2.2628877304175472e-07, "loss": -0.2327, "num_tokens": 24473466.0, "reward": 0.575402021408081, "reward_std": 0.1618824154138565, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.545585572719574, "rewards/logprob_reward/std": 0.36551234126091003, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 646.9375, "completions/mean_terminated_length": 621.800048828125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 2.728395061728395, "grad_norm": 1.6603005850568686, "kl": 0.518798828125, "learning_rate": 2.2579113928368548e-07, "loss": 0.0116, "num_tokens": 24500644.0, "reward": 0.42678990960121155, "reward_std": 0.12911778688430786, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3769887685775757, "rewards/logprob_reward/std": 0.32073596119880676, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 697.96875, "completions/mean_terminated_length": 676.2333984375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 2.7314814814814814, "grad_norm": 1.5751026672256232, "kl": 0.562744140625, "learning_rate": 2.2529360233591997e-07, "loss": -0.0848, "num_tokens": 24529647.0, "reward": 0.5163202881813049, "reward_std": 0.07627381384372711, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.47299474477767944, "rewards/logprob_reward/std": 0.37878644466400146, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 715.21875, "completions/mean_terminated_length": 658.0370483398438, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 2.734567901234568, "grad_norm": 1.4381116242468504, "kl": 0.531005859375, "learning_rate": 2.2479616418808915e-07, "loss": -0.2219, "num_tokens": 24558910.0, "reward": 0.3112731873989105, "reward_std": 0.18168997764587402, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2659980058670044, "rewards/logprob_reward/std": 0.3052578866481781, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 765.40625, "completions/mean_terminated_length": 717.5184936523438, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 2.7376543209876543, "grad_norm": 1.4470702241135545, "kl": 0.503662109375, "learning_rate": 2.242988268294292e-07, "loss": 0.0002, "num_tokens": 24590739.0, "reward": 0.23044851422309875, "reward_std": 0.10234861075878143, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16577613353729248, "rewards/logprob_reward/std": 0.27769696712493896, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 571.21875, "completions/mean_terminated_length": 556.6129150390625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 2.7407407407407405, "grad_norm": 1.8918231929004143, "kl": 0.578369140625, "learning_rate": 2.23801592248773e-07, "loss": -0.0566, "num_tokens": 24615034.0, "reward": 0.25585615634918213, "reward_std": 0.08438281714916229, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.18706239759922028, "rewards/logprob_reward/std": 0.2883966863155365, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 710.3125, "completions/mean_terminated_length": 665.5, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.743827160493827, "grad_norm": 1.5015404273938993, "kl": 0.508056640625, "learning_rate": 2.2330446243454265e-07, "loss": -0.097, "num_tokens": 24644196.0, "reward": 0.38483208417892456, "reward_std": 0.22079211473464966, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3407856523990631, "rewards/logprob_reward/std": 0.3045453429222107, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 676.6875, "completions/mean_terminated_length": 640.7586059570312, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 2.746913580246914, "grad_norm": 1.40258178235494, "kl": 0.501220703125, "learning_rate": 2.228074393747412e-07, "loss": -0.1795, "num_tokens": 24672150.0, "reward": 0.3089723289012909, "reward_std": 0.20068970322608948, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2530248165130615, "rewards/logprob_reward/std": 0.32973575592041016, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 691.6875, "completions/mean_terminated_length": 669.5333862304688, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.75, "grad_norm": 1.4314674076227618, "kl": 0.496826171875, "learning_rate": 2.2231052505694458e-07, "loss": -0.1107, "num_tokens": 24700572.0, "reward": 0.2994413375854492, "reward_std": 0.1462399959564209, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23549038171768188, "rewards/logprob_reward/std": 0.29352444410324097, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 744.625, "completions/mean_terminated_length": 666.3999633789062, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 2.753086419753086, "grad_norm": 1.7094218865322846, "kl": 0.510009765625, "learning_rate": 2.2181372146829418e-07, "loss": -0.0327, "num_tokens": 24731008.0, "reward": 0.3822531998157501, "reward_std": 0.21729137003421783, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3448646664619446, "rewards/logprob_reward/std": 0.36942335963249207, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 671.96875, "completions/mean_terminated_length": 621.6785888671875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.756172839506173, "grad_norm": 1.6218360020146971, "kl": 0.580322265625, "learning_rate": 2.213170305954884e-07, "loss": -0.0643, "num_tokens": 24758687.0, "reward": 0.23415327072143555, "reward_std": 0.12488116323947906, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16989250481128693, "rewards/logprob_reward/std": 0.18617720901966095, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 666.75, "completions/mean_terminated_length": 629.7930908203125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 2.7592592592592595, "grad_norm": 1.6454450402358411, "kl": 0.548095703125, "learning_rate": 2.2082045442477497e-07, "loss": -0.0329, "num_tokens": 24786603.0, "reward": 0.28081366419792175, "reward_std": 0.1020376905798912, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.21479295194149017, "rewards/logprob_reward/std": 0.28671324253082275, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 682.0625, "completions/mean_terminated_length": 671.0322265625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 2.7623456790123457, "grad_norm": 1.606551106958639, "kl": 0.541748046875, "learning_rate": 2.2032399494194292e-07, "loss": -0.2197, "num_tokens": 24815045.0, "reward": 0.21605463325977325, "reward_std": 0.09603306651115417, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1463106870651245, "rewards/logprob_reward/std": 0.27116838097572327, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 719.40625, "completions/mean_terminated_length": 687.8965454101562, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 2.765432098765432, "grad_norm": 1.481790412612556, "kl": 0.501220703125, "learning_rate": 2.1982765413231466e-07, "loss": -0.2077, "num_tokens": 24844550.0, "reward": 0.35117456316947937, "reward_std": 0.116917684674263, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.29991620779037476, "rewards/logprob_reward/std": 0.31146955490112305, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 685.03125, "completions/mean_terminated_length": 649.9655151367188, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 2.7685185185185186, "grad_norm": 1.7536280409303573, "kl": 0.54443359375, "learning_rate": 2.1933143398073805e-07, "loss": -0.0958, "num_tokens": 24872695.0, "reward": 0.39891064167022705, "reward_std": 0.06956420093774796, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.34253960847854614, "rewards/logprob_reward/std": 0.28865060210227966, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 671.4375, "completions/mean_terminated_length": 621.0714721679688, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 2.771604938271605, "grad_norm": 1.662500525545735, "kl": 0.530517578125, "learning_rate": 2.1883533647157828e-07, "loss": -0.0583, "num_tokens": 24900365.0, "reward": 0.29016566276550293, "reward_std": 0.1365557461977005, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2390729784965515, "rewards/logprob_reward/std": 0.2963479459285736, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 698.6875, "completions/mean_terminated_length": 677.0000610351562, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.7746913580246915, "grad_norm": 1.398145614164361, "kl": 0.53173828125, "learning_rate": 2.1833936358871045e-07, "loss": -0.1185, "num_tokens": 24929199.0, "reward": 0.40480735898017883, "reward_std": 0.08783569931983948, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.352563738822937, "rewards/logprob_reward/std": 0.3398122787475586, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 689.78125, "completions/mean_terminated_length": 679.0, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 2.7777777777777777, "grad_norm": 1.6672241305631355, "kl": 0.58056640625, "learning_rate": 2.1784351731551077e-07, "loss": -0.0678, "num_tokens": 24957752.0, "reward": 0.41671183705329895, "reward_std": 0.06330046057701111, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3588464856147766, "rewards/logprob_reward/std": 0.3123563528060913, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 703.40625, "completions/mean_terminated_length": 644.0370483398438, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 2.7808641975308643, "grad_norm": 1.5753310714207458, "kl": 0.532470703125, "learning_rate": 2.1734779963484959e-07, "loss": -0.1425, "num_tokens": 24986509.0, "reward": 0.28323036432266235, "reward_std": 0.1265164613723755, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2313670814037323, "rewards/logprob_reward/std": 0.3221213221549988, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 701.21875, "completions/mean_terminated_length": 667.8275756835938, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.7839506172839505, "grad_norm": 1.4852727399367296, "kl": 0.55712890625, "learning_rate": 2.1685221252908282e-07, "loss": -0.0709, "num_tokens": 25016164.0, "reward": 0.20718300342559814, "reward_std": 0.12428092211484909, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.13645334541797638, "rewards/logprob_reward/std": 0.24541877210140228, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 679.34375, "completions/mean_terminated_length": 668.2257690429688, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 2.787037037037037, "grad_norm": 1.43059327549612, "kl": 0.5654296875, "learning_rate": 2.163567579800443e-07, "loss": -0.1126, "num_tokens": 25043967.0, "reward": 0.41186678409576416, "reward_std": 0.10857434570789337, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3604075610637665, "rewards/logprob_reward/std": 0.3019101619720459, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 700.09375, "completions/mean_terminated_length": 640.1111450195312, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 2.7901234567901234, "grad_norm": 1.5780216987278628, "kl": 0.511474609375, "learning_rate": 2.1586143796903775e-07, "loss": -0.1396, "num_tokens": 25072838.0, "reward": 0.3253273367881775, "reward_std": 0.083259716629982, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.27814146876335144, "rewards/logprob_reward/std": 0.3033048212528229, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 684.0, "completions/mean_terminated_length": 635.4285888671875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 2.7932098765432096, "grad_norm": 1.3675954182718006, "kl": 0.546630859375, "learning_rate": 2.1536625447682877e-07, "loss": -0.0705, "num_tokens": 25101234.0, "reward": 0.3946310877799988, "reward_std": 0.09432770311832428, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3447290062904358, "rewards/logprob_reward/std": 0.3518230617046356, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 660.5, "completions/mean_terminated_length": 636.2667236328125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 2.7962962962962963, "grad_norm": 1.7344729750483563, "kl": 0.5478515625, "learning_rate": 2.1487120948363713e-07, "loss": -0.0574, "num_tokens": 25129002.0, "reward": 0.37551459670066833, "reward_std": 0.11368820071220398, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3234884440898895, "rewards/logprob_reward/std": 0.3555936813354492, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 653.625, "completions/mean_terminated_length": 615.3103637695312, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 2.799382716049383, "grad_norm": 1.8086774739851437, "kl": 0.5810546875, "learning_rate": 2.1437630496912889e-07, "loss": -0.0828, "num_tokens": 25156686.0, "reward": 0.240574449300766, "reward_std": 0.12106387317180634, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.17008273303508759, "rewards/logprob_reward/std": 0.25281617045402527, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 649.84375, "completions/mean_terminated_length": 596.3928833007812, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 2.802469135802469, "grad_norm": 1.821289887713722, "kl": 0.564208984375, "learning_rate": 2.1388154291240794e-07, "loss": 0.0437, "num_tokens": 25184593.0, "reward": 0.24838173389434814, "reward_std": 0.11544694751501083, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.18222969770431519, "rewards/logprob_reward/std": 0.279908686876297, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 679.625, "completions/mean_terminated_length": 630.4285888671875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 2.8055555555555554, "grad_norm": 1.6733508341207823, "kl": 0.580322265625, "learning_rate": 2.133869252920089e-07, "loss": 0.0485, "num_tokens": 25213329.0, "reward": 0.2744399905204773, "reward_std": 0.12470673024654388, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2146555483341217, "rewards/logprob_reward/std": 0.2547256648540497, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 728.28125, "completions/mean_terminated_length": 697.6896362304688, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 2.808641975308642, "grad_norm": 1.4657067270457174, "kl": 0.5244140625, "learning_rate": 2.128924540858885e-07, "loss": -0.1544, "num_tokens": 25243274.0, "reward": 0.24034538865089417, "reward_std": 0.14069397747516632, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.18024487793445587, "rewards/logprob_reward/std": 0.2866882383823395, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 707.875, "completions/mean_terminated_length": 634.923095703125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 2.8117283950617287, "grad_norm": 1.531128245472277, "kl": 0.466552734375, "learning_rate": 2.1239813127141828e-07, "loss": -0.0049, "num_tokens": 25272486.0, "reward": 0.29680871963500977, "reward_std": 0.13572663068771362, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.246454119682312, "rewards/logprob_reward/std": 0.3111914098262787, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 668.28125, "completions/mean_terminated_length": 617.4642944335938, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.814814814814815, "grad_norm": 1.6036778250878838, "kl": 0.588623046875, "learning_rate": 2.1190395882537598e-07, "loss": -0.0896, "num_tokens": 25299943.0, "reward": 0.4475705027580261, "reward_std": 0.10701223462820053, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.40702277421951294, "rewards/logprob_reward/std": 0.3959193825721741, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 614.4375, "completions/mean_terminated_length": 614.4375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 2.817901234567901, "grad_norm": 1.6515574886533446, "kl": 0.565673828125, "learning_rate": 2.1140993872393833e-07, "loss": -0.1664, "num_tokens": 25325697.0, "reward": 0.4288904666900635, "reward_std": 0.08945980668067932, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37585052847862244, "rewards/logprob_reward/std": 0.28203195333480835, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 704.59375, "completions/mean_terminated_length": 658.9642944335938, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 2.8209876543209877, "grad_norm": 1.3610814642602063, "kl": 0.538818359375, "learning_rate": 2.1091607294267269e-07, "loss": -0.2777, "num_tokens": 25354552.0, "reward": 0.24434241652488708, "reward_std": 0.17712630331516266, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.19510269165039062, "rewards/logprob_reward/std": 0.2911088466644287, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 622.5625, "completions/mean_terminated_length": 609.6129150390625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 2.824074074074074, "grad_norm": 1.5316433535634861, "kl": 0.57666015625, "learning_rate": 2.1042236345652947e-07, "loss": -0.104, "num_tokens": 25380642.0, "reward": 0.371913880109787, "reward_std": 0.10755741596221924, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.31948769092559814, "rewards/logprob_reward/std": 0.3158982992172241, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 665.3125, "completions/mean_terminated_length": 598.888916015625, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.8271604938271606, "grad_norm": 1.6213024363695634, "kl": 0.6240234375, "learning_rate": 2.0992881223983368e-07, "loss": -0.2015, "num_tokens": 25408432.0, "reward": 0.24656179547309875, "reward_std": 0.20851096510887146, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.19756866991519928, "rewards/logprob_reward/std": 0.26970499753952026, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 669.40625, "completions/mean_terminated_length": 669.40625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 2.830246913580247, "grad_norm": 1.5365440120962215, "kl": 0.56103515625, "learning_rate": 2.0943542126627784e-07, "loss": -0.1107, "num_tokens": 25436441.0, "reward": 0.34765151143074036, "reward_std": 0.05223237723112106, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.28558504581451416, "rewards/logprob_reward/std": 0.31305256485939026, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 633.65625, "completions/mean_terminated_length": 621.0645141601562, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.8333333333333335, "grad_norm": 1.6939928722533846, "kl": 0.505859375, "learning_rate": 2.0894219250891352e-07, "loss": 0.0828, "num_tokens": 25462930.0, "reward": 0.4255083203315735, "reward_std": 0.09522762894630432, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3755648136138916, "rewards/logprob_reward/std": 0.34028786420822144, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 716.25, "completions/mean_terminated_length": 672.2857666015625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 2.8364197530864197, "grad_norm": 1.9430513215802294, "kl": 0.557861328125, "learning_rate": 2.0844912794014341e-07, "loss": -0.1328, "num_tokens": 25493150.0, "reward": 0.3436409831047058, "reward_std": 0.1632624715566635, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2984899878501892, "rewards/logprob_reward/std": 0.34502363204956055, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 704.8125, "completions/mean_terminated_length": 683.5333862304688, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 2.8395061728395063, "grad_norm": 1.4432664327648808, "kl": 0.536865234375, "learning_rate": 2.079562295317139e-07, "loss": -0.0149, "num_tokens": 25522332.0, "reward": 0.48003309965133667, "reward_std": 0.16230663657188416, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.43267565965652466, "rewards/logprob_reward/std": 0.3527522385120392, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 623.0, "completions/mean_terminated_length": 596.2667236328125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 2.8425925925925926, "grad_norm": 1.6077066075793758, "kl": 0.56201171875, "learning_rate": 2.0746349925470672e-07, "loss": -0.0601, "num_tokens": 25548120.0, "reward": 0.40079861879348755, "reward_std": 0.10493972897529602, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.34810957312583923, "rewards/logprob_reward/std": 0.315352201461792, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 649.0625, "completions/mean_terminated_length": 636.9677124023438, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 2.8456790123456788, "grad_norm": 1.4929573823334257, "kl": 0.584228515625, "learning_rate": 2.0697093907953134e-07, "loss": -0.0718, "num_tokens": 25575210.0, "reward": 0.44139695167541504, "reward_std": 0.08251407742500305, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.38974660634994507, "rewards/logprob_reward/std": 0.31339970231056213, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 678.5, "completions/mean_terminated_length": 667.3547973632812, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 2.8487654320987654, "grad_norm": 1.7834825069554983, "kl": 0.546630859375, "learning_rate": 2.0647855097591704e-07, "loss": -0.1594, "num_tokens": 25603338.0, "reward": 0.4349474310874939, "reward_std": 0.1167830377817154, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.392997145652771, "rewards/logprob_reward/std": 0.3518584668636322, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 681.34375, "completions/mean_terminated_length": 632.3928833007812, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.851851851851852, "grad_norm": 1.5120232171303583, "kl": 0.540283203125, "learning_rate": 2.0598633691290485e-07, "loss": -0.0916, "num_tokens": 25631285.0, "reward": 0.37244874238967896, "reward_std": 0.04259895533323288, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3200819492340088, "rewards/logprob_reward/std": 0.3059520423412323, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 674.84375, "completions/mean_terminated_length": 651.5667114257812, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.8549382716049383, "grad_norm": 1.4363333413990096, "kl": 0.542724609375, "learning_rate": 2.054942988588399e-07, "loss": -0.0975, "num_tokens": 25659368.0, "reward": 0.415985107421875, "reward_std": 0.07638801634311676, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.364983469247818, "rewards/logprob_reward/std": 0.3305024206638336, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 649.40625, "completions/mean_terminated_length": 624.433349609375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 2.8580246913580245, "grad_norm": 1.622804819483048, "kl": 0.53369140625, "learning_rate": 2.050024387813634e-07, "loss": 0.0221, "num_tokens": 25686549.0, "reward": 0.4467061758041382, "reward_std": 0.035639092326164246, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.39217352867126465, "rewards/logprob_reward/std": 0.27315375208854675, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 662.3125, "completions/mean_terminated_length": 638.2000122070312, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 2.861111111111111, "grad_norm": 1.6497047136991174, "kl": 0.569091796875, "learning_rate": 2.0451075864740496e-07, "loss": -0.1614, "num_tokens": 25714367.0, "reward": 0.35596078634262085, "reward_std": 0.08901970088481903, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3017619848251343, "rewards/logprob_reward/std": 0.323722779750824, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 737.03125, "completions/mean_terminated_length": 727.774169921875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 2.8641975308641974, "grad_norm": 1.652677926129577, "kl": 0.5625, "learning_rate": 2.0401926042317455e-07, "loss": 0.0547, "num_tokens": 25744412.0, "reward": 0.33488377928733826, "reward_std": 0.03420073539018631, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.267926424741745, "rewards/logprob_reward/std": 0.2911895513534546, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 667.5625, "completions/mean_terminated_length": 616.6428833007812, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 2.867283950617284, "grad_norm": 1.9698089097286988, "kl": 0.578125, "learning_rate": 2.0352794607415465e-07, "loss": -0.2448, "num_tokens": 25772206.0, "reward": 0.44320449233055115, "reward_std": 0.15220457315444946, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.40564388036727905, "rewards/logprob_reward/std": 0.38470038771629333, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 677.21875, "completions/mean_terminated_length": 654.1000366210938, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 2.8703703703703702, "grad_norm": 1.260991175410126, "kl": 0.545166015625, "learning_rate": 2.0303681756509254e-07, "loss": -0.1063, "num_tokens": 25800765.0, "reward": 0.3645320534706116, "reward_std": 0.12430260330438614, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.304341197013855, "rewards/logprob_reward/std": 0.3025013208389282, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 715.4375, "completions/mean_terminated_length": 671.357177734375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 2.873456790123457, "grad_norm": 2.4746507907418915, "kl": 0.522216796875, "learning_rate": 2.0254587685999215e-07, "loss": -0.1303, "num_tokens": 25831223.0, "reward": 0.276328980922699, "reward_std": 0.12109735608100891, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.22369886934757233, "rewards/logprob_reward/std": 0.2935755252838135, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 688.3125, "completions/mean_terminated_length": 677.4838256835938, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.876543209876543, "grad_norm": 1.6160052079098277, "kl": 0.520751953125, "learning_rate": 2.020551259221066e-07, "loss": -0.072, "num_tokens": 25859841.0, "reward": 0.3427347242832184, "reward_std": 0.0782490074634552, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.28012189269065857, "rewards/logprob_reward/std": 0.33007580041885376, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 691.75, "completions/mean_terminated_length": 657.3793334960938, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 2.8796296296296298, "grad_norm": 1.5664096612396379, "kl": 0.5234375, "learning_rate": 2.0156456671392988e-07, "loss": -0.2349, "num_tokens": 25888417.0, "reward": 0.22212550044059753, "reward_std": 0.03943982347846031, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.15305611491203308, "rewards/logprob_reward/std": 0.30909502506256104, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 722.6875, "completions/mean_terminated_length": 702.6000366210938, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 2.882716049382716, "grad_norm": 1.429641970874162, "kl": 0.531005859375, "learning_rate": 2.010742011971895e-07, "loss": -0.0314, "num_tokens": 25918219.0, "reward": 0.3972403407096863, "reward_std": 0.10206529498100281, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3406836986541748, "rewards/logprob_reward/std": 0.3159555494785309, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 628.53125, "completions/mean_terminated_length": 572.0357666015625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 2.8858024691358026, "grad_norm": 1.782881930212918, "kl": 0.56640625, "learning_rate": 2.005840313328383e-07, "loss": -0.1062, "num_tokens": 25944916.0, "reward": 0.3662363290786743, "reward_std": 0.1922411024570465, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32012367248535156, "rewards/logprob_reward/std": 0.3413483500480652, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 683.09375, "completions/mean_terminated_length": 672.0967407226562, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 2.888888888888889, "grad_norm": 1.4567715503477405, "kl": 0.5859375, "learning_rate": 2.0009405908104673e-07, "loss": -0.1334, "num_tokens": 25973619.0, "reward": 0.4423273801803589, "reward_std": 0.14750578999519348, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3907804489135742, "rewards/logprob_reward/std": 0.3292495906352997, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 788.125, "completions/mean_terminated_length": 754.4285888671875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 2.8919753086419755, "grad_norm": 1.6108090251361369, "kl": 0.48486328125, "learning_rate": 1.996042864011951e-07, "loss": 0.0408, "num_tokens": 26005359.0, "reward": 0.2998473644256592, "reward_std": 0.13396574556827545, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24288594722747803, "rewards/logprob_reward/std": 0.31482431292533875, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 696.125, "completions/mean_terminated_length": 674.2667236328125, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.8950617283950617, "grad_norm": 1.6757735045627513, "kl": 0.54736328125, "learning_rate": 1.9911471525186534e-07, "loss": -0.0474, "num_tokens": 26034395.0, "reward": 0.5049779415130615, "reward_std": 0.09324929118156433, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.45691996812820435, "rewards/logprob_reward/std": 0.32111796736717224, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 593.875, "completions/mean_terminated_length": 593.875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.898148148148148, "grad_norm": 1.2743655993505425, "kl": 0.5478515625, "learning_rate": 1.9862534759083379e-07, "loss": -0.3399, "num_tokens": 26059607.0, "reward": 0.39094269275665283, "reward_std": 0.21514630317687988, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.34410300850868225, "rewards/logprob_reward/std": 0.3634238541126251, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 665.3125, "completions/mean_terminated_length": 641.4000244140625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 2.9012345679012346, "grad_norm": 1.4704816161112313, "kl": 0.58984375, "learning_rate": 1.9813618537506302e-07, "loss": -0.2582, "num_tokens": 26087793.0, "reward": 0.27486997842788696, "reward_std": 0.12265075743198395, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.21166110038757324, "rewards/logprob_reward/std": 0.2908506691455841, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 746.375, "completions/mean_terminated_length": 668.6400146484375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 2.9043209876543212, "grad_norm": 1.151733301207424, "kl": 0.514404296875, "learning_rate": 1.9764723056069365e-07, "loss": -0.107, "num_tokens": 26118325.0, "reward": 0.31674396991729736, "reward_std": 0.20026658475399017, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.26860445737838745, "rewards/logprob_reward/std": 0.339495986700058, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 687.75, "completions/mean_terminated_length": 665.3333740234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 2.9074074074074074, "grad_norm": 1.5496855773087328, "kl": 0.569091796875, "learning_rate": 1.9715848510303739e-07, "loss": -0.0299, "num_tokens": 26146729.0, "reward": 0.4302632212638855, "reward_std": 0.134810209274292, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3773757815361023, "rewards/logprob_reward/std": 0.3490449786186218, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 686.96875, "completions/mean_terminated_length": 638.8214721679688, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 2.9104938271604937, "grad_norm": 1.5821457667243193, "kl": 0.568359375, "learning_rate": 1.966699509565685e-07, "loss": -0.1237, "num_tokens": 26175452.0, "reward": 0.3913611173629761, "reward_std": 0.14328306913375854, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3445678949356079, "rewards/logprob_reward/std": 0.3638380467891693, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 687.0, "completions/mean_terminated_length": 609.2307739257812, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 2.9135802469135803, "grad_norm": 1.5942602041075766, "kl": 0.522216796875, "learning_rate": 1.961816300749163e-07, "loss": -0.1198, "num_tokens": 26204180.0, "reward": 0.2621680200099945, "reward_std": 0.18607807159423828, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.2183811217546463, "rewards/logprob_reward/std": 0.29576343297958374, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 689.375, "completions/mean_terminated_length": 641.5714721679688, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.9166666666666665, "grad_norm": 1.4164188725237459, "kl": 0.526611328125, "learning_rate": 1.9569352441085712e-07, "loss": -0.0862, "num_tokens": 26232988.0, "reward": 0.4700321555137634, "reward_std": 0.22900857031345367, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.43545234203338623, "rewards/logprob_reward/std": 0.3299541473388672, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 668.125, "completions/mean_terminated_length": 631.3103637695312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.919753086419753, "grad_norm": 1.495688482679631, "kl": 0.5341796875, "learning_rate": 1.9520563591630686e-07, "loss": 0.0342, "num_tokens": 26260704.0, "reward": 0.28929102420806885, "reward_std": 0.0657438188791275, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22421222925186157, "rewards/logprob_reward/std": 0.27346357703208923, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 637.96875, "completions/mean_terminated_length": 637.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 2.9228395061728394, "grad_norm": 1.6018358377763926, "kl": 0.577392578125, "learning_rate": 1.9471796654231278e-07, "loss": -0.2848, "num_tokens": 26287635.0, "reward": 0.5075594186782837, "reward_std": 0.03790691867470741, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4632605016231537, "rewards/logprob_reward/std": 0.3423454761505127, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 665.3125, "completions/mean_terminated_length": 628.2069091796875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 2.925925925925926, "grad_norm": 1.7443365902439396, "kl": 0.576171875, "learning_rate": 1.9423051823904602e-07, "loss": -0.2446, "num_tokens": 26314989.0, "reward": 0.4088453948497772, "reward_std": 0.17302465438842773, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.36746710538864136, "rewards/logprob_reward/std": 0.3469705879688263, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 702.78125, "completions/mean_terminated_length": 628.6538696289062, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 2.9290123456790123, "grad_norm": 1.7688041236415328, "kl": 0.562744140625, "learning_rate": 1.9374329295579372e-07, "loss": -0.2694, "num_tokens": 26344030.0, "reward": 0.29297879338264465, "reward_std": 0.21942168474197388, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.2560875415802002, "rewards/logprob_reward/std": 0.29844433069229126, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 739.1875, "completions/mean_terminated_length": 673.4615478515625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.932098765432099, "grad_norm": 1.6423791729456205, "kl": 0.596435546875, "learning_rate": 1.9325629264095083e-07, "loss": 0.0696, "num_tokens": 26374344.0, "reward": 0.39348822832107544, "reward_std": 0.17985129356384277, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3504036068916321, "rewards/logprob_reward/std": 0.3036915063858032, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 705.96875, "completions/mean_terminated_length": 660.5357666015625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 2.935185185185185, "grad_norm": 1.5995655325303542, "kl": 0.5146484375, "learning_rate": 1.9276951924201304e-07, "loss": -0.0305, "num_tokens": 26403283.0, "reward": 0.385848730802536, "reward_std": 0.10148420184850693, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3384430408477783, "rewards/logprob_reward/std": 0.3312740921974182, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 689.03125, "completions/mean_terminated_length": 666.7000122070312, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 2.9382716049382713, "grad_norm": 1.7018647165507121, "kl": 0.539794921875, "learning_rate": 1.922829747055684e-07, "loss": -0.1179, "num_tokens": 26432336.0, "reward": 0.43825143575668335, "reward_std": 0.10147584229707718, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.38972383737564087, "rewards/logprob_reward/std": 0.3458053469657898, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 698.40625, "completions/mean_terminated_length": 651.8928833007812, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 2.941358024691358, "grad_norm": 1.5537639223454536, "kl": 0.561279296875, "learning_rate": 1.9179666097728982e-07, "loss": -0.0143, "num_tokens": 26460885.0, "reward": 0.29113829135894775, "reward_std": 0.1937790811061859, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.23320919275283813, "rewards/logprob_reward/std": 0.261476993560791, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 697.03125, "completions/mean_terminated_length": 663.2069091796875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 2.9444444444444446, "grad_norm": 1.4938059728280608, "kl": 0.541015625, "learning_rate": 1.9131058000192726e-07, "loss": -0.06, "num_tokens": 26489798.0, "reward": 0.2748223543167114, "reward_std": 0.12111122906208038, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2116081863641739, "rewards/logprob_reward/std": 0.26506808400154114, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 625.96875, "completions/mean_terminated_length": 625.96875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 2.947530864197531, "grad_norm": 1.5762532853019957, "kl": 0.54541015625, "learning_rate": 1.9082473372329983e-07, "loss": -0.1029, "num_tokens": 26516229.0, "reward": 0.4432995617389679, "reward_std": 0.04204420745372772, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.38838836550712585, "rewards/logprob_reward/std": 0.3466719686985016, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 671.1875, "completions/mean_terminated_length": 647.6666870117188, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 2.950617283950617, "grad_norm": 1.7133910487430075, "kl": 0.58984375, "learning_rate": 1.903391240842882e-07, "loss": -0.0363, "num_tokens": 26544019.0, "reward": 0.4687328636646271, "reward_std": 0.11553002148866653, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.42359209060668945, "rewards/logprob_reward/std": 0.345115602016449, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 634.625, "completions/mean_terminated_length": 608.6666870117188, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 2.9537037037037037, "grad_norm": 1.5349993520794352, "kl": 0.641845703125, "learning_rate": 1.8985375302682654e-07, "loss": 0.0362, "num_tokens": 26570675.0, "reward": 0.25780171155929565, "reward_std": 0.040698058903217316, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.18575191497802734, "rewards/logprob_reward/std": 0.30341964960098267, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 737.375, "completions/mean_terminated_length": 684.2963256835938, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 2.9567901234567904, "grad_norm": 1.4815202766597788, "kl": 0.53369140625, "learning_rate": 1.8936862249189515e-07, "loss": -0.1049, "num_tokens": 26601579.0, "reward": 0.2918394207954407, "reward_std": 0.09104181826114655, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.24093273282051086, "rewards/logprob_reward/std": 0.30104365944862366, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 666.9375, "completions/mean_terminated_length": 643.1333618164062, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 2.9598765432098766, "grad_norm": 1.4076694283991262, "kl": 0.513671875, "learning_rate": 1.8888373441951228e-07, "loss": -0.1788, "num_tokens": 26630241.0, "reward": 0.30204713344573975, "reward_std": 0.06955822557210922, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24185794591903687, "rewards/logprob_reward/std": 0.30637601017951965, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 668.2667236328125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 2.962962962962963, "grad_norm": 1.4735013228347704, "kl": 0.5712890625, "learning_rate": 1.8839909074872675e-07, "loss": -0.2096, "num_tokens": 26659077.0, "reward": 0.3275967240333557, "reward_std": 0.14218734204769135, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26677411794662476, "rewards/logprob_reward/std": 0.28722596168518066, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 736.6875, "completions/mean_terminated_length": 717.5333862304688, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 2.9660493827160495, "grad_norm": 1.5080728853295657, "kl": 0.55126953125, "learning_rate": 1.8791469341761e-07, "loss": -0.125, "num_tokens": 26689663.0, "reward": 0.27991896867752075, "reward_std": 0.06829875707626343, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.21379885077476501, "rewards/logprob_reward/std": 0.30514439940452576, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 636.75, "completions/mean_terminated_length": 624.258056640625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 2.9691358024691357, "grad_norm": 1.4518998824316316, "kl": 0.5615234375, "learning_rate": 1.8743054436324835e-07, "loss": -0.1274, "num_tokens": 26716655.0, "reward": 0.35308194160461426, "reward_std": 0.0876440703868866, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.29161882400512695, "rewards/logprob_reward/std": 0.35856080055236816, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 656.78125, "completions/mean_terminated_length": 604.3214721679688, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 2.9722222222222223, "grad_norm": 1.4991214039394727, "kl": 0.5634765625, "learning_rate": 1.8694664552173529e-07, "loss": -0.1049, "num_tokens": 26744224.0, "reward": 0.3103574812412262, "reward_std": 0.13744746148586273, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2545638680458069, "rewards/logprob_reward/std": 0.3364384174346924, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 630.15625, "completions/mean_terminated_length": 603.9000244140625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 2.9753086419753085, "grad_norm": 1.5240867838485324, "kl": 0.53466796875, "learning_rate": 1.8646299882816358e-07, "loss": -0.1374, "num_tokens": 26770661.0, "reward": 0.32643377780914307, "reward_std": 0.13091662526130676, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27242642641067505, "rewards/logprob_reward/std": 0.3071509003639221, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 663.78125, "completions/mean_terminated_length": 612.3214721679688, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 2.978395061728395, "grad_norm": 1.5749316319692863, "kl": 0.61865234375, "learning_rate": 1.859796062166178e-07, "loss": -0.0651, "num_tokens": 26798254.0, "reward": 0.20578058063983917, "reward_std": 0.03298487886786461, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.13836731016635895, "rewards/logprob_reward/std": 0.31791529059410095, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 614.21875, "completions/mean_terminated_length": 601.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 2.9814814814814814, "grad_norm": 1.6508221599948942, "kl": 0.56982421875, "learning_rate": 1.854964696201666e-07, "loss": -0.0509, "num_tokens": 26824545.0, "reward": 0.48562106490135193, "reward_std": 0.08756967633962631, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.43541228771209717, "rewards/logprob_reward/std": 0.29305925965309143, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 638.09375, "completions/mean_terminated_length": 625.6451416015625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 2.984567901234568, "grad_norm": 1.8821892421833037, "kl": 0.5908203125, "learning_rate": 1.850135909708544e-07, "loss": -0.2115, "num_tokens": 26851228.0, "reward": 0.37982916831970215, "reward_std": 0.1631961464881897, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.331754595041275, "rewards/logprob_reward/std": 0.35719648003578186, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 662.15625, "completions/mean_terminated_length": 638.0333862304688, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 2.9876543209876543, "grad_norm": 1.3996598470658976, "kl": 0.544189453125, "learning_rate": 1.8453097219969448e-07, "loss": -0.1461, "num_tokens": 26878585.0, "reward": 0.37253403663635254, "reward_std": 0.1425914764404297, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32712116837501526, "rewards/logprob_reward/std": 0.3639918863773346, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 761.96875, "completions/mean_terminated_length": 674.625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 2.9907407407407405, "grad_norm": 1.7662653344139252, "kl": 0.569580078125, "learning_rate": 1.8404861523666073e-07, "loss": -0.2354, "num_tokens": 26909568.0, "reward": 0.22759459912776947, "reward_std": 0.155604287981987, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.17996624112129211, "rewards/logprob_reward/std": 0.24779672920703888, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 637.86669921875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 2.993827160493827, "grad_norm": 1.4700976661506013, "kl": 0.580322265625, "learning_rate": 1.8356652201068024e-07, "loss": -0.1329, "num_tokens": 26936992.0, "reward": 0.34675121307373047, "reward_std": 0.08430974185466766, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.29152911901474, "rewards/logprob_reward/std": 0.35307955741882324, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 663.8125, "completions/mean_terminated_length": 663.8125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 2.996913580246914, "grad_norm": 1.5273358493223752, "kl": 0.5546875, "learning_rate": 1.830846944496251e-07, "loss": -0.1564, "num_tokens": 26964574.0, "reward": 0.3864748477935791, "reward_std": 0.05212653428316116, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3252498507499695, "rewards/logprob_reward/std": 0.26678892970085144, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 672.5, "completions/mean_terminated_length": 591.3846435546875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 3.0, "grad_norm": 1.7190593991775363, "kl": 0.58837890625, "learning_rate": 1.826031344803053e-07, "loss": 0.0105, "num_tokens": 26992930.0, "reward": 0.32739004492759705, "reward_std": 0.1879374384880066, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2804333567619324, "rewards/logprob_reward/std": 0.3431173861026764, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 576.1875, "completions/mean_terminated_length": 561.741943359375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.003086419753086, "grad_norm": 1.8007974784736491, "kl": 0.70654296875, "learning_rate": 1.8212184402846064e-07, "loss": -0.0253, "num_tokens": 27017308.0, "reward": 0.437126100063324, "reward_std": 0.030505899339914322, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.38152897357940674, "rewards/logprob_reward/std": 0.30791381001472473, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 618.78125, "completions/mean_terminated_length": 605.7096557617188, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 3.006172839506173, "grad_norm": 1.4536119968478476, "kl": 0.616943359375, "learning_rate": 1.8164082501875326e-07, "loss": -0.0728, "num_tokens": 27043621.0, "reward": 0.4011768698692322, "reward_std": 0.04011835902929306, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3415854275226593, "rewards/logprob_reward/std": 0.3123927712440491, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 628.78125, "completions/mean_terminated_length": 616.0322265625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 3.009259259259259, "grad_norm": 1.635832454331608, "kl": 0.574462890625, "learning_rate": 1.8116007937475947e-07, "loss": -0.046, "num_tokens": 27069946.0, "reward": 0.28343909978866577, "reward_std": 0.054336000233888626, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.21076563000679016, "rewards/logprob_reward/std": 0.29697850346565247, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 675.375, "completions/mean_terminated_length": 610.8148193359375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 3.0123456790123457, "grad_norm": 1.7420905779991354, "kl": 0.605224609375, "learning_rate": 1.8067960901896278e-07, "loss": -0.1526, "num_tokens": 27098342.0, "reward": 0.2871285676956177, "reward_std": 0.19570910930633545, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.235698401927948, "rewards/logprob_reward/std": 0.2926475405693054, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 670.0, "completions/mean_terminated_length": 633.3793334960938, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.015432098765432, "grad_norm": 1.7910592672657164, "kl": 0.552490234375, "learning_rate": 1.8019941587274565e-07, "loss": -0.2155, "num_tokens": 27126722.0, "reward": 0.3630959391593933, "reward_std": 0.18874549865722656, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3096899390220642, "rewards/logprob_reward/std": 0.3312024772167206, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 663.375, "completions/mean_terminated_length": 639.3333740234375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 3.0185185185185186, "grad_norm": 1.4762846570663113, "kl": 0.58056640625, "learning_rate": 1.7971950185638195e-07, "loss": -0.1006, "num_tokens": 27154482.0, "reward": 0.4070167541503906, "reward_std": 0.20057132840156555, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.35849082469940186, "rewards/logprob_reward/std": 0.33493053913116455, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 643.375, "completions/mean_terminated_length": 604.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 3.021604938271605, "grad_norm": 1.8639694098210047, "kl": 0.6162109375, "learning_rate": 1.7923986888902948e-07, "loss": -0.1186, "num_tokens": 27181522.0, "reward": 0.31005948781967163, "reward_std": 0.11657600849866867, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2472882866859436, "rewards/logprob_reward/std": 0.2969168722629547, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 648.0625, "completions/mean_terminated_length": 578.4444580078125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 3.0246913580246915, "grad_norm": 1.4758160876678734, "kl": 0.54833984375, "learning_rate": 1.78760518888722e-07, "loss": -0.1161, "num_tokens": 27209096.0, "reward": 0.30930382013320923, "reward_std": 0.12384960055351257, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.25686538219451904, "rewards/logprob_reward/std": 0.3516753315925598, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 714.78125, "completions/mean_terminated_length": 682.7930908203125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 3.0277777777777777, "grad_norm": 1.6373842024898082, "kl": 0.55126953125, "learning_rate": 1.782814537723617e-07, "loss": -0.1619, "num_tokens": 27238657.0, "reward": 0.4429128170013428, "reward_std": 0.1525229513645172, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.40531978011131287, "rewards/logprob_reward/std": 0.3112739622592926, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 625.59375, "completions/mean_terminated_length": 599.0333862304688, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 3.0308641975308643, "grad_norm": 1.53838131398423, "kl": 0.544677734375, "learning_rate": 1.7780267545571175e-07, "loss": -0.055, "num_tokens": 27265388.0, "reward": 0.44353002309799194, "reward_std": 0.1688355952501297, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3990611433982849, "rewards/logprob_reward/std": 0.3466712534427643, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 612.21875, "completions/mean_terminated_length": 584.7667236328125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 3.0339506172839505, "grad_norm": 1.7568816473037987, "kl": 0.56201171875, "learning_rate": 1.7732418585338804e-07, "loss": -0.0575, "num_tokens": 27291315.0, "reward": 0.34485626220703125, "reward_std": 0.0738648846745491, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2928958535194397, "rewards/logprob_reward/std": 0.28815993666648865, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 634.6875, "completions/mean_terminated_length": 634.6875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 3.037037037037037, "grad_norm": 1.5790878317452577, "kl": 0.569091796875, "learning_rate": 1.7684598687885216e-07, "loss": -0.1394, "num_tokens": 27318121.0, "reward": 0.5584650635719299, "reward_std": 0.12323564291000366, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5198222994804382, "rewards/logprob_reward/std": 0.32569146156311035, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 693.8125, "completions/mean_terminated_length": 659.6551513671875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 3.0401234567901234, "grad_norm": 1.6815796193210615, "kl": 0.511962890625, "learning_rate": 1.7636808044440344e-07, "loss": 0.0109, "num_tokens": 27346815.0, "reward": 0.36254405975341797, "reward_std": 0.1329592764377594, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3056045472621918, "rewards/logprob_reward/std": 0.2727596163749695, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 624.25, "completions/mean_terminated_length": 582.8965454101562, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 3.04320987654321, "grad_norm": 1.5801146209838457, "kl": 0.56494140625, "learning_rate": 1.7589046846117132e-07, "loss": 0.0056, "num_tokens": 27373227.0, "reward": 0.3373718857765198, "reward_std": 0.10681183636188507, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2776354253292084, "rewards/logprob_reward/std": 0.33779376745224, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 677.59375, "completions/mean_terminated_length": 628.107177734375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 3.0462962962962963, "grad_norm": 1.5627283270458023, "kl": 0.546630859375, "learning_rate": 1.754131528391078e-07, "loss": 0.0392, "num_tokens": 27401362.0, "reward": 0.23227828741073608, "reward_std": 0.14310085773468018, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16780923306941986, "rewards/logprob_reward/std": 0.24691300094127655, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 651.84375, "completions/mean_terminated_length": 613.3448486328125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 3.049382716049383, "grad_norm": 1.483125936598906, "kl": 0.56005859375, "learning_rate": 1.7493613548697966e-07, "loss": -0.0156, "num_tokens": 27428481.0, "reward": 0.43899619579315186, "reward_std": 0.14786244928836823, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3870791494846344, "rewards/logprob_reward/std": 0.33570045232772827, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 611.40625, "completions/mean_terminated_length": 552.4642944335938, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 3.052469135802469, "grad_norm": 1.5371403496027911, "kl": 0.565673828125, "learning_rate": 1.744594183123611e-07, "loss": -0.103, "num_tokens": 27453838.0, "reward": 0.3349524140357971, "reward_std": 0.12790623307228088, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2784193158149719, "rewards/logprob_reward/std": 0.3025476038455963, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 696.78125, "completions/mean_terminated_length": 650.0357666015625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.0555555555555554, "grad_norm": 1.4829379272122825, "kl": 0.514892578125, "learning_rate": 1.7398300322162563e-07, "loss": 0.0158, "num_tokens": 27483471.0, "reward": 0.44965940713882446, "reward_std": 0.1275881975889206, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.40587157011032104, "rewards/logprob_reward/std": 0.3296172618865967, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 611.21875, "completions/mean_terminated_length": 583.7000122070312, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 3.058641975308642, "grad_norm": 1.64447007821496, "kl": 0.59228515625, "learning_rate": 1.7350689211993902e-07, "loss": -0.009, "num_tokens": 27509918.0, "reward": 0.3634185791015625, "reward_std": 0.12076635658740997, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3065761923789978, "rewards/logprob_reward/std": 0.2917918264865875, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 666.09375, "completions/mean_terminated_length": 614.9642944335938, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 3.0617283950617282, "grad_norm": 1.5790288531454366, "kl": 0.56005859375, "learning_rate": 1.7303108691125107e-07, "loss": -0.1836, "num_tokens": 27537805.0, "reward": 0.4565914273262024, "reward_std": 0.09301479160785675, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4170460104942322, "rewards/logprob_reward/std": 0.36670926213264465, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 662.34375, "completions/mean_terminated_length": 595.370361328125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 3.064814814814815, "grad_norm": 1.615433682493483, "kl": 0.548095703125, "learning_rate": 1.725555894982887e-07, "loss": 0.0328, "num_tokens": 27565244.0, "reward": 0.45099204778671265, "reward_std": 0.1357956826686859, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4108244776725769, "rewards/logprob_reward/std": 0.3512701988220215, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 598.40625, "completions/mean_terminated_length": 554.3793334960938, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 3.067901234567901, "grad_norm": 1.4387666185384054, "kl": 0.56787109375, "learning_rate": 1.7208040178254768e-07, "loss": -0.0508, "num_tokens": 27591173.0, "reward": 0.27255770564079285, "reward_std": 0.1437205821275711, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2125641107559204, "rewards/logprob_reward/std": 0.29271194338798523, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 645.4375, "completions/mean_terminated_length": 575.3333129882812, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 3.0709876543209877, "grad_norm": 1.442921765967446, "kl": 0.588623046875, "learning_rate": 1.716055256642855e-07, "loss": -0.0105, "num_tokens": 27618139.0, "reward": 0.240866020321846, "reward_std": 0.11244892328977585, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.17735113203525543, "rewards/logprob_reward/std": 0.2538810074329376, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 550.5625, "completions/mean_terminated_length": 519.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 3.074074074074074, "grad_norm": 1.7551091127258258, "kl": 0.656982421875, "learning_rate": 1.711309630425135e-07, "loss": -0.1054, "num_tokens": 27642045.0, "reward": 0.3914669156074524, "reward_std": 0.12993168830871582, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3377409875392914, "rewards/logprob_reward/std": 0.3190624415874481, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 664.3125, "completions/mean_terminated_length": 563.5999755859375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 3.0771604938271606, "grad_norm": 1.8260725406824623, "kl": 0.578125, "learning_rate": 1.7065671581498936e-07, "loss": -0.1912, "num_tokens": 27669371.0, "reward": 0.3002050220966339, "reward_std": 0.17554888129234314, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.25370001792907715, "rewards/logprob_reward/std": 0.26656872034072876, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 613.5385131835938, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 3.080246913580247, "grad_norm": 1.7714715060509851, "kl": 0.601318359375, "learning_rate": 1.701827858782095e-07, "loss": -0.2393, "num_tokens": 27697631.0, "reward": 0.419953852891922, "reward_std": 0.15235841274261475, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.37980982661247253, "rewards/logprob_reward/std": 0.39305579662323, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 691.6875, "completions/mean_terminated_length": 615.0, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 3.0833333333333335, "grad_norm": 1.8667286473262306, "kl": 0.537841796875, "learning_rate": 1.697091751274016e-07, "loss": 0.1769, "num_tokens": 27726553.0, "reward": 0.21865561604499817, "reward_std": 0.1094026118516922, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.15267294645309448, "rewards/logprob_reward/std": 0.2779935598373413, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 583.875, "completions/mean_terminated_length": 554.5333862304688, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 3.0864197530864197, "grad_norm": 2.336357652066918, "kl": 0.66259765625, "learning_rate": 1.6923588545651672e-07, "loss": -0.3219, "num_tokens": 27751085.0, "reward": 0.37373262643814087, "reward_std": 0.19220903515815735, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.338869571685791, "rewards/logprob_reward/std": 0.3463937044143677, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 681.4375, "completions/mean_terminated_length": 602.3846435546875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 3.0895061728395063, "grad_norm": 1.7593273384574253, "kl": 0.58837890625, "learning_rate": 1.687629187582221e-07, "loss": -0.2114, "num_tokens": 27779399.0, "reward": 0.4934203624725342, "reward_std": 0.28637462854385376, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4614392817020416, "rewards/logprob_reward/std": 0.37426838278770447, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 651.1875, "completions/mean_terminated_length": 626.3333740234375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 3.0925925925925926, "grad_norm": 1.3078225356829007, "kl": 0.62744140625, "learning_rate": 1.6829027692389343e-07, "loss": -0.2014, "num_tokens": 27807129.0, "reward": 0.41759365797042847, "reward_std": 0.12092991173267365, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.36677074432373047, "rewards/logprob_reward/std": 0.3170064687728882, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 660.59375, "completions/mean_terminated_length": 576.7307739257812, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 3.095679012345679, "grad_norm": 1.6011062417742385, "kl": 0.572509765625, "learning_rate": 1.678179618436073e-07, "loss": -0.1581, "num_tokens": 27834644.0, "reward": 0.1604282110929489, "reward_std": 0.126372292637825, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.09492023289203644, "rewards/logprob_reward/std": 0.23300449550151825, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 597.0625, "completions/mean_terminated_length": 552.8965454101562, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 3.0987654320987654, "grad_norm": 1.6715461172786918, "kl": 0.64111328125, "learning_rate": 1.6734597540613344e-07, "loss": -0.1605, "num_tokens": 27860246.0, "reward": 0.3144886791706085, "reward_std": 0.1495763659477234, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2591541111469269, "rewards/logprob_reward/std": 0.30083563923835754, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 551.0625, "completions/mean_terminated_length": 519.5333862304688, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 3.1018518518518516, "grad_norm": 1.657559904549255, "kl": 0.604736328125, "learning_rate": 1.6687431949892753e-07, "loss": -0.0467, "num_tokens": 27884644.0, "reward": 0.2985716164112091, "reward_std": 0.18520475924015045, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24146847426891327, "rewards/logprob_reward/std": 0.3187064826488495, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 643.1875, "completions/mean_terminated_length": 588.7857666015625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 3.1049382716049383, "grad_norm": 1.4265320076147274, "kl": 0.57275390625, "learning_rate": 1.664029960081234e-07, "loss": 0.038, "num_tokens": 27912142.0, "reward": 0.4663579761981964, "reward_std": 0.1498047411441803, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.42095333337783813, "rewards/logprob_reward/std": 0.34987860918045044, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 685.0625, "completions/mean_terminated_length": 636.6428833007812, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 3.1080246913580245, "grad_norm": 1.6241301229672302, "kl": 0.564208984375, "learning_rate": 1.6593200681852574e-07, "loss": -0.1049, "num_tokens": 27941152.0, "reward": 0.2029947191476822, "reward_std": 0.1186952143907547, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.13527190685272217, "rewards/logprob_reward/std": 0.2239813357591629, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 655.53125, "completions/mean_terminated_length": 570.5, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 3.111111111111111, "grad_norm": 1.510930564332726, "kl": 0.52880859375, "learning_rate": 1.6546135381360194e-07, "loss": -0.0905, "num_tokens": 27969265.0, "reward": 0.36217057704925537, "reward_std": 0.19641876220703125, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3190784156322479, "rewards/logprob_reward/std": 0.30892476439476013, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 571.75, "completions/mean_terminated_length": 541.6000366210938, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 3.1141975308641974, "grad_norm": 1.7245476202995518, "kl": 0.677734375, "learning_rate": 1.6499103887547544e-07, "loss": -0.2373, "num_tokens": 27994265.0, "reward": 0.5061334371566772, "reward_std": 0.08385622501373291, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.46514829993247986, "rewards/logprob_reward/std": 0.284529447555542, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 646.65625, "completions/mean_terminated_length": 559.5769653320312, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 3.117283950617284, "grad_norm": 1.4100745218222053, "kl": 0.6455078125, "learning_rate": 1.6452106388491762e-07, "loss": -0.2746, "num_tokens": 28021130.0, "reward": 0.3737699091434479, "reward_std": 0.25161388516426086, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3319665193557739, "rewards/logprob_reward/std": 0.34221014380455017, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 575.03125, "completions/mean_terminated_length": 510.89288330078125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 3.1203703703703702, "grad_norm": 1.8441154418757875, "kl": 0.64599609375, "learning_rate": 1.6405143072134031e-07, "loss": -0.1003, "num_tokens": 28045919.0, "reward": 0.3376331031322479, "reward_std": 0.17758464813232422, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.28834232687950134, "rewards/logprob_reward/std": 0.31091392040252686, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 608.875, "completions/mean_terminated_length": 565.9310302734375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 3.123456790123457, "grad_norm": 1.7659693982242033, "kl": 0.6748046875, "learning_rate": 1.6358214126278855e-07, "loss": -0.0069, "num_tokens": 28071795.0, "reward": 0.2892984449863434, "reward_std": 0.1390882432460785, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22422048449516296, "rewards/logprob_reward/std": 0.31863313913345337, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 575.3125, "completions/mean_terminated_length": 528.8965454101562, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 3.126543209876543, "grad_norm": 1.413901990858065, "kl": 0.587158203125, "learning_rate": 1.6311319738593281e-07, "loss": -0.037, "num_tokens": 28096337.0, "reward": 0.2890463173389435, "reward_std": 0.1384204626083374, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22394037246704102, "rewards/logprob_reward/std": 0.26789283752441406, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 615.34375, "completions/mean_terminated_length": 556.9642944335938, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 3.1296296296296298, "grad_norm": 1.7414044685691463, "kl": 0.6220703125, "learning_rate": 1.6264460096606169e-07, "loss": -0.0673, "num_tokens": 28122060.0, "reward": 0.42779749631881714, "reward_std": 0.057273849844932556, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.38158053159713745, "rewards/logprob_reward/std": 0.3666421175003052, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 640.9375, "completions/mean_terminated_length": 570.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 3.132716049382716, "grad_norm": 1.6334016593406802, "kl": 0.5458984375, "learning_rate": 1.621763538770743e-07, "loss": -0.0004, "num_tokens": 28149478.0, "reward": 0.44771528244018555, "reward_std": 0.20749744772911072, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4106558561325073, "rewards/logprob_reward/std": 0.3538627028465271, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 645.9375, "completions/mean_terminated_length": 620.7333374023438, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 3.1358024691358026, "grad_norm": 1.8459359031439606, "kl": 0.628662109375, "learning_rate": 1.6170845799147266e-07, "loss": 0.0734, "num_tokens": 28176468.0, "reward": 0.40354466438293457, "reward_std": 0.09393371641635895, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3442162871360779, "rewards/logprob_reward/std": 0.34413978457450867, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 691.96875, "completions/mean_terminated_length": 657.6206665039062, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 3.138888888888889, "grad_norm": 1.6969048380118483, "kl": 0.587890625, "learning_rate": 1.6124091518035443e-07, "loss": -0.0144, "num_tokens": 28205823.0, "reward": 0.289227694272995, "reward_std": 0.07834997773170471, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2276141196489334, "rewards/logprob_reward/std": 0.3229430317878723, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 631.21875, "completions/mean_terminated_length": 575.107177734375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 3.1419753086419755, "grad_norm": 1.629147505020284, "kl": 0.5546875, "learning_rate": 1.607737273134054e-07, "loss": -0.1312, "num_tokens": 28233050.0, "reward": 0.2789977192878723, "reward_std": 0.22021444141864777, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.23013633489608765, "rewards/logprob_reward/std": 0.28368300199508667, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 570.71875, "completions/mean_terminated_length": 523.8275756835938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 3.1450617283950617, "grad_norm": 1.5626035938763778, "kl": 0.63818359375, "learning_rate": 1.603068962588918e-07, "loss": -0.2036, "num_tokens": 28257437.0, "reward": 0.4252150058746338, "reward_std": 0.06866255402565002, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37176668643951416, "rewards/logprob_reward/std": 0.34374406933784485, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 582.09375, "completions/mean_terminated_length": 552.6333618164062, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 3.148148148148148, "grad_norm": 1.612987173818302, "kl": 0.66015625, "learning_rate": 1.598404238836532e-07, "loss": -0.1469, "num_tokens": 28282292.0, "reward": 0.40612971782684326, "reward_std": 0.15641291439533234, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3575052320957184, "rewards/logprob_reward/std": 0.35471856594085693, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 665.28125, "completions/mean_terminated_length": 628.1724243164062, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 3.1512345679012346, "grad_norm": 1.5966501175806007, "kl": 0.55419921875, "learning_rate": 1.5937431205309465e-07, "loss": -0.0442, "num_tokens": 28310477.0, "reward": 0.3304664194583893, "reward_std": 0.1023988127708435, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26649051904678345, "rewards/logprob_reward/std": 0.25933215022087097, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 636.90625, "completions/mean_terminated_length": 565.2222290039062, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 3.154320987654321, "grad_norm": 1.6379624487279199, "kl": 0.617919921875, "learning_rate": 1.589085626311795e-07, "loss": 0.0451, "num_tokens": 28337374.0, "reward": 0.2869553565979004, "reward_std": 0.15828555822372437, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22856149077415466, "rewards/logprob_reward/std": 0.3365638852119446, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 619.3125, "completions/mean_terminated_length": 544.370361328125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 3.1574074074074074, "grad_norm": 1.7088570152872822, "kl": 0.60693359375, "learning_rate": 1.5844317748042167e-07, "loss": 0.0788, "num_tokens": 28364196.0, "reward": 0.3010578155517578, "reward_std": 0.13353212177753448, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24423091113567352, "rewards/logprob_reward/std": 0.26673829555511475, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 646.1875, "completions/mean_terminated_length": 607.1034545898438, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 3.1604938271604937, "grad_norm": 1.600399504193296, "kl": 0.61083984375, "learning_rate": 1.5797815846187868e-07, "loss": -0.1473, "num_tokens": 28391338.0, "reward": 0.42298513650894165, "reward_std": 0.10946369916200638, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3831779360771179, "rewards/logprob_reward/std": 0.3629390597343445, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 621.46875, "completions/mean_terminated_length": 563.9642944335938, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 3.1635802469135803, "grad_norm": 1.7699508974589317, "kl": 0.612060546875, "learning_rate": 1.575135074351435e-07, "loss": 0.0418, "num_tokens": 28417629.0, "reward": 0.4187324643135071, "reward_std": 0.1128990650177002, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3680361211299896, "rewards/logprob_reward/std": 0.28924593329429626, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 660.9375, "completions/mean_terminated_length": 539.9166870117188, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 3.1666666666666665, "grad_norm": 1.8940048545488648, "kl": 0.57958984375, "learning_rate": 1.5704922625833784e-07, "loss": -0.1152, "num_tokens": 28445499.0, "reward": 0.23804780840873718, "reward_std": 0.18229520320892334, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.1915808916091919, "rewards/logprob_reward/std": 0.26734328269958496, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 567.21875, "completions/mean_terminated_length": 536.7667236328125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 3.169753086419753, "grad_norm": 1.879797417973036, "kl": 0.6357421875, "learning_rate": 1.565853167881042e-07, "loss": -0.0434, "num_tokens": 28469654.0, "reward": 0.46710044145584106, "reward_std": 0.042482927441596985, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.41830605268478394, "rewards/logprob_reward/std": 0.3543635904788971, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 654.46875, "completions/mean_terminated_length": 601.6785888671875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 3.1728395061728394, "grad_norm": 1.578830900186581, "kl": 0.540771484375, "learning_rate": 1.5612178087959887e-07, "loss": 0.0235, "num_tokens": 28496821.0, "reward": 0.27795135974884033, "reward_std": 0.12753109633922577, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.21508488059043884, "rewards/logprob_reward/std": 0.28229713439941406, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 622.90625, "completions/mean_terminated_length": 596.1666870117188, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 3.175925925925926, "grad_norm": 1.6896666349871494, "kl": 0.755859375, "learning_rate": 1.556586203864841e-07, "loss": -0.127, "num_tokens": 28523334.0, "reward": 0.298500657081604, "reward_std": 0.07422243058681488, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23097294569015503, "rewards/logprob_reward/std": 0.2663733959197998, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 564.34375, "completions/mean_terminated_length": 533.7000122070312, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 3.1790123456790123, "grad_norm": 1.7906715892846248, "kl": 0.662109375, "learning_rate": 1.5519583716092077e-07, "loss": -0.1169, "num_tokens": 28547881.0, "reward": 0.4204936623573303, "reward_std": 0.0876433476805687, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3630484938621521, "rewards/logprob_reward/std": 0.330705851316452, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 553.6875, "completions/mean_terminated_length": 522.3333740234375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 3.182098765432099, "grad_norm": 1.5348750917789504, "kl": 0.619873046875, "learning_rate": 1.5473343305356136e-07, "loss": -0.1193, "num_tokens": 28572415.0, "reward": 0.29837608337402344, "reward_std": 0.09605137258768082, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23430678248405457, "rewards/logprob_reward/std": 0.32801106572151184, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 599.0, "completions/mean_terminated_length": 555.0344848632812, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 3.185185185185185, "grad_norm": 1.3906576922147589, "kl": 0.587890625, "learning_rate": 1.5427140991354215e-07, "loss": -0.0367, "num_tokens": 28598307.0, "reward": 0.3950592577457428, "reward_std": 0.09976682811975479, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.338260293006897, "rewards/logprob_reward/std": 0.30378663539886475, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 556.9375, "completions/mean_terminated_length": 541.8709716796875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 3.1882716049382718, "grad_norm": 1.5763673802528964, "kl": 0.631103515625, "learning_rate": 1.5380976958847572e-07, "loss": -0.1324, "num_tokens": 28623029.0, "reward": 0.42324697971343994, "reward_std": 0.10285536199808121, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3695799708366394, "rewards/logprob_reward/std": 0.38899150490760803, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 587.875, "completions/mean_terminated_length": 542.7586059570312, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 3.191358024691358, "grad_norm": 1.6767337280020316, "kl": 0.59130859375, "learning_rate": 1.5334851392444412e-07, "loss": 0.078, "num_tokens": 28648453.0, "reward": 0.42817485332489014, "reward_std": 0.11229575425386429, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37505537271499634, "rewards/logprob_reward/std": 0.3230380713939667, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 626.625, "completions/mean_terminated_length": 553.0370483398438, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 3.1944444444444446, "grad_norm": 1.800280911911533, "kl": 0.5927734375, "learning_rate": 1.5288764476599102e-07, "loss": -0.0814, "num_tokens": 28674853.0, "reward": 0.3544710874557495, "reward_std": 0.1654333621263504, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3035790026187897, "rewards/logprob_reward/std": 0.26495543122291565, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 667.0, "completions/mean_terminated_length": 584.6154174804688, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.197530864197531, "grad_norm": 1.886312309173642, "kl": 0.59375, "learning_rate": 1.524271639561145e-07, "loss": -0.069, "num_tokens": 28702705.0, "reward": 0.38714027404785156, "reward_std": 0.22471016645431519, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.343350350856781, "rewards/logprob_reward/std": 0.3490927219390869, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 601.75, "completions/mean_terminated_length": 573.6000366210938, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 3.200617283950617, "grad_norm": 1.7375914336925933, "kl": 0.62353515625, "learning_rate": 1.5196707333625959e-07, "loss": -0.0889, "num_tokens": 28728645.0, "reward": 0.36194413900375366, "reward_std": 0.10919761657714844, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.308410108089447, "rewards/logprob_reward/std": 0.3839351534843445, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 642.34375, "completions/mean_terminated_length": 630.0322265625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 3.2037037037037037, "grad_norm": 1.8484051669607835, "kl": 0.70361328125, "learning_rate": 1.5150737474631092e-07, "loss": -0.0807, "num_tokens": 28755872.0, "reward": 0.47151175141334534, "reward_std": 0.04500328004360199, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.419735312461853, "rewards/logprob_reward/std": 0.27513185143470764, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 527.875, "completions/mean_terminated_length": 494.8000183105469, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 3.20679012345679, "grad_norm": 1.8293106403440333, "kl": 0.64453125, "learning_rate": 1.5104807002458564e-07, "loss": -0.0667, "num_tokens": 28779688.0, "reward": 0.5878417491912842, "reward_std": 0.03789553791284561, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5489908456802368, "rewards/logprob_reward/std": 0.4021238684654236, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 597.1199951171875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 3.2098765432098766, "grad_norm": 1.4484924015219156, "kl": 0.578857421875, "learning_rate": 1.5058916100782555e-07, "loss": -0.1464, "num_tokens": 28808672.0, "reward": 0.40581101179122925, "reward_std": 0.16933806240558624, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.364095538854599, "rewards/logprob_reward/std": 0.36228108406066895, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 652.8125, "completions/mean_terminated_length": 599.7857666015625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 3.212962962962963, "grad_norm": 1.5265940279394412, "kl": 0.6103515625, "learning_rate": 1.5013064953119036e-07, "loss": -0.0539, "num_tokens": 28835622.0, "reward": 0.5913912057876587, "reward_std": 0.20584437251091003, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.5633513927459717, "rewards/logprob_reward/std": 0.3501444160938263, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 648.21875, "completions/mean_terminated_length": 594.5357666015625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 3.2160493827160495, "grad_norm": 2.393995986678892, "kl": 0.66552734375, "learning_rate": 1.4967253742824962e-07, "loss": -0.3208, "num_tokens": 28862461.0, "reward": 0.3824414312839508, "reward_std": 0.1752820611000061, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3381294012069702, "rewards/logprob_reward/std": 0.37462732195854187, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 605.03125, "completions/mean_terminated_length": 577.1000366210938, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 3.2191358024691357, "grad_norm": 1.6699222732644259, "kl": 0.679931640625, "learning_rate": 1.4921482653097614e-07, "loss": -0.0653, "num_tokens": 28888102.0, "reward": 0.33684250712394714, "reward_std": 0.049305111169815063, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27704721689224243, "rewards/logprob_reward/std": 0.34254732728004456, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 623.4375, "completions/mean_terminated_length": 596.7333374023438, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 3.2222222222222223, "grad_norm": 1.737843257910827, "kl": 0.562255859375, "learning_rate": 1.487575186697381e-07, "loss": -0.0278, "num_tokens": 28914548.0, "reward": 0.27623450756073, "reward_std": 0.1220836341381073, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20276057720184326, "rewards/logprob_reward/std": 0.3237999677658081, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 525.25, "completions/mean_terminated_length": 525.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 3.2253086419753085, "grad_norm": 1.5385819671390621, "kl": 0.61669921875, "learning_rate": 1.4830061567329223e-07, "loss": -0.1825, "num_tokens": 28937620.0, "reward": 0.47827523946762085, "reward_std": 0.05654080957174301, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4272502660751343, "rewards/logprob_reward/std": 0.37457022070884705, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 700.40625, "completions/mean_terminated_length": 625.7307739257812, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 3.228395061728395, "grad_norm": 1.5682849372518113, "kl": 0.56103515625, "learning_rate": 1.4784411936877596e-07, "loss": -0.0045, "num_tokens": 28966849.0, "reward": 0.301060289144516, "reward_std": 0.17500239610671997, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24770589172840118, "rewards/logprob_reward/std": 0.27333498001098633, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 687.0625, "completions/mean_terminated_length": 592.719970703125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 3.2314814814814814, "grad_norm": 1.8409571961096793, "kl": 0.59375, "learning_rate": 1.4738803158170043e-07, "loss": -0.0853, "num_tokens": 28995359.0, "reward": 0.34305641055107117, "reward_std": 0.2502124309539795, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3013126850128174, "rewards/logprob_reward/std": 0.31826338171958923, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 591.03125, "completions/mean_terminated_length": 546.2413940429688, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 3.234567901234568, "grad_norm": 1.6550419774348695, "kl": 0.60302734375, "learning_rate": 1.469323541359433e-07, "loss": -0.0591, "num_tokens": 29020216.0, "reward": 0.22798359394073486, "reward_std": 0.1025586798787117, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.15956509113311768, "rewards/logprob_reward/std": 0.2425576001405716, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 613.1875, "completions/mean_terminated_length": 613.1875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 3.2376543209876543, "grad_norm": 1.6129044277848084, "kl": 0.65478515625, "learning_rate": 1.4647708885374105e-07, "loss": -0.1007, "num_tokens": 29046050.0, "reward": 0.26686233282089233, "reward_std": 0.019060403108596802, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.18887482583522797, "rewards/logprob_reward/std": 0.30885618925094604, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 648.65625, "completions/mean_terminated_length": 609.8275756835938, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 3.240740740740741, "grad_norm": 1.6130424401712877, "kl": 0.581298828125, "learning_rate": 1.4602223755568212e-07, "loss": 0.0441, "num_tokens": 29073607.0, "reward": 0.5260618925094604, "reward_std": 0.12912505865097046, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.48729100823402405, "rewards/logprob_reward/std": 0.3527923822402954, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 609.75, "completions/mean_terminated_length": 533.0370483398438, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 3.243827160493827, "grad_norm": 1.6375757909024389, "kl": 0.619140625, "learning_rate": 1.4556780206069925e-07, "loss": -0.1203, "num_tokens": 29099555.0, "reward": 0.29521650075912476, "reward_std": 0.13325199484825134, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23426832258701324, "rewards/logprob_reward/std": 0.3197242021560669, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 579.0, "completions/mean_terminated_length": 549.3333740234375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 3.246913580246914, "grad_norm": 1.6918185546212396, "kl": 0.588134765625, "learning_rate": 1.4511378418606272e-07, "loss": 0.0389, "num_tokens": 29124487.0, "reward": 0.3926424980163574, "reward_std": 0.12146599590778351, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3321027457714081, "rewards/logprob_reward/std": 0.3268602192401886, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 622.84375, "completions/mean_terminated_length": 581.3448486328125, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 3.25, "grad_norm": 1.439112211090384, "kl": 0.56201171875, "learning_rate": 1.4466018574737236e-07, "loss": 0.105, "num_tokens": 29150942.0, "reward": 0.37503230571746826, "reward_std": 0.04226832836866379, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31600815057754517, "rewards/logprob_reward/std": 0.31322717666625977, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 636.78125, "completions/mean_terminated_length": 596.72412109375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 3.253086419753086, "grad_norm": 1.6416726479893313, "kl": 0.651611328125, "learning_rate": 1.4420700855855093e-07, "loss": -0.1163, "num_tokens": 29177631.0, "reward": 0.48174142837524414, "reward_std": 0.19191685318946838, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.44151824712753296, "rewards/logprob_reward/std": 0.3638496994972229, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 619.46875, "completions/mean_terminated_length": 561.6785888671875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 3.256172839506173, "grad_norm": 1.6985108878209185, "kl": 0.62451171875, "learning_rate": 1.4375425443183675e-07, "loss": -0.1705, "num_tokens": 29204462.0, "reward": 0.21836280822753906, "reward_std": 0.11152079701423645, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.15234756469726562, "rewards/logprob_reward/std": 0.18570563197135925, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 704.90625, "completions/mean_terminated_length": 683.6333618164062, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 3.259259259259259, "grad_norm": 1.5944095824003324, "kl": 0.62548828125, "learning_rate": 1.43301925177776e-07, "loss": 0.0488, "num_tokens": 29234251.0, "reward": 0.29340559244155884, "reward_std": 0.042558517307043076, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2218395173549652, "rewards/logprob_reward/std": 0.309664785861969, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 587.1875, "completions/mean_terminated_length": 558.0667114257812, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 3.2623456790123457, "grad_norm": 1.471941369655027, "kl": 0.66845703125, "learning_rate": 1.4285002260521617e-07, "loss": -0.1138, "num_tokens": 29259085.0, "reward": 0.48032107949256897, "reward_std": 0.14920565485954285, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4364679157733917, "rewards/logprob_reward/std": 0.33724239468574524, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 550.40625, "completions/mean_terminated_length": 535.1290283203125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 3.265432098765432, "grad_norm": 1.8290318688247666, "kl": 0.650390625, "learning_rate": 1.4239854852129807e-07, "loss": -0.0121, "num_tokens": 29283598.0, "reward": 0.39153677225112915, "reward_std": 0.12002336978912354, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.33087414503097534, "rewards/logprob_reward/std": 0.35592126846313477, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 595.84375, "completions/mean_terminated_length": 567.300048828125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 3.2685185185185186, "grad_norm": 1.809728788374709, "kl": 0.6669921875, "learning_rate": 1.419475047314493e-07, "loss": 0.0424, "num_tokens": 29308965.0, "reward": 0.26966333389282227, "reward_std": 0.10493199527263641, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.1989314705133438, "rewards/logprob_reward/std": 0.2696654200553894, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 671.5, "completions/mean_terminated_length": 606.2222290039062, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 3.271604938271605, "grad_norm": 1.4212701317831784, "kl": 0.5908203125, "learning_rate": 1.4149689303937662e-07, "loss": -0.3272, "num_tokens": 29337041.0, "reward": 0.29761844873428345, "reward_std": 0.15586286783218384, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24388161301612854, "rewards/logprob_reward/std": 0.28149139881134033, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 594.5, "completions/mean_terminated_length": 550.0689697265625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 3.2746913580246915, "grad_norm": 1.7217964565362298, "kl": 0.64208984375, "learning_rate": 1.4104671524705892e-07, "loss": -0.1159, "num_tokens": 29362609.0, "reward": 0.22979126870632172, "reward_std": 0.0327754020690918, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.1615736186504364, "rewards/logprob_reward/std": 0.3214859068393707, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 607.0625, "completions/mean_terminated_length": 529.8518676757812, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 3.2777777777777777, "grad_norm": 1.6224742135207588, "kl": 0.57080078125, "learning_rate": 1.4059697315473988e-07, "loss": 0.0453, "num_tokens": 29388115.0, "reward": 0.41203248500823975, "reward_std": 0.210663303732872, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.36753612756729126, "rewards/logprob_reward/std": 0.32754823565483093, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 654.40625, "completions/mean_terminated_length": 585.9629516601562, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 3.2808641975308643, "grad_norm": 1.846197626377806, "kl": 0.568359375, "learning_rate": 1.4014766856092081e-07, "loss": -0.2027, "num_tokens": 29415488.0, "reward": 0.29443079233169556, "reward_std": 0.1279766857624054, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24033978581428528, "rewards/logprob_reward/std": 0.2795492112636566, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 653.71875, "completions/mean_terminated_length": 568.2692260742188, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 3.2839506172839505, "grad_norm": 1.6597221359699499, "kl": 0.58251953125, "learning_rate": 1.3969880326235362e-07, "loss": -0.1819, "num_tokens": 29442951.0, "reward": 0.22676339745521545, "reward_std": 0.10189741849899292, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16168153285980225, "rewards/logprob_reward/std": 0.26672130823135376, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 626.84375, "completions/mean_terminated_length": 585.7586059570312, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 3.287037037037037, "grad_norm": 1.7740512395522166, "kl": 0.597900390625, "learning_rate": 1.3925037905403324e-07, "loss": -0.0707, "num_tokens": 29469582.0, "reward": 0.3717727065086365, "reward_std": 0.08288693428039551, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31238633394241333, "rewards/logprob_reward/std": 0.31982317566871643, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 685.59375, "completions/mean_terminated_length": 607.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 3.2901234567901234, "grad_norm": 1.7016353479202007, "kl": 0.575439453125, "learning_rate": 1.38802397729191e-07, "loss": -0.0039, "num_tokens": 29498017.0, "reward": 0.26197075843811035, "reward_std": 0.12701329588890076, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2008008509874344, "rewards/logprob_reward/std": 0.27895596623420715, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 665.5, "completions/mean_terminated_length": 599.1111450195312, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 3.29320987654321, "grad_norm": 1.6380850816931458, "kl": 0.58349609375, "learning_rate": 1.3835486107928678e-07, "loss": -0.0344, "num_tokens": 29526057.0, "reward": 0.5195534229278564, "reward_std": 0.24383017420768738, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.48700377345085144, "rewards/logprob_reward/std": 0.37730515003204346, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 601.40625, "completions/mean_terminated_length": 587.774169921875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 3.2962962962962963, "grad_norm": 1.6764127413783962, "kl": 0.5859375, "learning_rate": 1.3790777089400262e-07, "loss": -0.1623, "num_tokens": 29551726.0, "reward": 0.30332884192466736, "reward_std": 0.08421826362609863, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2398098260164261, "rewards/logprob_reward/std": 0.27327045798301697, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 628.40625, "completions/mean_terminated_length": 571.8928833007812, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 3.299382716049383, "grad_norm": 1.5426065724118818, "kl": 0.552734375, "learning_rate": 1.3746112896123494e-07, "loss": -0.0431, "num_tokens": 29578639.0, "reward": 0.4480639100074768, "reward_std": 0.16417789459228516, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4110432267189026, "rewards/logprob_reward/std": 0.3517591953277588, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 621.53125, "completions/mean_terminated_length": 594.7000122070312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 3.302469135802469, "grad_norm": 1.6375636812969918, "kl": 0.673828125, "learning_rate": 1.3701493706708768e-07, "loss": -0.1988, "num_tokens": 29604916.0, "reward": 0.3857731223106384, "reward_std": 0.10362157225608826, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3348867893218994, "rewards/logprob_reward/std": 0.30565401911735535, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 594.65625, "completions/mean_terminated_length": 566.0333862304688, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 3.3055555555555554, "grad_norm": 1.4619079521043503, "kl": 0.67626953125, "learning_rate": 1.3656919699586503e-07, "loss": -0.0092, "num_tokens": 29630389.0, "reward": 0.27149152755737305, "reward_std": 0.10315725207328796, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2044350504875183, "rewards/logprob_reward/std": 0.28392648696899414, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 654.1875, "completions/mean_terminated_length": 585.7037353515625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 3.308641975308642, "grad_norm": 1.6800207816271475, "kl": 0.591796875, "learning_rate": 1.3612391053006446e-07, "loss": -0.0625, "num_tokens": 29657527.0, "reward": 0.42090684175491333, "reward_std": 0.14389416575431824, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.37739646434783936, "rewards/logprob_reward/std": 0.32922157645225525, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 642.8125, "completions/mean_terminated_length": 630.51611328125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 3.3117283950617282, "grad_norm": 1.5980572510934523, "kl": 0.64697265625, "learning_rate": 1.356790794503694e-07, "loss": -0.0338, "num_tokens": 29684633.0, "reward": 0.4833519160747528, "reward_std": 0.09717842936515808, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.43289101123809814, "rewards/logprob_reward/std": 0.29184091091156006, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 547.8125, "completions/mean_terminated_length": 516.0667114257812, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 3.314814814814815, "grad_norm": 1.5452545496540804, "kl": 0.64013671875, "learning_rate": 1.3523470553564238e-07, "loss": -0.1197, "num_tokens": 29708511.0, "reward": 0.2788340151309967, "reward_std": 0.09231580048799515, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.20912115275859833, "rewards/logprob_reward/std": 0.270794540643692, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 591.34375, "completions/mean_terminated_length": 529.5357666015625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 3.317901234567901, "grad_norm": 1.9055897411206488, "kl": 0.62646484375, "learning_rate": 1.3479079056291738e-07, "loss": 0.028, "num_tokens": 29734286.0, "reward": 0.4049166440963745, "reward_std": 0.19579170644283295, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3665740489959717, "rewards/logprob_reward/std": 0.32956579327583313, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 608.8125, "completions/mean_terminated_length": 581.1333618164062, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 3.3209876543209877, "grad_norm": 1.6211318820091052, "kl": 0.574951171875, "learning_rate": 1.3434733630739345e-07, "loss": -0.0506, "num_tokens": 29760844.0, "reward": 0.27963757514953613, "reward_std": 0.04915326088666916, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2134861946105957, "rewards/logprob_reward/std": 0.2779744267463684, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 626.84375, "completions/mean_terminated_length": 553.2963256835938, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 3.324074074074074, "grad_norm": 1.8045264278750213, "kl": 0.630859375, "learning_rate": 1.3390434454242704e-07, "loss": -0.0414, "num_tokens": 29787479.0, "reward": 0.17516064643859863, "reward_std": 0.09810721129179001, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.10087292641401291, "rewards/logprob_reward/std": 0.24153846502304077, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 598.1875, "completions/mean_terminated_length": 519.3333129882812, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 3.3271604938271606, "grad_norm": 1.5924995117747525, "kl": 0.67822265625, "learning_rate": 1.334618170395254e-07, "loss": -0.1145, "num_tokens": 29812741.0, "reward": 0.3365400433540344, "reward_std": 0.16300618648529053, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.290600061416626, "rewards/logprob_reward/std": 0.31993094086647034, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 606.75, "completions/mean_terminated_length": 593.290283203125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 3.330246913580247, "grad_norm": 1.765404985887736, "kl": 0.662109375, "learning_rate": 1.3301975556833872e-07, "loss": -0.028, "num_tokens": 29838565.0, "reward": 0.4006649851799011, "reward_std": 0.05659489706158638, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3410167098045349, "rewards/logprob_reward/std": 0.30969756841659546, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 648.1875, "completions/mean_terminated_length": 623.1333618164062, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 3.3333333333333335, "grad_norm": 1.675170545151013, "kl": 0.598388671875, "learning_rate": 1.3257816189665398e-07, "loss": -0.0246, "num_tokens": 29866559.0, "reward": 0.3480982780456543, "reward_std": 0.032585326582193375, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2826091945171356, "rewards/logprob_reward/std": 0.314553439617157, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 616.625, "completions/mean_terminated_length": 541.1851806640625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 3.3364197530864197, "grad_norm": 1.8513162359372906, "kl": 0.60107421875, "learning_rate": 1.3213703779038726e-07, "loss": -0.1016, "num_tokens": 29892687.0, "reward": 0.2545337975025177, "reward_std": 0.1435617059469223, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1960097700357437, "rewards/logprob_reward/std": 0.22824765741825104, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 664.71875, "completions/mean_terminated_length": 613.3928833007812, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 3.3395061728395063, "grad_norm": 1.490218931037361, "kl": 0.623291015625, "learning_rate": 1.3169638501357697e-07, "loss": 0.0314, "num_tokens": 29919786.0, "reward": 0.35751891136169434, "reward_std": 0.06444726139307022, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3034932613372803, "rewards/logprob_reward/std": 0.33416232466697693, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 636.5625, "completions/mean_terminated_length": 564.8148193359375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 3.3425925925925926, "grad_norm": 1.6296729519585749, "kl": 0.62841796875, "learning_rate": 1.3125620532837667e-07, "loss": -0.1823, "num_tokens": 29946812.0, "reward": 0.33085882663726807, "reward_std": 0.22058260440826416, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.28428757190704346, "rewards/logprob_reward/std": 0.3524380028247833, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 636.4375, "completions/mean_terminated_length": 564.6666870117188, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 3.3456790123456788, "grad_norm": 1.5131571195065738, "kl": 0.5595703125, "learning_rate": 1.3081650049504784e-07, "loss": -0.0441, "num_tokens": 29973942.0, "reward": 0.2053545117378235, "reward_std": 0.10468994081020355, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1413661390542984, "rewards/logprob_reward/std": 0.25054052472114563, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 658.78125, "completions/mean_terminated_length": 634.433349609375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 3.3487654320987654, "grad_norm": 1.4629338007394854, "kl": 0.5458984375, "learning_rate": 1.3037727227195333e-07, "loss": -0.09, "num_tokens": 30001115.0, "reward": 0.3101038634777069, "reward_std": 0.044922687113285065, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24386541545391083, "rewards/logprob_reward/std": 0.29151010513305664, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 685.625, "completions/mean_terminated_length": 590.8800048828125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 3.351851851851852, "grad_norm": 1.6481921183945591, "kl": 0.591796875, "learning_rate": 1.2993852241554986e-07, "loss": -0.0439, "num_tokens": 30029467.0, "reward": 0.3749828338623047, "reward_std": 0.15863452851772308, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3298420310020447, "rewards/logprob_reward/std": 0.37655797600746155, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 559.71875, "completions/mean_terminated_length": 528.7667236328125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 3.3549382716049383, "grad_norm": 1.6242508732384713, "kl": 0.616455078125, "learning_rate": 1.295002526803813e-07, "loss": -0.0476, "num_tokens": 30053886.0, "reward": 0.3461126983165741, "reward_std": 0.12686535716056824, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2873474359512329, "rewards/logprob_reward/std": 0.375998318195343, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 636.28125, "completions/mean_terminated_length": 596.1724243164062, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 3.3580246913580245, "grad_norm": 1.7837579287264445, "kl": 0.61865234375, "learning_rate": 1.2906246481907145e-07, "loss": -0.1406, "num_tokens": 30080579.0, "reward": 0.5377302169799805, "reward_std": 0.10653539001941681, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4967835545539856, "rewards/logprob_reward/std": 0.3262912631034851, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 666.4375, "completions/mean_terminated_length": 654.9031982421875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 3.361111111111111, "grad_norm": 1.4832068592515977, "kl": 0.6650390625, "learning_rate": 1.2862516058231718e-07, "loss": 0.079, "num_tokens": 30108437.0, "reward": 0.47601842880249023, "reward_std": 0.07172292470932007, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.421270489692688, "rewards/logprob_reward/std": 0.35327115654945374, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 593.125, "completions/mean_terminated_length": 548.5516967773438, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 3.3641975308641974, "grad_norm": 1.7696345047646673, "kl": 0.642578125, "learning_rate": 1.2818834171888136e-07, "loss": -0.0849, "num_tokens": 30134089.0, "reward": 0.2968178689479828, "reward_std": 0.06522770971059799, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2325754016637802, "rewards/logprob_reward/std": 0.3224456012248993, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 609.21875, "completions/mean_terminated_length": 566.3103637695312, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 3.367283950617284, "grad_norm": 1.814220898690948, "kl": 0.62109375, "learning_rate": 1.277520099755857e-07, "loss": -0.2162, "num_tokens": 30159844.0, "reward": 0.36629819869995117, "reward_std": 0.0823051780462265, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3201924264431, "rewards/logprob_reward/std": 0.35958778858184814, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 686.09375, "completions/mean_terminated_length": 637.8214721679688, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 3.3703703703703702, "grad_norm": 1.2941855868971164, "kl": 0.587158203125, "learning_rate": 1.2731616709730428e-07, "loss": -0.2521, "num_tokens": 30187899.0, "reward": 0.229031503200531, "reward_std": 0.13865499198436737, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.16420167684555054, "rewards/logprob_reward/std": 0.27741530537605286, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 625.375, "completions/mean_terminated_length": 584.137939453125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 3.373456790123457, "grad_norm": 1.7404868520363808, "kl": 0.66259765625, "learning_rate": 1.2688081482695577e-07, "loss": -0.0445, "num_tokens": 30214887.0, "reward": 0.3399721682071686, "reward_std": 0.0830267071723938, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2770524024963379, "rewards/logprob_reward/std": 0.2879287898540497, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 723.75, "completions/mean_terminated_length": 654.4615478515625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 3.376543209876543, "grad_norm": 1.595179258897045, "kl": 0.6220703125, "learning_rate": 1.264459549054973e-07, "loss": -0.0595, "num_tokens": 30244603.0, "reward": 0.4819941520690918, "reward_std": 0.21061614155769348, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4452713429927826, "rewards/logprob_reward/std": 0.3953382074832916, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 573.8125, "completions/mean_terminated_length": 543.800048828125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.3796296296296298, "grad_norm": 1.465336251942891, "kl": 0.65966796875, "learning_rate": 1.2601158907191696e-07, "loss": 0.0169, "num_tokens": 30269037.0, "reward": 0.27754175662994385, "reward_std": 0.047023847699165344, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20421306788921356, "rewards/logprob_reward/std": 0.3283768594264984, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 648.375, "completions/mean_terminated_length": 594.7142944335938, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 3.382716049382716, "grad_norm": 1.5907518952105966, "kl": 0.6181640625, "learning_rate": 1.2557771906322704e-07, "loss": 0.0754, "num_tokens": 30296161.0, "reward": 0.3691512644290924, "reward_std": 0.13944527506828308, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3164180517196655, "rewards/logprob_reward/std": 0.2769349217414856, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 602.875, "completions/mean_terminated_length": 574.800048828125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 3.3858024691358026, "grad_norm": 1.5284256141259713, "kl": 0.68359375, "learning_rate": 1.2514434661445706e-07, "loss": -0.0787, "num_tokens": 30321493.0, "reward": 0.30661773681640625, "reward_std": 0.04521657153964043, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2365196943283081, "rewards/logprob_reward/std": 0.2958061993122101, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 649.8125, "completions/mean_terminated_length": 545.0399780273438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 3.388888888888889, "grad_norm": 1.889549414810151, "kl": 0.65771484375, "learning_rate": 1.2471147345864672e-07, "loss": -0.0581, "num_tokens": 30348739.0, "reward": 0.39388036727905273, "reward_std": 0.14803561568260193, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3543114960193634, "rewards/logprob_reward/std": 0.34003373980522156, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 598.8125, "completions/mean_terminated_length": 554.8275756835938, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.3919753086419755, "grad_norm": 1.493251955232433, "kl": 0.6416015625, "learning_rate": 1.2427910132683928e-07, "loss": -0.1928, "num_tokens": 30374149.0, "reward": 0.3996676206588745, "reward_std": 0.09966995567083359, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3433806896209717, "rewards/logprob_reward/std": 0.34844088554382324, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 615.40625, "completions/mean_terminated_length": 539.74072265625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 3.3950617283950617, "grad_norm": 1.5628835367260707, "kl": 0.6142578125, "learning_rate": 1.2384723194807408e-07, "loss": -0.2194, "num_tokens": 30399922.0, "reward": 0.33343660831451416, "reward_std": 0.18256554007530212, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2906240224838257, "rewards/logprob_reward/std": 0.306649774312973, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 592.78125, "completions/mean_terminated_length": 564.0333862304688, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.398148148148148, "grad_norm": 1.6135724161541016, "kl": 0.6083984375, "learning_rate": 1.234158670493803e-07, "loss": 0.0161, "num_tokens": 30425055.0, "reward": 0.3206292688846588, "reward_std": 0.1352352797985077, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2555602788925171, "rewards/logprob_reward/std": 0.33655843138694763, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 743.09375, "completions/mean_terminated_length": 664.4400024414062, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 3.4012345679012346, "grad_norm": 1.7417311233185393, "kl": 0.58642578125, "learning_rate": 1.229850083557695e-07, "loss": -0.3009, "num_tokens": 30455222.0, "reward": 0.29841750860214233, "reward_std": 0.22552506625652313, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.25518614053726196, "rewards/logprob_reward/std": 0.3033534288406372, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 571.28125, "completions/mean_terminated_length": 506.607177734375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 3.4043209876543212, "grad_norm": 1.553664033155292, "kl": 0.6123046875, "learning_rate": 1.2255465759022913e-07, "loss": -0.1514, "num_tokens": 30480195.0, "reward": 0.3899286389350891, "reward_std": 0.09467620402574539, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.34297633171081543, "rewards/logprob_reward/std": 0.3824572265148163, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 632.6875, "completions/mean_terminated_length": 606.6000366210938, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 3.4074074074074074, "grad_norm": 1.6020690395475483, "kl": 0.61669921875, "learning_rate": 1.2212481647371542e-07, "loss": -0.1178, "num_tokens": 30506949.0, "reward": 0.5209455490112305, "reward_std": 0.06658852100372314, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4746617078781128, "rewards/logprob_reward/std": 0.3561067581176758, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 597.15625, "completions/mean_terminated_length": 536.1785888671875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 3.4104938271604937, "grad_norm": 1.674948967576668, "kl": 0.64306640625, "learning_rate": 1.2169548672514625e-07, "loss": -0.1577, "num_tokens": 30532090.0, "reward": 0.3671531081199646, "reward_std": 0.14162789285182953, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3176701068878174, "rewards/logprob_reward/std": 0.2842181921005249, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 609.4375, "completions/mean_terminated_length": 532.6666870117188, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 3.4135802469135803, "grad_norm": 1.729990583446982, "kl": 0.66162109375, "learning_rate": 1.2126667006139495e-07, "loss": -0.1978, "num_tokens": 30558116.0, "reward": 0.3402765095233917, "reward_std": 0.15435782074928284, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.29475167393684387, "rewards/logprob_reward/std": 0.3170633018016815, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 692.0, "completions/mean_terminated_length": 562.0869750976562, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 3.4166666666666665, "grad_norm": 1.6725028747032116, "kl": 0.58935546875, "learning_rate": 1.208383681972829e-07, "loss": -0.128, "num_tokens": 30586620.0, "reward": 0.34621477127075195, "reward_std": 0.2143259197473526, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3048219680786133, "rewards/logprob_reward/std": 0.29767462611198425, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 637.875, "completions/mean_terminated_length": 566.370361328125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 3.419753086419753, "grad_norm": 1.5362688167067555, "kl": 0.587158203125, "learning_rate": 1.2041058284557277e-07, "loss": -0.1522, "num_tokens": 30613736.0, "reward": 0.35709428787231445, "reward_std": 0.05244668200612068, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3030214309692383, "rewards/logprob_reward/std": 0.3098762631416321, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 744.0625, "completions/mean_terminated_length": 634.521728515625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 3.4228395061728394, "grad_norm": 1.3871456427215179, "kl": 0.554931640625, "learning_rate": 1.1998331571696162e-07, "loss": -0.0913, "num_tokens": 30644542.0, "reward": 0.3386214077472687, "reward_std": 0.17864054441452026, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.29985710978507996, "rewards/logprob_reward/std": 0.322572261095047, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 629.3125, "completions/mean_terminated_length": 556.2222290039062, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 3.425925925925926, "grad_norm": 1.6339179506348098, "kl": 0.647705078125, "learning_rate": 1.1955656852007438e-07, "loss": 0.0338, "num_tokens": 30671212.0, "reward": 0.23168964684009552, "reward_std": 0.04667447507381439, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.16368290781974792, "rewards/logprob_reward/std": 0.32006317377090454, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 698.34375, "completions/mean_terminated_length": 623.1923217773438, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 3.4290123456790123, "grad_norm": 1.3734706910204084, "kl": 0.62109375, "learning_rate": 1.1913034296145669e-07, "loss": -0.05, "num_tokens": 30699743.0, "reward": 0.33158159255981445, "reward_std": 0.14356598258018494, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2885628640651703, "rewards/logprob_reward/std": 0.33034321665763855, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 616.0, "completions/mean_terminated_length": 602.8386840820312, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 3.432098765432099, "grad_norm": 1.3672520203559115, "kl": 0.6162109375, "learning_rate": 1.1870464074556816e-07, "loss": -0.1592, "num_tokens": 30725871.0, "reward": 0.32787197828292847, "reward_std": 0.11220750212669373, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2670799493789673, "rewards/logprob_reward/std": 0.30624881386756897, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 700.6875, "completions/mean_terminated_length": 626.0769653320312, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 3.435185185185185, "grad_norm": 1.4842924747501292, "kl": 0.63427734375, "learning_rate": 1.1827946357477559e-07, "loss": -0.0607, "num_tokens": 30755213.0, "reward": 0.29215937852859497, "reward_std": 0.2283041775226593, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2447604089975357, "rewards/logprob_reward/std": 0.29037296772003174, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 638.34375, "completions/mean_terminated_length": 583.25, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 3.4382716049382718, "grad_norm": 1.1175403648105628, "kl": 0.62841796875, "learning_rate": 1.1785481314934618e-07, "loss": -0.3392, "num_tokens": 30782000.0, "reward": 0.34883296489715576, "reward_std": 0.12057096511125565, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.29731443524360657, "rewards/logprob_reward/std": 0.30843859910964966, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 703.25, "completions/mean_terminated_length": 613.4400024414062, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 3.441358024691358, "grad_norm": 1.4701359863454377, "kl": 0.574951171875, "learning_rate": 1.1743069116744064e-07, "loss": -0.05, "num_tokens": 30811108.0, "reward": 0.2984737455844879, "reward_std": 0.1810063123703003, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.24830418825149536, "rewards/logprob_reward/std": 0.31538039445877075, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 577.1875, "completions/mean_terminated_length": 547.4000244140625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 3.4444444444444446, "grad_norm": 1.574966422355556, "kl": 0.65673828125, "learning_rate": 1.1700709932510656e-07, "loss": -0.1659, "num_tokens": 30835930.0, "reward": 0.4301937520503998, "reward_std": 0.054608847945928574, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.37382638454437256, "rewards/logprob_reward/std": 0.2904582619667053, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 564.71875, "completions/mean_terminated_length": 549.9031982421875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 3.447530864197531, "grad_norm": 1.6469184512864525, "kl": 0.6240234375, "learning_rate": 1.1658403931627125e-07, "loss": 0.0533, "num_tokens": 30860553.0, "reward": 0.3630666434764862, "reward_std": 0.043702732771635056, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.29576849937438965, "rewards/logprob_reward/std": 0.2920466363430023, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 647.71875, "completions/mean_terminated_length": 593.9642944335938, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 3.450617283950617, "grad_norm": 1.5109180609449904, "kl": 0.60693359375, "learning_rate": 1.1616151283273565e-07, "loss": -0.2314, "num_tokens": 30887308.0, "reward": 0.36325109004974365, "reward_std": 0.09341035038232803, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.31333452463150024, "rewards/logprob_reward/std": 0.3296356499195099, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 612.1875, "completions/mean_terminated_length": 584.7333374023438, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 3.4537037037037037, "grad_norm": 1.2577653646195082, "kl": 0.6484375, "learning_rate": 1.1573952156416672e-07, "loss": -0.1925, "num_tokens": 30912998.0, "reward": 0.5925281047821045, "reward_std": 0.1340741068124771, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5576701164245605, "rewards/logprob_reward/std": 0.33110782504081726, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 628.1875, "completions/mean_terminated_length": 601.800048828125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 3.45679012345679, "grad_norm": 1.3693786188684665, "kl": 0.609619140625, "learning_rate": 1.1531806719809142e-07, "loss": -0.211, "num_tokens": 30939404.0, "reward": 0.35620665550231934, "reward_std": 0.12374468147754669, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2985629439353943, "rewards/logprob_reward/std": 0.3304969370365143, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 688.4375, "completions/mean_terminated_length": 611.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 3.4598765432098766, "grad_norm": 1.4099214153245974, "kl": 0.62841796875, "learning_rate": 1.1489715141988954e-07, "loss": -0.03, "num_tokens": 30968078.0, "reward": 0.3758558928966522, "reward_std": 0.12142229080200195, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.33081209659576416, "rewards/logprob_reward/std": 0.3216583728790283, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 621.4375, "completions/mean_terminated_length": 579.7930908203125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.462962962962963, "grad_norm": 1.5459738210947256, "kl": 0.60498046875, "learning_rate": 1.1447677591278715e-07, "loss": -0.0277, "num_tokens": 30994868.0, "reward": 0.2888350188732147, "reward_std": 0.05116552114486694, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22023338079452515, "rewards/logprob_reward/std": 0.3013761341571808, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 608.6875, "completions/mean_terminated_length": 565.72412109375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 3.4660493827160495, "grad_norm": 1.4077430359870675, "kl": 0.6591796875, "learning_rate": 1.1405694235784972e-07, "loss": -0.0684, "num_tokens": 31020342.0, "reward": 0.4559347629547119, "reward_std": 0.17741703987121582, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4163163900375366, "rewards/logprob_reward/std": 0.3690919280052185, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 653.5625, "completions/mean_terminated_length": 584.9629516601562, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 3.4691358024691357, "grad_norm": 1.429964765753266, "kl": 0.637451171875, "learning_rate": 1.1363765243397555e-07, "loss": -0.0116, "num_tokens": 31047960.0, "reward": 0.301893413066864, "reward_std": 0.13449373841285706, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2451593428850174, "rewards/logprob_reward/std": 0.29199519753456116, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 625.21875, "completions/mean_terminated_length": 551.370361328125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 3.4722222222222223, "grad_norm": 1.5441331522373685, "kl": 0.60205078125, "learning_rate": 1.1321890781788884e-07, "loss": -0.1964, "num_tokens": 31074455.0, "reward": 0.26061758399009705, "reward_std": 0.14660178124904633, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.20276953279972076, "rewards/logprob_reward/std": 0.24858509004116058, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 597.3125, "completions/mean_terminated_length": 553.1724243164062, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 3.4753086419753085, "grad_norm": 1.7303719593753193, "kl": 0.64697265625, "learning_rate": 1.1280071018413326e-07, "loss": -0.1039, "num_tokens": 31100301.0, "reward": 0.4069730341434479, "reward_std": 0.10395349562168121, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3584422469139099, "rewards/logprob_reward/std": 0.31166872382164, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 683.0, "completions/mean_terminated_length": 604.3077392578125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 3.478395061728395, "grad_norm": 1.3987037146663976, "kl": 0.6005859375, "learning_rate": 1.1238306120506505e-07, "loss": -0.0638, "num_tokens": 31128781.0, "reward": 0.37336382269859314, "reward_std": 0.17229002714157104, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.324570894241333, "rewards/logprob_reward/std": 0.30927029252052307, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 653.78125, "completions/mean_terminated_length": 568.34619140625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 3.4814814814814814, "grad_norm": 1.4836040211534796, "kl": 0.58837890625, "learning_rate": 1.1196596255084648e-07, "loss": -0.0502, "num_tokens": 31156558.0, "reward": 0.16266363859176636, "reward_std": 0.10930730402469635, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.09740404784679413, "rewards/logprob_reward/std": 0.23911280930042267, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 590.625, "completions/mean_terminated_length": 545.7930908203125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 3.484567901234568, "grad_norm": 1.5083116691676641, "kl": 0.6181640625, "learning_rate": 1.11549415889439e-07, "loss": -0.2165, "num_tokens": 31182282.0, "reward": 0.2890745997428894, "reward_std": 0.12487000226974487, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22397175431251526, "rewards/logprob_reward/std": 0.2747332453727722, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 613.125, "completions/mean_terminated_length": 585.7333374023438, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 3.4876543209876543, "grad_norm": 1.5451935665056695, "kl": 0.65087890625, "learning_rate": 1.1113342288659683e-07, "loss": -0.0962, "num_tokens": 31208318.0, "reward": 0.5225491523742676, "reward_std": 0.11408531665802002, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.47991570830345154, "rewards/logprob_reward/std": 0.33328577876091003, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 679.40625, "completions/mean_terminated_length": 630.1785888671875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 3.490740740740741, "grad_norm": 1.634271355773306, "kl": 0.5732421875, "learning_rate": 1.1071798520585979e-07, "loss": 0.016, "num_tokens": 31237479.0, "reward": 0.39724069833755493, "reward_std": 0.18502068519592285, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3511008024215698, "rewards/logprob_reward/std": 0.3156222999095917, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 597.0, "completions/mean_terminated_length": 568.5333862304688, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 3.493827160493827, "grad_norm": 1.7317364516872387, "kl": 0.65673828125, "learning_rate": 1.1030310450854729e-07, "loss": -0.133, "num_tokens": 31262935.0, "reward": 0.380543053150177, "reward_std": 0.13363078236579895, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3256034255027771, "rewards/logprob_reward/std": 0.29789382219314575, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 653.5625, "completions/mean_terminated_length": 600.6428833007812, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 3.496913580246914, "grad_norm": 1.3863254068846573, "kl": 0.57861328125, "learning_rate": 1.0988878245375138e-07, "loss": -0.0603, "num_tokens": 31290473.0, "reward": 0.4821934700012207, "reward_std": 0.12684616446495056, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4489649534225464, "rewards/logprob_reward/std": 0.38165926933288574, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 577.5, "completions/mean_terminated_length": 531.3103637695312, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 3.5, "grad_norm": 1.6368514011955335, "kl": 0.76513671875, "learning_rate": 1.094750206983299e-07, "loss": -0.0873, "num_tokens": 31315413.0, "reward": 0.4378281235694885, "reward_std": 0.16084182262420654, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3892534375190735, "rewards/logprob_reward/std": 0.30764782428741455, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 622.03125, "completions/mean_terminated_length": 609.0645141601562, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 3.503086419753086, "grad_norm": 1.4748988839665675, "kl": 0.67578125, "learning_rate": 1.0906182089690025e-07, "loss": -0.1324, "num_tokens": 31341670.0, "reward": 0.5403875112533569, "reward_std": 0.1284765601158142, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.49626392126083374, "rewards/logprob_reward/std": 0.3353906273841858, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 666.125, "completions/mean_terminated_length": 583.5385131835938, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 3.506172839506173, "grad_norm": 1.7040330424659738, "kl": 0.604248046875, "learning_rate": 1.0864918470183258e-07, "loss": -0.1492, "num_tokens": 31369566.0, "reward": 0.33770298957824707, "reward_std": 0.13649258017539978, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2918921709060669, "rewards/logprob_reward/std": 0.30185896158218384, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 703.4375, "completions/mean_terminated_length": 644.0740966796875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 3.5092592592592595, "grad_norm": 1.5764056755628206, "kl": 0.80224609375, "learning_rate": 1.0823711376324313e-07, "loss": -0.1878, "num_tokens": 31398456.0, "reward": 0.4259088635444641, "reward_std": 0.0855926051735878, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.38295429944992065, "rewards/logprob_reward/std": 0.34812501072883606, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 579.15625, "completions/mean_terminated_length": 533.137939453125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 3.5123456790123457, "grad_norm": 1.668065532416565, "kl": 0.66357421875, "learning_rate": 1.0782560972898783e-07, "loss": -0.103, "num_tokens": 31423417.0, "reward": 0.4618340730667114, "reward_std": 0.1394243836402893, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4159267544746399, "rewards/logprob_reward/std": 0.3394949436187744, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 676.15625, "completions/mean_terminated_length": 560.2083740234375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 3.515432098765432, "grad_norm": 1.8437580829286215, "kl": 0.69921875, "learning_rate": 1.0741467424465544e-07, "loss": -0.188, "num_tokens": 31451266.0, "reward": 0.31274715065956116, "reward_std": 0.21833008527755737, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.2711079716682434, "rewards/logprob_reward/std": 0.3479544222354889, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 693.1875, "completions/mean_terminated_length": 645.9285888671875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 3.5185185185185186, "grad_norm": 1.4400221435359177, "kl": 0.572509765625, "learning_rate": 1.0700430895356119e-07, "loss": -0.0241, "num_tokens": 31479784.0, "reward": 0.3138567805290222, "reward_std": 0.06348590552806854, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2549797296524048, "rewards/logprob_reward/std": 0.3326827585697174, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 653.75, "completions/mean_terminated_length": 641.8064575195312, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 3.521604938271605, "grad_norm": 1.5457859257966657, "kl": 0.669921875, "learning_rate": 1.0659451549674018e-07, "loss": -0.09, "num_tokens": 31507004.0, "reward": 0.39033544063568115, "reward_std": 0.12292561680078506, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33648383617401123, "rewards/logprob_reward/std": 0.31555411219596863, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 717.78125, "completions/mean_terminated_length": 632.0399780273438, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 3.5246913580246915, "grad_norm": 1.7182697658538861, "kl": 0.650390625, "learning_rate": 1.0618529551294053e-07, "loss": -0.0203, "num_tokens": 31536409.0, "reward": 0.2874813675880432, "reward_std": 0.17831245064735413, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.24303485453128815, "rewards/logprob_reward/std": 0.2871967554092407, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 643.0625, "completions/mean_terminated_length": 588.6428833007812, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 3.5277777777777777, "grad_norm": 1.7757731467602669, "kl": 0.63037109375, "learning_rate": 1.0577665063861735e-07, "loss": -0.209, "num_tokens": 31563319.0, "reward": 0.3928435146808624, "reward_std": 0.09342263638973236, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.34621500968933105, "rewards/logprob_reward/std": 0.38094180822372437, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 680.21875, "completions/mean_terminated_length": 657.300048828125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 3.5308641975308643, "grad_norm": 1.2800563297293623, "kl": 0.591796875, "learning_rate": 1.0536858250792582e-07, "loss": -0.0202, "num_tokens": 31591706.0, "reward": 0.34894514083862305, "reward_std": 0.05065072700381279, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2870224118232727, "rewards/logprob_reward/std": 0.2924309968948364, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 654.5625, "completions/mean_terminated_length": 616.3448486328125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 3.5339506172839505, "grad_norm": 1.7516122967473882, "kl": 0.645751953125, "learning_rate": 1.0496109275271456e-07, "loss": -0.0696, "num_tokens": 31619280.0, "reward": 0.25322163105010986, "reward_std": 0.027570677921175957, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.18760736286640167, "rewards/logprob_reward/std": 0.3199167251586914, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 630.46875, "completions/mean_terminated_length": 604.2333374023438, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 3.537037037037037, "grad_norm": 1.507481899739302, "kl": 0.6083984375, "learning_rate": 1.0455418300251953e-07, "loss": -0.1099, "num_tokens": 31645411.0, "reward": 0.3934812843799591, "reward_std": 0.1197938621044159, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3399792015552521, "rewards/logprob_reward/std": 0.29883864521980286, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 679.1875, "completions/mean_terminated_length": 615.3333129882812, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 3.5401234567901234, "grad_norm": 1.8708089886823052, "kl": 0.576171875, "learning_rate": 1.0414785488455718e-07, "loss": -0.189, "num_tokens": 31673389.0, "reward": 0.29044121503829956, "reward_std": 0.07480251789093018, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2359069436788559, "rewards/logprob_reward/std": 0.2968895137310028, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 614.375, "completions/mean_terminated_length": 587.0667114257812, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 3.5432098765432096, "grad_norm": 1.5134122161779358, "kl": 0.66357421875, "learning_rate": 1.0374211002371808e-07, "loss": -0.0345, "num_tokens": 31699073.0, "reward": 0.3460198640823364, "reward_std": 0.044870004057884216, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2802998721599579, "rewards/logprob_reward/std": 0.3291281759738922, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 638.125, "completions/mean_terminated_length": 625.6774291992188, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 3.5462962962962963, "grad_norm": 1.4770480309257032, "kl": 0.599853515625, "learning_rate": 1.0333695004256035e-07, "loss": -0.125, "num_tokens": 31725805.0, "reward": 0.3110332787036896, "reward_std": 0.09296546876430511, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24837031960487366, "rewards/logprob_reward/std": 0.33549872040748596, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 567.625, "completions/mean_terminated_length": 537.2000122070312, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 3.549382716049383, "grad_norm": 1.8854231529904184, "kl": 0.68603515625, "learning_rate": 1.0293237656130304e-07, "loss": -0.0304, "num_tokens": 31751137.0, "reward": 0.3460163176059723, "reward_std": 0.10626482218503952, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2907125949859619, "rewards/logprob_reward/std": 0.2519916594028473, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 677.0625, "completions/mean_terminated_length": 597.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 3.552469135802469, "grad_norm": 1.3466357686086194, "kl": 0.61279296875, "learning_rate": 1.0252839119782006e-07, "loss": -0.1561, "num_tokens": 31779243.0, "reward": 0.43575024604797363, "reward_std": 0.14454954862594604, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3973613679409027, "rewards/logprob_reward/std": 0.36536338925361633, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 546.75, "completions/mean_terminated_length": 514.933349609375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 3.5555555555555554, "grad_norm": 1.591001495161738, "kl": 0.6767578125, "learning_rate": 1.0212499556763335e-07, "loss": -0.0449, "num_tokens": 31803455.0, "reward": 0.27542269229888916, "reward_std": 0.024573683738708496, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2018585354089737, "rewards/logprob_reward/std": 0.3127794563770294, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 623.03125, "completions/mean_terminated_length": 581.5516967773438, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 3.558641975308642, "grad_norm": 1.8958624780819886, "kl": 0.679931640625, "learning_rate": 1.017221912839065e-07, "loss": -0.2078, "num_tokens": 31829752.0, "reward": 0.37684008479118347, "reward_std": 0.13595686852931976, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.32148897647857666, "rewards/logprob_reward/std": 0.29760798811912537, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 593.9375, "completions/mean_terminated_length": 549.4483032226562, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 3.5617283950617287, "grad_norm": 1.4516619592217925, "kl": 0.60205078125, "learning_rate": 1.0131997995743838e-07, "loss": -0.0025, "num_tokens": 31855526.0, "reward": 0.33809512853622437, "reward_std": 0.0680900290608406, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.274966835975647, "rewards/logprob_reward/std": 0.3176954686641693, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 647.21875, "completions/mean_terminated_length": 593.3928833007812, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 3.564814814814815, "grad_norm": 1.4859159995456148, "kl": 0.6328125, "learning_rate": 1.0091836319665664e-07, "loss": -0.0988, "num_tokens": 31882585.0, "reward": 0.4344070553779602, "reward_std": 0.0973142683506012, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3854522705078125, "rewards/logprob_reward/std": 0.2963982820510864, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 589.5, "completions/mean_terminated_length": 589.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 3.567901234567901, "grad_norm": 1.4863044330296242, "kl": 0.689453125, "learning_rate": 1.0051734260761135e-07, "loss": -0.1654, "num_tokens": 31907385.0, "reward": 0.37134212255477905, "reward_std": 0.10774151980876923, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.30843567848205566, "rewards/logprob_reward/std": 0.2768935561180115, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 669.84375, "completions/mean_terminated_length": 633.2069091796875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 3.5709876543209877, "grad_norm": 1.5475146969072169, "kl": 0.57373046875, "learning_rate": 1.0011691979396827e-07, "loss": -0.0515, "num_tokens": 31935376.0, "reward": 0.4923964738845825, "reward_std": 0.12749487161636353, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.44988495111465454, "rewards/logprob_reward/std": 0.3246994614601135, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 643.09375, "completions/mean_terminated_length": 603.6896362304688, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 3.574074074074074, "grad_norm": 1.600056129508097, "kl": 0.662109375, "learning_rate": 9.971709635700301e-08, "loss": -0.0542, "num_tokens": 31961863.0, "reward": 0.5149340629577637, "reward_std": 0.10323864966630936, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4749267101287842, "rewards/logprob_reward/std": 0.34150388836860657, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 777.71875, "completions/mean_terminated_length": 695.625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 3.5771604938271606, "grad_norm": 1.5411699704019122, "kl": 0.59912109375, "learning_rate": 9.931787389559393e-08, "loss": -0.1556, "num_tokens": 31993134.0, "reward": 0.2163485884666443, "reward_std": 0.2078610360622406, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.17441509664058685, "rewards/logprob_reward/std": 0.26096010208129883, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 608.75, "completions/mean_terminated_length": 565.7930908203125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 3.580246913580247, "grad_norm": 1.3233918237497488, "kl": 0.63623046875, "learning_rate": 9.891925400621642e-08, "loss": -0.0508, "num_tokens": 32018934.0, "reward": 0.21660475432872772, "reward_std": 0.09873118251562119, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.14692194759845734, "rewards/logprob_reward/std": 0.2692233622074127, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 632.8125, "completions/mean_terminated_length": 592.3448486328125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 3.5833333333333335, "grad_norm": 1.4266294943451288, "kl": 0.662109375, "learning_rate": 9.852123828293612e-08, "loss": -0.0735, "num_tokens": 32045372.0, "reward": 0.3539179861545563, "reward_std": 0.10721288621425629, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2960200011730194, "rewards/logprob_reward/std": 0.29382362961769104, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 663.90625, "completions/mean_terminated_length": 612.4642944335938, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 3.5864197530864197, "grad_norm": 1.3745473218527964, "kl": 0.534912109375, "learning_rate": 9.812382831740259e-08, "loss": -0.2678, "num_tokens": 32072937.0, "reward": 0.32847434282302856, "reward_std": 0.1505904644727707, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.27816593647003174, "rewards/logprob_reward/std": 0.3010132610797882, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 645.34375, "completions/mean_terminated_length": 620.1000366210938, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 3.5895061728395063, "grad_norm": 1.6531163888274811, "kl": 0.6328125, "learning_rate": 9.772702569884301e-08, "loss": -0.0394, "num_tokens": 32100028.0, "reward": 0.35335785150527954, "reward_std": 0.16304102540016174, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2988698184490204, "rewards/logprob_reward/std": 0.30086803436279297, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 623.90625, "completions/mean_terminated_length": 582.5172119140625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 3.5925925925925926, "grad_norm": 1.56234783734129, "kl": 0.7177734375, "learning_rate": 9.733083201405578e-08, "loss": -0.0633, "num_tokens": 32126377.0, "reward": 0.28468450903892517, "reward_std": 0.12321436405181885, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2260383516550064, "rewards/logprob_reward/std": 0.33295297622680664, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 687.625, "completions/mean_terminated_length": 639.5714721679688, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.5956790123456788, "grad_norm": 1.4613568960213625, "kl": 0.585693359375, "learning_rate": 9.693524884740425e-08, "loss": -0.0326, "num_tokens": 32154553.0, "reward": 0.32489991188049316, "reward_std": 0.1478879451751709, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2672499120235443, "rewards/logprob_reward/std": 0.3002356290817261, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 618.09375, "completions/mean_terminated_length": 560.107177734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 3.5987654320987654, "grad_norm": 1.6318142002034077, "kl": 0.68115234375, "learning_rate": 9.654027778081042e-08, "loss": -0.313, "num_tokens": 32181092.0, "reward": 0.38554126024246216, "reward_std": 0.26342812180519104, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3485180735588074, "rewards/logprob_reward/std": 0.35556691884994507, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 581.5625, "completions/mean_terminated_length": 518.357177734375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 3.601851851851852, "grad_norm": 1.725743278607369, "kl": 0.7021484375, "learning_rate": 9.614592039374817e-08, "loss": -0.0742, "num_tokens": 32206258.0, "reward": 0.3572676181793213, "reward_std": 0.09078796952962875, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30321401357650757, "rewards/logprob_reward/std": 0.3150313198566437, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 645.46875, "completions/mean_terminated_length": 606.3103637695312, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 3.6049382716049383, "grad_norm": 1.9572532310456967, "kl": 0.61328125, "learning_rate": 9.575217826323761e-08, "loss": 0.0323, "num_tokens": 32232697.0, "reward": 0.3760623335838318, "reward_std": 0.058747660368680954, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31715255975723267, "rewards/logprob_reward/std": 0.3510567843914032, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 662.59375, "completions/mean_terminated_length": 595.6666870117188, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 3.6080246913580245, "grad_norm": 1.456986651999529, "kl": 0.68212890625, "learning_rate": 9.535905296383848e-08, "loss": -0.092, "num_tokens": 32259804.0, "reward": 0.3835259675979614, "reward_std": 0.20102471113204956, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3393344283103943, "rewards/logprob_reward/std": 0.37340250611305237, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 578.9375, "completions/mean_terminated_length": 549.2667236328125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 3.611111111111111, "grad_norm": 1.62462139531255, "kl": 0.65966796875, "learning_rate": 9.496654606764373e-08, "loss": 0.0212, "num_tokens": 32285646.0, "reward": 0.5288812518119812, "reward_std": 0.1287810206413269, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.48347920179367065, "rewards/logprob_reward/std": 0.3513549864292145, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 672.96875, "completions/mean_terminated_length": 555.9583740234375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 3.6141975308641974, "grad_norm": 1.6873621557548437, "kl": 0.612548828125, "learning_rate": 9.457465914427326e-08, "loss": -0.022, "num_tokens": 32314073.0, "reward": 0.3767319321632385, "reward_std": 0.2978549003601074, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3387298882007599, "rewards/logprob_reward/std": 0.36025676131248474, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 604.6875, "completions/mean_terminated_length": 561.3103637695312, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 3.617283950617284, "grad_norm": 1.4320655822777943, "kl": 0.66748046875, "learning_rate": 9.418339376086785e-08, "loss": 0.0379, "num_tokens": 32339535.0, "reward": 0.4400266408920288, "reward_std": 0.07509122043848038, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3916962444782257, "rewards/logprob_reward/std": 0.3412719964981079, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 641.90625, "completions/mean_terminated_length": 602.3793334960938, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 3.6203703703703702, "grad_norm": 1.5953023823096595, "kl": 0.61328125, "learning_rate": 9.379275148208276e-08, "loss": -0.0791, "num_tokens": 32367132.0, "reward": 0.38722512125968933, "reward_std": 0.14664363861083984, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3365001380443573, "rewards/logprob_reward/std": 0.33196738362312317, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 603.5625, "completions/mean_terminated_length": 543.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 3.623456790123457, "grad_norm": 1.6155064783795163, "kl": 0.6669921875, "learning_rate": 9.340273387008152e-08, "loss": -0.113, "num_tokens": 32393206.0, "reward": 0.2770156264305115, "reward_std": 0.1713065505027771, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.22793403267860413, "rewards/logprob_reward/std": 0.2620437443256378, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 613.96875, "completions/mean_terminated_length": 600.741943359375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 3.626543209876543, "grad_norm": 1.4701931713445366, "kl": 0.60302734375, "learning_rate": 9.30133424845294e-08, "loss": -0.165, "num_tokens": 32419849.0, "reward": 0.40696001052856445, "reward_std": 0.052062686532735825, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3514833152294159, "rewards/logprob_reward/std": 0.37287959456443787, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 611.375, "completions/mean_terminated_length": 598.0645141601562, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 3.6296296296296298, "grad_norm": 1.593588221321078, "kl": 0.6025390625, "learning_rate": 9.26245788825877e-08, "loss": -0.0822, "num_tokens": 32445729.0, "reward": 0.4661984443664551, "reward_std": 0.09785902500152588, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.41035935282707214, "rewards/logprob_reward/std": 0.3363848328590393, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 718.75, "completions/mean_terminated_length": 648.3077392578125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 3.632716049382716, "grad_norm": 1.7170134602939677, "kl": 0.66357421875, "learning_rate": 9.223644461890711e-08, "loss": -0.0532, "num_tokens": 32475225.0, "reward": 0.3788502812385559, "reward_std": 0.2433241456747055, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3341391682624817, "rewards/logprob_reward/std": 0.32729458808898926, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 652.875, "completions/mean_terminated_length": 628.1333618164062, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 3.6358024691358026, "grad_norm": 1.5846019829879863, "kl": 0.60986328125, "learning_rate": 9.184894124562162e-08, "loss": -0.0865, "num_tokens": 32502957.0, "reward": 0.3730984330177307, "reward_std": 0.08567028492689133, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.31385934352874756, "rewards/logprob_reward/std": 0.34269511699676514, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 700.0, "completions/mean_terminated_length": 640.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 3.638888888888889, "grad_norm": 1.4249872454453294, "kl": 0.67431640625, "learning_rate": 9.146207031234232e-08, "loss": -0.1728, "num_tokens": 32531377.0, "reward": 0.2653504014015198, "reward_std": 0.15301001071929932, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.21150043606758118, "rewards/logprob_reward/std": 0.29029980301856995, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 706.09375, "completions/mean_terminated_length": 647.2222290039062, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 3.6419753086419755, "grad_norm": 1.5245203406790724, "kl": 0.595458984375, "learning_rate": 9.107583336615124e-08, "loss": -0.0112, "num_tokens": 32560184.0, "reward": 0.3728831112384796, "reward_std": 0.18103161454200745, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.32750898599624634, "rewards/logprob_reward/std": 0.2768535315990448, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 653.03125, "completions/mean_terminated_length": 641.0645141601562, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.6450617283950617, "grad_norm": 1.325835995890627, "kl": 0.59814453125, "learning_rate": 9.069023195159505e-08, "loss": -0.2285, "num_tokens": 32587845.0, "reward": 0.385254830121994, "reward_std": 0.17644436657428741, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.330838680267334, "rewards/logprob_reward/std": 0.31310945749282837, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 690.96875, "completions/mean_terminated_length": 597.719970703125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 3.648148148148148, "grad_norm": 1.4873248138954542, "kl": 0.60791015625, "learning_rate": 9.030526761067911e-08, "loss": -0.0957, "num_tokens": 32616660.0, "reward": 0.319694846868515, "reward_std": 0.15912795066833496, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.26841095089912415, "rewards/logprob_reward/std": 0.28951480984687805, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 652.40625, "completions/mean_terminated_length": 548.3599853515625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 3.6512345679012346, "grad_norm": 1.749314585624194, "kl": 0.6796875, "learning_rate": 8.992094188286081e-08, "loss": -0.1158, "num_tokens": 32643993.0, "reward": 0.22635987401008606, "reward_std": 0.1640135943889618, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.17164987325668335, "rewards/logprob_reward/std": 0.2694656550884247, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 635.21875, "completions/mean_terminated_length": 579.6785888671875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 3.6543209876543212, "grad_norm": 1.7094323373539235, "kl": 0.62939453125, "learning_rate": 8.953725630504419e-08, "loss": -0.0012, "num_tokens": 32671072.0, "reward": 0.44129329919815063, "reward_std": 0.08078556507825851, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3931036591529846, "rewards/logprob_reward/std": 0.3803010880947113, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 657.9375, "completions/mean_terminated_length": 605.6428833007812, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 3.6574074074074074, "grad_norm": 1.6103356055998646, "kl": 0.6259765625, "learning_rate": 8.915421241157292e-08, "loss": -0.0971, "num_tokens": 32698598.0, "reward": 0.3443487286567688, "reward_std": 0.11127196997404099, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.28885969519615173, "rewards/logprob_reward/std": 0.311379611492157, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 568.0625, "completions/mean_terminated_length": 520.8965454101562, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 3.6604938271604937, "grad_norm": 1.5693058007396616, "kl": 0.65673828125, "learning_rate": 8.877181173422487e-08, "loss": -0.1664, "num_tokens": 32723036.0, "reward": 0.33123403787612915, "reward_std": 0.21193596720695496, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.27776002883911133, "rewards/logprob_reward/std": 0.36652976274490356, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 635.28125, "completions/mean_terminated_length": 609.36669921875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 3.6635802469135803, "grad_norm": 1.4563262579743834, "kl": 0.630859375, "learning_rate": 8.839005580220574e-08, "loss": -0.0113, "num_tokens": 32749777.0, "reward": 0.48891663551330566, "reward_std": 0.08150345087051392, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.43907400965690613, "rewards/logprob_reward/std": 0.3934820890426636, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 592.59375, "completions/mean_terminated_length": 563.8333740234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 3.6666666666666665, "grad_norm": 1.5428853485683154, "kl": 0.6220703125, "learning_rate": 8.800894614214274e-08, "loss": -0.0719, "num_tokens": 32775452.0, "reward": 0.29155224561691284, "reward_std": 0.06475499272346497, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22672472894191742, "rewards/logprob_reward/std": 0.30406031012535095, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 667.75, "completions/mean_terminated_length": 616.857177734375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 3.669753086419753, "grad_norm": 1.4553005566843147, "kl": 0.6611328125, "learning_rate": 8.762848427807882e-08, "loss": -0.0833, "num_tokens": 32802952.0, "reward": 0.1873951554298401, "reward_std": 0.1251406967639923, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.12488351762294769, "rewards/logprob_reward/std": 0.22092244029045105, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 666.0, "completions/mean_terminated_length": 599.7037353515625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 3.6728395061728394, "grad_norm": 1.3585231154655995, "kl": 0.593017578125, "learning_rate": 8.724867173146633e-08, "loss": -0.1728, "num_tokens": 32830504.0, "reward": 0.26585179567337036, "reward_std": 0.13681931793689728, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20511312782764435, "rewards/logprob_reward/std": 0.24927610158920288, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 733.25, "completions/mean_terminated_length": 666.1538696289062, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 3.675925925925926, "grad_norm": 1.638088425789349, "kl": 0.59814453125, "learning_rate": 8.686951002116111e-08, "loss": -0.1325, "num_tokens": 32861084.0, "reward": 0.33138343691825867, "reward_std": 0.10933730751276016, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2779260277748108, "rewards/logprob_reward/std": 0.3327729403972626, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 667.90625, "completions/mean_terminated_length": 601.9629516601562, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 3.6790123456790123, "grad_norm": 1.6004225819987363, "kl": 0.6337890625, "learning_rate": 8.649100066341614e-08, "loss": -0.0643, "num_tokens": 32889045.0, "reward": 0.3450513482093811, "reward_std": 0.13792775571346283, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.293112576007843, "rewards/logprob_reward/std": 0.2576487958431244, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 661.34375, "completions/mean_terminated_length": 594.1851806640625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 3.682098765432099, "grad_norm": 1.6200488567306581, "kl": 0.593505859375, "learning_rate": 8.611314517187584e-08, "loss": -0.0302, "num_tokens": 32917444.0, "reward": 0.2871719002723694, "reward_std": 0.10577880591154099, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2288021445274353, "rewards/logprob_reward/std": 0.30300500988960266, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 622.1875, "completions/mean_terminated_length": 547.7777709960938, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 3.685185185185185, "grad_norm": 1.2872324042499943, "kl": 0.650390625, "learning_rate": 8.573594505756982e-08, "loss": -0.1888, "num_tokens": 32944022.0, "reward": 0.3486315608024597, "reward_std": 0.13570910692214966, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.29361841082572937, "rewards/logprob_reward/std": 0.272303968667984, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 606.75, "completions/mean_terminated_length": 529.4815063476562, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 3.6882716049382713, "grad_norm": 1.3895911331455617, "kl": 0.631591796875, "learning_rate": 8.535940182890685e-08, "loss": -0.1253, "num_tokens": 32970270.0, "reward": 0.2680356502532959, "reward_std": 0.11199108511209488, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2179563045501709, "rewards/logprob_reward/std": 0.25299572944641113, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 643.125, "completions/mean_terminated_length": 630.8386840820312, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 3.691358024691358, "grad_norm": 1.568193631528757, "kl": 0.634033203125, "learning_rate": 8.498351699166889e-08, "loss": -0.0909, "num_tokens": 32997170.0, "reward": 0.41443365812301636, "reward_std": 0.07908113300800323, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.35631516575813293, "rewards/logprob_reward/std": 0.32262909412384033, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 599.1875, "completions/mean_terminated_length": 585.4838256835938, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 3.6944444444444446, "grad_norm": 1.3736160268614062, "kl": 0.62109375, "learning_rate": 8.460829204900483e-08, "loss": -0.0882, "num_tokens": 33022596.0, "reward": 0.5498887300491333, "reward_std": 0.13131669163703918, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.5137653350830078, "rewards/logprob_reward/std": 0.3385542333126068, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 667.65625, "completions/mean_terminated_length": 601.6666870117188, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 3.697530864197531, "grad_norm": 1.5113605993523962, "kl": 0.647216796875, "learning_rate": 8.423372850142482e-08, "loss": -0.0297, "num_tokens": 33050333.0, "reward": 0.3848552405834198, "reward_std": 0.19258618354797363, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3373391628265381, "rewards/logprob_reward/std": 0.325681209564209, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 648.75, "completions/mean_terminated_length": 543.6799926757812, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 3.700617283950617, "grad_norm": 1.5715599285755708, "kl": 0.60009765625, "learning_rate": 8.385982784679416e-08, "loss": 0.0104, "num_tokens": 33077729.0, "reward": 0.2544242739677429, "reward_std": 0.04725019633769989, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.19588808715343475, "rewards/logprob_reward/std": 0.31646108627319336, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 711.03125, "completions/mean_terminated_length": 638.8077392578125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 3.7037037037037037, "grad_norm": 1.5672091872910265, "kl": 0.62890625, "learning_rate": 8.348659158032723e-08, "loss": -0.037, "num_tokens": 33107162.0, "reward": 0.2589210271835327, "reward_std": 0.1481604129076004, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.19741225242614746, "rewards/logprob_reward/std": 0.2522323429584503, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 632.6875, "completions/mean_terminated_length": 592.2069091796875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 3.7067901234567904, "grad_norm": 1.4446864622564701, "kl": 0.6064453125, "learning_rate": 8.311402119458138e-08, "loss": -0.0665, "num_tokens": 33134608.0, "reward": 0.44427964091300964, "reward_std": 0.09629363566637039, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3964218199253082, "rewards/logprob_reward/std": 0.3845255970954895, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 533.8333740234375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 3.7098765432098766, "grad_norm": 1.6051224747015462, "kl": 0.6484375, "learning_rate": 8.274211817945135e-08, "loss": 0.0662, "num_tokens": 33158923.0, "reward": 0.2796996235847473, "reward_std": 0.056269869208335876, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.21008290350437164, "rewards/logprob_reward/std": 0.3029422163963318, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 685.28125, "completions/mean_terminated_length": 607.1154174804688, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.712962962962963, "grad_norm": 1.5182309814475454, "kl": 0.595458984375, "learning_rate": 8.237088402216297e-08, "loss": -0.1099, "num_tokens": 33187300.0, "reward": 0.3590266704559326, "reward_std": 0.16204357147216797, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3155851662158966, "rewards/logprob_reward/std": 0.35690730810165405, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 647.09375, "completions/mean_terminated_length": 577.2963256835938, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 3.7160493827160495, "grad_norm": 1.7803198239822073, "kl": 0.657470703125, "learning_rate": 8.20003202072674e-08, "loss": 0.0406, "num_tokens": 33214719.0, "reward": 0.35787051916122437, "reward_std": 0.16010317206382751, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30735617876052856, "rewards/logprob_reward/std": 0.3296995759010315, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 597.3125, "completions/mean_terminated_length": 553.1724243164062, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 3.7191358024691357, "grad_norm": 1.5941732340107966, "kl": 0.6005859375, "learning_rate": 8.163042821663507e-08, "loss": -0.1959, "num_tokens": 33240189.0, "reward": 0.47532472014427185, "reward_std": 0.1232820451259613, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.43438857793807983, "rewards/logprob_reward/std": 0.3066123127937317, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 576.78125, "completions/mean_terminated_length": 546.9666748046875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 3.7222222222222223, "grad_norm": 1.592802087009154, "kl": 0.701171875, "learning_rate": 8.126120952944987e-08, "loss": 0.0804, "num_tokens": 33265174.0, "reward": 0.3203010559082031, "reward_std": 0.039864569902420044, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.25519564747810364, "rewards/logprob_reward/std": 0.2906298041343689, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 643.71875, "completions/mean_terminated_length": 573.2963256835938, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 3.7253086419753085, "grad_norm": 1.5986783906247357, "kl": 0.6455078125, "learning_rate": 8.089266562220312e-08, "loss": 0.0022, "num_tokens": 33292005.0, "reward": 0.4726009964942932, "reward_std": 0.15671907365322113, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.43483442068099976, "rewards/logprob_reward/std": 0.36324116587638855, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 662.25, "completions/mean_terminated_length": 638.1333618164062, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 3.728395061728395, "grad_norm": 1.6335610625745935, "kl": 0.70556640625, "learning_rate": 8.052479796868784e-08, "loss": -0.2793, "num_tokens": 33319361.0, "reward": 0.5669397711753845, "reward_std": 0.14754000306129456, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.5327109098434448, "rewards/logprob_reward/std": 0.36529815196990967, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 635.4375, "completions/mean_terminated_length": 563.4815063476562, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 3.7314814814814814, "grad_norm": 1.5211155696144696, "kl": 0.69970703125, "learning_rate": 8.015760803999244e-08, "loss": -0.0974, "num_tokens": 33346279.0, "reward": 0.2777225971221924, "reward_std": 0.1545521467924118, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2217751145362854, "rewards/logprob_reward/std": 0.2927882671356201, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 600.34375, "completions/mean_terminated_length": 556.5172119140625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 3.734567901234568, "grad_norm": 1.8142316803183383, "kl": 0.65576171875, "learning_rate": 7.979109730449552e-08, "loss": -0.1916, "num_tokens": 33371954.0, "reward": 0.28780338168144226, "reward_std": 0.1453692764043808, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.22950373589992523, "rewards/logprob_reward/std": 0.27637821435928345, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 641.0625, "completions/mean_terminated_length": 586.357177734375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 3.7376543209876543, "grad_norm": 1.7673273085263494, "kl": 0.67578125, "learning_rate": 7.942526722785927e-08, "loss": -0.1766, "num_tokens": 33398596.0, "reward": 0.4262838363647461, "reward_std": 0.2134953737258911, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.39031538367271423, "rewards/logprob_reward/std": 0.3708057403564453, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 679.34375, "completions/mean_terminated_length": 599.8077392578125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 3.7407407407407405, "grad_norm": 1.7607445545693379, "kl": 0.658203125, "learning_rate": 7.906011927302417e-08, "loss": 0.0339, "num_tokens": 33427619.0, "reward": 0.3066402077674866, "reward_std": 0.14983443915843964, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2504335641860962, "rewards/logprob_reward/std": 0.289556086063385, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 666.0625, "completions/mean_terminated_length": 599.7777709960938, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 3.743827160493827, "grad_norm": 1.5917843632473188, "kl": 0.630615234375, "learning_rate": 7.869565490020288e-08, "loss": 0.0172, "num_tokens": 33455361.0, "reward": 0.44125351309776306, "reward_std": 0.15123844146728516, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3965316414833069, "rewards/logprob_reward/std": 0.33952760696411133, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 572.5625, "completions/mean_terminated_length": 572.5625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.746913580246914, "grad_norm": 1.4503809506299967, "kl": 0.64111328125, "learning_rate": 7.833187556687443e-08, "loss": -0.089, "num_tokens": 33479707.0, "reward": 0.45401543378829956, "reward_std": 0.10067898035049438, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4072393476963043, "rewards/logprob_reward/std": 0.3262330889701843, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 616.0625, "completions/mean_terminated_length": 557.7857666015625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 3.75, "grad_norm": 1.6946134622446645, "kl": 0.6689453125, "learning_rate": 7.796878272777835e-08, "loss": -0.1119, "num_tokens": 33506025.0, "reward": 0.3758144974708557, "reward_std": 0.16854849457740784, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.32382166385650635, "rewards/logprob_reward/std": 0.2726430594921112, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 595.125, "completions/mean_terminated_length": 550.7586059570312, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 3.753086419753086, "grad_norm": 1.653090980815534, "kl": 0.67822265625, "learning_rate": 7.760637783490906e-08, "loss": -0.0688, "num_tokens": 33531141.0, "reward": 0.4190971553325653, "reward_std": 0.0972963199019432, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.36496907472610474, "rewards/logprob_reward/std": 0.3415915369987488, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 660.1875, "completions/mean_terminated_length": 576.2307739257812, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 3.756172839506173, "grad_norm": 1.4678832148248693, "kl": 0.6201171875, "learning_rate": 7.724466233750961e-08, "loss": 0.0427, "num_tokens": 33558931.0, "reward": 0.16276215016841888, "reward_std": 0.10913573950529099, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.09056904911994934, "rewards/logprob_reward/std": 0.23285876214504242, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 598.5625, "completions/mean_terminated_length": 537.7857666015625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 3.7592592592592595, "grad_norm": 1.652644917011061, "kl": 0.6513671875, "learning_rate": 7.688363768206651e-08, "loss": -0.0914, "num_tokens": 33585281.0, "reward": 0.3667665421962738, "reward_std": 0.18292245268821716, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3207128345966339, "rewards/logprob_reward/std": 0.313364177942276, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 673.125, "completions/mean_terminated_length": 608.1481323242188, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 3.7623456790123457, "grad_norm": 1.4539824608795209, "kl": 0.646484375, "learning_rate": 7.652330531230344e-08, "loss": -0.0386, "num_tokens": 33613605.0, "reward": 0.437442809343338, "reward_std": 0.12461131811141968, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.39576980471611023, "rewards/logprob_reward/std": 0.3473474681377411, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 621.5625, "completions/mean_terminated_length": 564.0714721679688, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 3.765432098765432, "grad_norm": 1.767162702631273, "kl": 0.669677734375, "learning_rate": 7.616366666917571e-08, "loss": -0.0946, "num_tokens": 33639575.0, "reward": 0.40628647804260254, "reward_std": 0.13574805855751038, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.36115163564682007, "rewards/logprob_reward/std": 0.33610114455223083, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 628.6875, "completions/mean_terminated_length": 587.7930908203125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 3.7685185185185186, "grad_norm": 2.1917740690226166, "kl": 0.66162109375, "learning_rate": 7.580472319086442e-08, "loss": -0.1228, "num_tokens": 33666193.0, "reward": 0.4277956783771515, "reward_std": 0.10116873681545258, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3781062960624695, "rewards/logprob_reward/std": 0.36697736382484436, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 658.25, "completions/mean_terminated_length": 590.5184936523438, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 3.771604938271605, "grad_norm": 1.4318939998072981, "kl": 0.615234375, "learning_rate": 7.544647631277085e-08, "loss": -0.0207, "num_tokens": 33694345.0, "reward": 0.32787346839904785, "reward_std": 0.17947228252887726, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.274026095867157, "rewards/logprob_reward/std": 0.3015620708465576, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 570.1875, "completions/mean_terminated_length": 555.5484008789062, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 3.7746913580246915, "grad_norm": 1.6570799604347657, "kl": 0.6455078125, "learning_rate": 7.508892746751034e-08, "loss": 0.0467, "num_tokens": 33718667.0, "reward": 0.4487326443195343, "reward_std": 0.035165730863809586, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.3909529149532318, "rewards/logprob_reward/std": 0.33395156264305115, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 670.34375, "completions/mean_terminated_length": 646.7667236328125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 3.7777777777777777, "grad_norm": 1.3800621661628902, "kl": 0.61083984375, "learning_rate": 7.473207808490701e-08, "loss": 0.0016, "num_tokens": 33746834.0, "reward": 0.3769424557685852, "reward_std": 0.03390040621161461, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3146583139896393, "rewards/logprob_reward/std": 0.3573896586894989, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 658.0625, "completions/mean_terminated_length": 605.7857666015625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 3.7808641975308643, "grad_norm": 1.2250453353423525, "kl": 0.56884765625, "learning_rate": 7.437592959198796e-08, "loss": -0.2939, "num_tokens": 33774344.0, "reward": 0.3117009997367859, "reward_std": 0.11314510554075241, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.25952887535095215, "rewards/logprob_reward/std": 0.311544805765152, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 634.09375, "completions/mean_terminated_length": 561.888916015625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 3.7839506172839505, "grad_norm": 1.4597054100818718, "kl": 0.62939453125, "learning_rate": 7.402048341297718e-08, "loss": -0.2022, "num_tokens": 33801191.0, "reward": 0.4220396876335144, "reward_std": 0.2032833993434906, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.37865522503852844, "rewards/logprob_reward/std": 0.39630627632141113, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 660.90625, "completions/mean_terminated_length": 623.3448486328125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 3.787037037037037, "grad_norm": 1.641406890981269, "kl": 0.62890625, "learning_rate": 7.36657409692903e-08, "loss": -0.0048, "num_tokens": 33829352.0, "reward": 0.41330787539482117, "reward_std": 0.1278248131275177, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3620087206363678, "rewards/logprob_reward/std": 0.30893799662590027, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 606.5625, "completions/mean_terminated_length": 578.7333374023438, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.7901234567901234, "grad_norm": 1.5268610665076006, "kl": 0.65087890625, "learning_rate": 7.331170367952874e-08, "loss": -0.0618, "num_tokens": 33855058.0, "reward": 0.4139877259731293, "reward_std": 0.11275386810302734, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3592919111251831, "rewards/logprob_reward/std": 0.3068212568759918, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 672.71875, "completions/mean_terminated_length": 622.5357666015625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 3.7932098765432096, "grad_norm": 1.5957422584871803, "kl": 0.7421875, "learning_rate": 7.295837295947404e-08, "loss": -0.0712, "num_tokens": 33882913.0, "reward": 0.40974748134613037, "reward_std": 0.13264474272727966, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3615249693393707, "rewards/logprob_reward/std": 0.3195522427558899, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 557.0625, "completions/mean_terminated_length": 542.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 3.7962962962962963, "grad_norm": 1.6450072271887897, "kl": 0.6611328125, "learning_rate": 7.260575022208218e-08, "loss": -0.0891, "num_tokens": 33906939.0, "reward": 0.26366308331489563, "reward_std": 0.12328067421913147, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19920897483825684, "rewards/logprob_reward/std": 0.27628880739212036, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 623.40625, "completions/mean_terminated_length": 566.1785888671875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 3.799382716049383, "grad_norm": 1.6222560894195537, "kl": 0.6728515625, "learning_rate": 7.225383687747789e-08, "loss": -0.0066, "num_tokens": 33933752.0, "reward": 0.4484564960002899, "reward_std": 0.0920545905828476, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4010627269744873, "rewards/logprob_reward/std": 0.3538465201854706, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 668.21875, "completions/mean_terminated_length": 568.5999755859375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 3.802469135802469, "grad_norm": 1.8042984700537723, "kl": 0.662109375, "learning_rate": 7.190263433294913e-08, "loss": -0.0596, "num_tokens": 33962127.0, "reward": 0.3203108310699463, "reward_std": 0.14274707436561584, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.27256762981414795, "rewards/logprob_reward/std": 0.3008267879486084, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 657.65625, "completions/mean_terminated_length": 589.8148193359375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 3.8055555555555554, "grad_norm": 1.3226306578572455, "kl": 0.540283203125, "learning_rate": 7.155214399294146e-08, "loss": 0.0048, "num_tokens": 33989864.0, "reward": 0.38480716943740845, "reward_std": 0.19276383519172668, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.33728572726249695, "rewards/logprob_reward/std": 0.302777498960495, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 626.875, "completions/mean_terminated_length": 570.1428833007812, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 3.808641975308642, "grad_norm": 1.5777244268506005, "kl": 0.621826171875, "learning_rate": 7.120236725905215e-08, "loss": -0.1608, "num_tokens": 34016172.0, "reward": 0.35396015644073486, "reward_std": 0.14630940556526184, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29606688022613525, "rewards/logprob_reward/std": 0.2978813648223877, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 629.84375, "completions/mean_terminated_length": 519.47998046875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 3.8117283950617287, "grad_norm": 1.5890898160496787, "kl": 0.6318359375, "learning_rate": 7.085330553002494e-08, "loss": 0.0834, "num_tokens": 34043259.0, "reward": 0.37638550996780396, "reward_std": 0.20296669006347656, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3314005732536316, "rewards/logprob_reward/std": 0.30333438515663147, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 612.78125, "completions/mean_terminated_length": 585.36669921875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 3.814814814814815, "grad_norm": 1.5471074012590407, "kl": 0.7431640625, "learning_rate": 7.05049602017444e-08, "loss": -0.1598, "num_tokens": 34069692.0, "reward": 0.3362053334712982, "reward_std": 0.12041358649730682, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27633926272392273, "rewards/logprob_reward/std": 0.31953147053718567, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 631.34375, "completions/mean_terminated_length": 575.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 3.817901234567901, "grad_norm": 1.645827558624852, "kl": 0.675048828125, "learning_rate": 7.015733266722993e-08, "loss": -0.0165, "num_tokens": 34096263.0, "reward": 0.3282025456428528, "reward_std": 0.11031550168991089, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2709195017814636, "rewards/logprob_reward/std": 0.31229135394096375, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 598.375, "completions/mean_terminated_length": 584.6451416015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 3.8209876543209877, "grad_norm": 1.5331507754984715, "kl": 0.611328125, "learning_rate": 6.981042431663075e-08, "loss": -0.0251, "num_tokens": 34121847.0, "reward": 0.5011603832244873, "reward_std": 0.07828878611326218, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.44920602440834045, "rewards/logprob_reward/std": 0.3380409777164459, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 619.125, "completions/mean_terminated_length": 561.2857666015625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 3.824074074074074, "grad_norm": 1.6144074185442467, "kl": 0.644287109375, "learning_rate": 6.946423653722006e-08, "loss": -0.011, "num_tokens": 34148347.0, "reward": 0.40728798508644104, "reward_std": 0.19359657168388367, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3587922155857086, "rewards/logprob_reward/std": 0.33166587352752686, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 677.09375, "completions/mean_terminated_length": 612.8518676757812, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 3.8271604938271606, "grad_norm": 1.4265769581330283, "kl": 0.596923828125, "learning_rate": 6.911877071338942e-08, "loss": 0.0041, "num_tokens": 34176634.0, "reward": 0.3167506754398346, "reward_std": 0.1083531528711319, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2581952214241028, "rewards/logprob_reward/std": 0.3211752772331238, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 549.53125, "completions/mean_terminated_length": 549.53125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 3.830246913580247, "grad_norm": 1.5474284730267964, "kl": 0.70849609375, "learning_rate": 6.877402822664352e-08, "loss": -0.1667, "num_tokens": 34200719.0, "reward": 0.5134786367416382, "reward_std": 0.1157788634300232, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.46636509895324707, "rewards/logprob_reward/std": 0.34950605034828186, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 696.09375, "completions/mean_terminated_length": 567.7825927734375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 3.8333333333333335, "grad_norm": 1.3395793372276423, "kl": 0.66064453125, "learning_rate": 6.843001045559416e-08, "loss": -0.2422, "num_tokens": 34229586.0, "reward": 0.3991442620754242, "reward_std": 0.23215091228485107, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3636325001716614, "rewards/logprob_reward/std": 0.3384113609790802, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 621.875, "completions/mean_terminated_length": 547.4074096679688, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 3.8364197530864197, "grad_norm": 1.6711462885375263, "kl": 0.6162109375, "learning_rate": 6.808671877595524e-08, "loss": 0.0888, "num_tokens": 34256010.0, "reward": 0.39668774604797363, "reward_std": 0.1716841757297516, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3470141291618347, "rewards/logprob_reward/std": 0.3154861330986023, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 617.4375, "completions/mean_terminated_length": 590.3333740234375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 3.8395061728395063, "grad_norm": 1.628400392880948, "kl": 0.650146484375, "learning_rate": 6.774415456053697e-08, "loss": -0.0583, "num_tokens": 34282492.0, "reward": 0.3933296799659729, "reward_std": 0.16838446259498596, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3398107886314392, "rewards/logprob_reward/std": 0.35341179370880127, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 609.15625, "completions/mean_terminated_length": 581.5, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 3.8425925925925926, "grad_norm": 1.748077843392194, "kl": 0.681640625, "learning_rate": 6.740231917924053e-08, "loss": -0.0741, "num_tokens": 34308517.0, "reward": 0.5198233127593994, "reward_std": 0.10517364740371704, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4734148383140564, "rewards/logprob_reward/std": 0.3178054988384247, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 627.28125, "completions/mean_terminated_length": 570.607177734375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 3.8456790123456788, "grad_norm": 1.6498444138132442, "kl": 0.6943359375, "learning_rate": 6.706121399905245e-08, "loss": -0.0758, "num_tokens": 34334906.0, "reward": 0.3762926459312439, "reward_std": 0.1321469396352768, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3312973380088806, "rewards/logprob_reward/std": 0.37105339765548706, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 615.9375, "completions/mean_terminated_length": 573.72412109375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 3.8487654320987654, "grad_norm": 1.6027279773335463, "kl": 0.68798828125, "learning_rate": 6.672084038403927e-08, "loss": -0.2314, "num_tokens": 34360964.0, "reward": 0.4543178081512451, "reward_std": 0.06060335040092468, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4041031002998352, "rewards/logprob_reward/std": 0.35475975275039673, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 630.59375, "completions/mean_terminated_length": 617.9031982421875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 3.851851851851852, "grad_norm": 1.4495905822134696, "kl": 0.5751953125, "learning_rate": 6.638119969534201e-08, "loss": 0.0524, "num_tokens": 34387799.0, "reward": 0.36646562814712524, "reward_std": 0.0265357606112957, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.29954513907432556, "rewards/logprob_reward/std": 0.29479172825813293, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 668.5, "completions/mean_terminated_length": 631.72412109375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 3.8549382716049383, "grad_norm": 1.7437694462175068, "kl": 0.6591796875, "learning_rate": 6.604229329117064e-08, "loss": -0.0937, "num_tokens": 34415903.0, "reward": 0.33727431297302246, "reward_std": 0.1496143639087677, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2775270342826843, "rewards/logprob_reward/std": 0.3308427035808563, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 622.4375, "completions/mean_terminated_length": 580.8965454101562, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 3.8580246913580245, "grad_norm": 1.5684332212802337, "kl": 0.64794921875, "learning_rate": 6.570412252679894e-08, "loss": -0.0562, "num_tokens": 34442285.0, "reward": 0.40241140127182007, "reward_std": 0.04576746001839638, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.34990155696868896, "rewards/logprob_reward/std": 0.3036700189113617, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 698.3125, "completions/mean_terminated_length": 638.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 3.861111111111111, "grad_norm": 1.3377662296418327, "kl": 0.601806640625, "learning_rate": 6.536668875455869e-08, "loss": -0.0959, "num_tokens": 34471423.0, "reward": 0.3972879946231842, "reward_std": 0.15928718447685242, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3476811349391937, "rewards/logprob_reward/std": 0.3621588945388794, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 633.75, "completions/mean_terminated_length": 561.4815063476562, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 3.8641975308641974, "grad_norm": 2.0399778883581186, "kl": 0.67236328125, "learning_rate": 6.502999332383465e-08, "loss": -0.2049, "num_tokens": 34498539.0, "reward": 0.4112207889556885, "reward_std": 0.19010446965694427, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.37357866764068604, "rewards/logprob_reward/std": 0.33467158675193787, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 660.15625, "completions/mean_terminated_length": 592.7777709960938, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 3.867283950617284, "grad_norm": 1.5343256750190204, "kl": 0.648193359375, "learning_rate": 6.469403758105894e-08, "loss": -0.0556, "num_tokens": 34526324.0, "reward": 0.21720468997955322, "reward_std": 0.12811216711997986, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.15453296899795532, "rewards/logprob_reward/std": 0.2828380763530731, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 664.65625, "completions/mean_terminated_length": 581.7307739257812, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 3.8703703703703702, "grad_norm": 1.6800147853507033, "kl": 0.66748046875, "learning_rate": 6.435882286970556e-08, "loss": 0.0472, "num_tokens": 34553993.0, "reward": 0.39377695322036743, "reward_std": 0.2438797801733017, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.35072436928749084, "rewards/logprob_reward/std": 0.31812044978141785, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 642.21875, "completions/mean_terminated_length": 616.7667236328125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 3.873456790123457, "grad_norm": 1.5501358111907908, "kl": 0.65625, "learning_rate": 6.402435053028538e-08, "loss": -0.0169, "num_tokens": 34581436.0, "reward": 0.26868483424186707, "reward_std": 0.021270418539643288, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.1943720430135727, "rewards/logprob_reward/std": 0.3160381317138672, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 699.9375, "completions/mean_terminated_length": 609.2000122070312, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 3.876543209876543, "grad_norm": 1.572733483554164, "kl": 0.662109375, "learning_rate": 6.369062190034036e-08, "loss": -0.1132, "num_tokens": 34610274.0, "reward": 0.31400439143180847, "reward_std": 0.16174638271331787, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.26208820939064026, "rewards/logprob_reward/std": 0.2554265856742859, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 634.46875, "completions/mean_terminated_length": 578.8214721679688, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 3.8796296296296298, "grad_norm": 1.5888713450124243, "kl": 0.63525390625, "learning_rate": 6.335763831443847e-08, "loss": -0.1124, "num_tokens": 34637077.0, "reward": 0.3970712423324585, "reward_std": 0.19375097751617432, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.35785698890686035, "rewards/logprob_reward/std": 0.3994970917701721, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 708.4375, "completions/mean_terminated_length": 635.6154174804688, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 3.882716049382716, "grad_norm": 1.526837918157266, "kl": 0.589111328125, "learning_rate": 6.302540110416837e-08, "loss": -0.0294, "num_tokens": 34666847.0, "reward": 0.39022889733314514, "reward_std": 0.12641775608062744, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3467820882797241, "rewards/logprob_reward/std": 0.37466949224472046, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 633.15625, "completions/mean_terminated_length": 577.3214721679688, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 3.8858024691358026, "grad_norm": 1.5608002806464103, "kl": 0.6787109375, "learning_rate": 6.269391159813372e-08, "loss": -0.1392, "num_tokens": 34693976.0, "reward": 0.4023326337337494, "reward_std": 0.10683275759220123, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3532862365245819, "rewards/logprob_reward/std": 0.30176272988319397, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 649.9375, "completions/mean_terminated_length": 611.2413940429688, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 3.888888888888889, "grad_norm": 1.5983765927354556, "kl": 0.6357421875, "learning_rate": 6.236317112194844e-08, "loss": 0.0327, "num_tokens": 34721826.0, "reward": 0.36325234174728394, "reward_std": 0.09056113660335541, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.30291926860809326, "rewards/logprob_reward/std": 0.28544744849205017, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 601.625, "completions/mean_terminated_length": 573.4666748046875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 3.8919753086419755, "grad_norm": 1.5202744542985778, "kl": 0.611083984375, "learning_rate": 6.203318099823094e-08, "loss": -0.0647, "num_tokens": 34748310.0, "reward": 0.3364274203777313, "reward_std": 0.0355452336370945, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.26964157819747925, "rewards/logprob_reward/std": 0.30754345655441284, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 593.40625, "completions/mean_terminated_length": 564.7000122070312, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 3.8950617283950617, "grad_norm": 1.419153970997085, "kl": 0.65673828125, "learning_rate": 6.17039425465991e-08, "loss": 0.0225, "num_tokens": 34774379.0, "reward": 0.2905309796333313, "reward_std": 0.05059904605150223, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.21864554286003113, "rewards/logprob_reward/std": 0.3025275468826294, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 638.375, "completions/mean_terminated_length": 583.2857666015625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 3.898148148148148, "grad_norm": 1.5330818165797981, "kl": 0.58740234375, "learning_rate": 6.137545708366476e-08, "loss": -0.0375, "num_tokens": 34801959.0, "reward": 0.402209609746933, "reward_std": 0.09554566442966461, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.35662180185317993, "rewards/logprob_reward/std": 0.36648234724998474, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 626.71875, "completions/mean_terminated_length": 585.6206665039062, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 3.9012345679012346, "grad_norm": 1.611847564810495, "kl": 0.63232421875, "learning_rate": 6.104772592302868e-08, "loss": 0.0238, "num_tokens": 34828554.0, "reward": 0.48285889625549316, "reward_std": 0.2392449527978897, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4497043490409851, "rewards/logprob_reward/std": 0.3772594928741455, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 582.125, "completions/mean_terminated_length": 582.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 3.9043209876543212, "grad_norm": 1.620839307168573, "kl": 0.69287109375, "learning_rate": 6.072075037527519e-08, "loss": -0.1141, "num_tokens": 34853578.0, "reward": 0.4572755992412567, "reward_std": 0.04966471716761589, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4039173126220703, "rewards/logprob_reward/std": 0.30155524611473083, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 647.40625, "completions/mean_terminated_length": 622.300048828125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 3.9074074074074074, "grad_norm": 1.4123583995451279, "kl": 0.61474609375, "learning_rate": 6.039453174796699e-08, "loss": -0.1148, "num_tokens": 34880663.0, "reward": 0.41138163208961487, "reward_std": 0.04933255910873413, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.35639625787734985, "rewards/logprob_reward/std": 0.32915884256362915, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 637.53125, "completions/mean_terminated_length": 582.3214721679688, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 3.9104938271604937, "grad_norm": 1.4726011453452343, "kl": 0.623046875, "learning_rate": 6.006907134563973e-08, "loss": -0.112, "num_tokens": 34907444.0, "reward": 0.2765713036060333, "reward_std": 0.13091786205768585, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.21355144679546356, "rewards/logprob_reward/std": 0.2878705561161041, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 699.40625, "completions/mean_terminated_length": 665.8275756835938, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 3.9135802469135803, "grad_norm": 1.5235830292734727, "kl": 0.6630859375, "learning_rate": 5.974437046979711e-08, "loss": -0.1557, "num_tokens": 34936345.0, "reward": 0.2745678722858429, "reward_std": 0.08497855067253113, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.21479764580726624, "rewards/logprob_reward/std": 0.29989174008369446, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 645.46875, "completions/mean_terminated_length": 591.3928833007812, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 3.9166666666666665, "grad_norm": 1.5316227839428251, "kl": 0.6640625, "learning_rate": 5.9420430418905435e-08, "loss": -0.0112, "num_tokens": 34963040.0, "reward": 0.3786429762840271, "reward_std": 0.1327960193157196, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.323492169380188, "rewards/logprob_reward/std": 0.33515629172325134, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 677.9375, "completions/mean_terminated_length": 598.0769653320312, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 3.919753086419753, "grad_norm": 1.719696083090038, "kl": 0.663818359375, "learning_rate": 5.909725248838854e-08, "loss": -0.0212, "num_tokens": 34991462.0, "reward": 0.3442852795124054, "reward_std": 0.13406158983707428, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.29573363065719604, "rewards/logprob_reward/std": 0.2855747640132904, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 674.15625, "completions/mean_terminated_length": 609.370361328125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 3.9228395061728394, "grad_norm": 1.668342662882343, "kl": 0.609375, "learning_rate": 5.877483797062255e-08, "loss": -0.0832, "num_tokens": 35019191.0, "reward": 0.3113741874694824, "reward_std": 0.14887499809265137, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.25569355487823486, "rewards/logprob_reward/std": 0.27087441086769104, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 680.46875, "completions/mean_terminated_length": 616.8518676757812, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.925925925925926, "grad_norm": 1.5447902717174986, "kl": 0.595947265625, "learning_rate": 5.845318815493069e-08, "loss": -0.0091, "num_tokens": 35047326.0, "reward": 0.2544844150543213, "reward_std": 0.14136172831058502, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.19942711293697357, "rewards/logprob_reward/std": 0.26652219891548157, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 678.0, "completions/mean_terminated_length": 598.1538696289062, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 3.9290123456790123, "grad_norm": 1.4308508461780884, "kl": 0.62890625, "learning_rate": 5.813230432757829e-08, "loss": -0.2376, "num_tokens": 35075774.0, "reward": 0.2056887447834015, "reward_std": 0.11640875041484833, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.14173752069473267, "rewards/logprob_reward/std": 0.2575426995754242, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 572.875, "completions/mean_terminated_length": 572.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 3.932098765432099, "grad_norm": 1.5185767887895074, "kl": 0.64208984375, "learning_rate": 5.781218777176744e-08, "loss": 0.013, "num_tokens": 35100782.0, "reward": 0.3815149664878845, "reward_std": 0.047988198697566986, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.31973883509635925, "rewards/logprob_reward/std": 0.30301719903945923, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 647.6875, "completions/mean_terminated_length": 593.9285888671875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 3.935185185185185, "grad_norm": 1.528843002963712, "kl": 0.602294921875, "learning_rate": 5.749283976763186e-08, "loss": -0.1529, "num_tokens": 35128000.0, "reward": 0.35921698808670044, "reward_std": 0.1569749265909195, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3088522255420685, "rewards/logprob_reward/std": 0.3140522539615631, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 645.6875, "completions/mean_terminated_length": 606.5516967773438, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 3.9382716049382713, "grad_norm": 1.4321368351197294, "kl": 0.61181640625, "learning_rate": 5.717426159223204e-08, "loss": 0.0275, "num_tokens": 35155130.0, "reward": 0.3208833932876587, "reward_std": 0.029335714876651764, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2558426558971405, "rewards/logprob_reward/std": 0.3032721281051636, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 726.875, "completions/mean_terminated_length": 571.2380981445312, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 3.941358024691358, "grad_norm": 1.4658744292011114, "kl": 0.630859375, "learning_rate": 5.685645451954976e-08, "loss": -0.0311, "num_tokens": 35184578.0, "reward": 0.29100263118743896, "reward_std": 0.16550323367118835, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.25389185547828674, "rewards/logprob_reward/std": 0.33822762966156006, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 619.25, "completions/mean_terminated_length": 577.3793334960938, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 3.9444444444444446, "grad_norm": 1.6278719848003336, "kl": 0.6826171875, "learning_rate": 5.653941982048333e-08, "loss": -0.0599, "num_tokens": 35211074.0, "reward": 0.2912065088748932, "reward_std": 0.08770634979009628, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22286835312843323, "rewards/logprob_reward/std": 0.27239713072776794, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 655.65625, "completions/mean_terminated_length": 587.4444580078125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 3.947530864197531, "grad_norm": 1.5827298809682313, "kl": 0.6650390625, "learning_rate": 5.6223158762842336e-08, "loss": -0.0954, "num_tokens": 35238259.0, "reward": 0.37354421615600586, "reward_std": 0.2048719823360443, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.32477134466171265, "rewards/logprob_reward/std": 0.34890225529670715, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 583.8518676757812, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 3.950617283950617, "grad_norm": 1.4990823536227658, "kl": 0.64013671875, "learning_rate": 5.59076726113426e-08, "loss": -0.0437, "num_tokens": 35265547.0, "reward": 0.4134724736213684, "reward_std": 0.114122673869133, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3656638562679291, "rewards/logprob_reward/std": 0.3126649260520935, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 641.9375, "completions/mean_terminated_length": 571.1851806640625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 3.9537037037037037, "grad_norm": 1.6015486955228493, "kl": 0.626953125, "learning_rate": 5.55929626276011e-08, "loss": -0.0979, "num_tokens": 35293125.0, "reward": 0.3014741837978363, "reward_std": 0.193168044090271, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2516379654407501, "rewards/logprob_reward/std": 0.31512510776519775, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 755.59375, "completions/mean_terminated_length": 680.4400024414062, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 3.9567901234567904, "grad_norm": 1.6975793158456394, "kl": 0.6630859375, "learning_rate": 5.527903007013099e-08, "loss": 0.0014, "num_tokens": 35324060.0, "reward": 0.27478718757629395, "reward_std": 0.13014784455299377, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.22198577225208282, "rewards/logprob_reward/std": 0.2623629570007324, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 644.21875, "completions/mean_terminated_length": 604.9310302734375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 3.9598765432098766, "grad_norm": 1.61822262586784, "kl": 0.6474609375, "learning_rate": 5.4965876194336567e-08, "loss": -0.0981, "num_tokens": 35350719.0, "reward": 0.32840633392333984, "reward_std": 0.15930701792240143, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.27114593982696533, "rewards/logprob_reward/std": 0.2519806921482086, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 578.0625, "completions/mean_terminated_length": 548.3333740234375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 3.962962962962963, "grad_norm": 1.6467846704719629, "kl": 0.66357421875, "learning_rate": 5.465350225250801e-08, "loss": -0.0149, "num_tokens": 35375349.0, "reward": 0.31877440214157104, "reward_std": 0.09764313697814941, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2534993290901184, "rewards/logprob_reward/std": 0.28164201974868774, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 576.125, "completions/mean_terminated_length": 561.6774291992188, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 3.9660493827160495, "grad_norm": 1.4078347968866725, "kl": 0.6533203125, "learning_rate": 5.4341909493816786e-08, "loss": -0.1572, "num_tokens": 35400221.0, "reward": 0.34113186597824097, "reward_std": 0.10074251890182495, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2783409655094147, "rewards/logprob_reward/std": 0.2916436195373535, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 626.125, "completions/mean_terminated_length": 584.9655151367188, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 3.9691358024691357, "grad_norm": 1.7374252093996143, "kl": 0.66552734375, "learning_rate": 5.4031099164310314e-08, "loss": -0.0336, "num_tokens": 35426817.0, "reward": 0.2515743374824524, "reward_std": 0.039866816252470016, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.17883259057998657, "rewards/logprob_reward/std": 0.30172520875930786, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 615.71875, "completions/mean_terminated_length": 573.4827270507812, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 3.9722222222222223, "grad_norm": 1.5262911395766772, "kl": 0.70703125, "learning_rate": 5.372107250690719e-08, "loss": -0.191, "num_tokens": 35452552.0, "reward": 0.2727322578430176, "reward_std": 0.07770296931266785, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.20581361651420593, "rewards/logprob_reward/std": 0.3140930235385895, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 600.1875, "completions/mean_terminated_length": 586.51611328125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 3.9753086419753085, "grad_norm": 1.4888070611186808, "kl": 0.60595703125, "learning_rate": 5.341183076139219e-08, "loss": -0.0856, "num_tokens": 35478358.0, "reward": 0.28676971793174744, "reward_std": 0.04906601831316948, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.214466392993927, "rewards/logprob_reward/std": 0.30827146768569946, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 685.8125, "completions/mean_terminated_length": 607.7692260742188, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 3.978395061728395, "grad_norm": 1.4402153048329898, "kl": 0.65283203125, "learning_rate": 5.310337516441102e-08, "loss": -0.1167, "num_tokens": 35506900.0, "reward": 0.3246450424194336, "reward_std": 0.09617385268211365, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.27391117811203003, "rewards/logprob_reward/std": 0.30571624636650085, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 607.71875, "completions/mean_terminated_length": 594.290283203125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 3.9814814814814814, "grad_norm": 1.4104035508714146, "kl": 0.602294921875, "learning_rate": 5.279570694946581e-08, "loss": -0.0427, "num_tokens": 35533055.0, "reward": 0.2898106873035431, "reward_std": 0.11499236524105072, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.22131744027137756, "rewards/logprob_reward/std": 0.28126585483551025, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 672.15625, "completions/mean_terminated_length": 648.7000122070312, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 3.984567901234568, "grad_norm": 1.454247318397381, "kl": 0.59912109375, "learning_rate": 5.2488827346910015e-08, "loss": -0.1406, "num_tokens": 35561580.0, "reward": 0.4133351445198059, "reward_std": 0.06446068733930588, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3585668206214905, "rewards/logprob_reward/std": 0.31888890266418457, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 626.09375, "completions/mean_terminated_length": 584.9310302734375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 3.9876543209876543, "grad_norm": 1.5536448623148758, "kl": 0.66552734375, "learning_rate": 5.21827375839432e-08, "loss": -0.1493, "num_tokens": 35588447.0, "reward": 0.2680965065956116, "reward_std": 0.14740540087223053, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.204135000705719, "rewards/logprob_reward/std": 0.24552080035209656, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 721.65625, "completions/mean_terminated_length": 563.2857055664062, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 3.9907407407407405, "grad_norm": 1.2738372668992615, "kl": 0.65576171875, "learning_rate": 5.187743888460669e-08, "loss": -0.0415, "num_tokens": 35617728.0, "reward": 0.2604145407676697, "reward_std": 0.08412320911884308, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.2233772724866867, "rewards/logprob_reward/std": 0.3067440092563629, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 629.40625, "completions/mean_terminated_length": 603.1000366210938, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 3.993827160493827, "grad_norm": 1.362061872730838, "kl": 0.62744140625, "learning_rate": 5.15729324697782e-08, "loss": 0.0343, "num_tokens": 35644869.0, "reward": 0.40486735105514526, "reward_std": 0.05047126114368439, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3456859588623047, "rewards/logprob_reward/std": 0.37427324056625366, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 675.5, "completions/mean_terminated_length": 559.3333740234375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 3.996913580246914, "grad_norm": 1.5493786480105591, "kl": 0.68701171875, "learning_rate": 5.126921955716723e-08, "loss": -0.1431, "num_tokens": 35673021.0, "reward": 0.28546327352523804, "reward_std": 0.19677306711673737, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.23384806513786316, "rewards/logprob_reward/std": 0.3095080554485321, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 651.875, "completions/mean_terminated_length": 598.7142944335938, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 4.0, "grad_norm": 1.3973768058306164, "kl": 0.61474609375, "learning_rate": 5.096630136131e-08, "loss": -0.0185, "num_tokens": 35700401.0, "reward": 0.31301987171173096, "reward_std": 0.1146269142627716, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2540498375892639, "rewards/logprob_reward/std": 0.30604037642478943, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 665.03125, "completions/mean_terminated_length": 564.5199584960938, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 4.003086419753086, "grad_norm": 1.5132043083344895, "kl": 0.6357421875, "learning_rate": 5.0664179093564765e-08, "loss": -0.0573, "num_tokens": 35727974.0, "reward": 0.3799540400505066, "reward_std": 0.1852685809135437, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3353655934333801, "rewards/logprob_reward/std": 0.34042030572891235, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 711.8125, "completions/mean_terminated_length": 624.3999633789062, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 4.006172839506172, "grad_norm": 1.6278923114204555, "kl": 0.62060546875, "learning_rate": 5.036285396210685e-08, "loss": -0.2452, "num_tokens": 35757464.0, "reward": 0.4110715985298157, "reward_std": 0.1897403597831726, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3734128475189209, "rewards/logprob_reward/std": 0.3951837122440338, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 710.65625, "completions/mean_terminated_length": 665.8928833007812, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 4.0092592592592595, "grad_norm": 1.376587079918233, "kl": 0.546142578125, "learning_rate": 5.0062327171923935e-08, "loss": -0.0925, "num_tokens": 35787301.0, "reward": 0.3561769127845764, "reward_std": 0.1046433076262474, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30547434091567993, "rewards/logprob_reward/std": 0.30560293793678284, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 635.25, "completions/mean_terminated_length": 595.0344848632812, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 4.012345679012346, "grad_norm": 1.4352191076254313, "kl": 0.655029296875, "learning_rate": 4.976259992481097e-08, "loss": -0.1527, "num_tokens": 35813581.0, "reward": 0.5306516885757446, "reward_std": 0.19127212464809418, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.49239078164100647, "rewards/logprob_reward/std": 0.3370811939239502, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 587.3333740234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 4.015432098765432, "grad_norm": 1.7485731259635175, "kl": 0.65576171875, "learning_rate": 4.946367341936578e-08, "loss": 0.0799, "num_tokens": 35839221.0, "reward": 0.4768450856208801, "reward_std": 0.12481103837490082, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.42566123604774475, "rewards/logprob_reward/std": 0.3414471447467804, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 637.75, "completions/mean_terminated_length": 612.0000610351562, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 4.018518518518518, "grad_norm": 1.4913145733088944, "kl": 0.62646484375, "learning_rate": 4.916554885098403e-08, "loss": -0.0589, "num_tokens": 35865589.0, "reward": 0.36542898416519165, "reward_std": 0.09641439467668533, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3018655776977539, "rewards/logprob_reward/std": 0.2642173171043396, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 632.3125, "completions/mean_terminated_length": 576.357177734375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 4.021604938271605, "grad_norm": 1.7601523925716558, "kl": 0.640625, "learning_rate": 4.8868227411854287e-08, "loss": -0.003, "num_tokens": 35892187.0, "reward": 0.35813435912132263, "reward_std": 0.08750680088996887, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.30417707562446594, "rewards/logprob_reward/std": 0.3202640116214752, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 652.03125, "completions/mean_terminated_length": 583.1481323242188, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 4.0246913580246915, "grad_norm": 1.4326492649018785, "kl": 0.6064453125, "learning_rate": 4.857171029095364e-08, "loss": 0.0058, "num_tokens": 35919128.0, "reward": 0.2767089903354645, "reward_std": 0.1150752454996109, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2206488996744156, "rewards/logprob_reward/std": 0.28647875785827637, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 655.09375, "completions/mean_terminated_length": 630.5000610351562, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 4.027777777777778, "grad_norm": 1.421129916547761, "kl": 0.591796875, "learning_rate": 4.827599867404261e-08, "loss": -0.1012, "num_tokens": 35946527.0, "reward": 0.33308881521224976, "reward_std": 0.06174364686012268, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.26940426230430603, "rewards/logprob_reward/std": 0.3211418688297272, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 660.4375, "completions/mean_terminated_length": 593.1111450195312, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 4.030864197530864, "grad_norm": 1.6499800855374784, "kl": 0.6767578125, "learning_rate": 4.7981093743660634e-08, "loss": -0.0229, "num_tokens": 35973909.0, "reward": 0.3509872853755951, "reward_std": 0.0768103301525116, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2962358891963959, "rewards/logprob_reward/std": 0.30836957693099976, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 640.84375, "completions/mean_terminated_length": 615.300048828125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 4.033950617283951, "grad_norm": 1.362694363052582, "kl": 0.62646484375, "learning_rate": 4.768699667912118e-08, "loss": -0.1409, "num_tokens": 36000572.0, "reward": 0.4100096821784973, "reward_std": 0.1373266875743866, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35834410786628723, "rewards/logprob_reward/std": 0.3530668318271637, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 684.8125, "completions/mean_terminated_length": 622.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.037037037037037, "grad_norm": 1.6206615989079185, "kl": 0.65966796875, "learning_rate": 4.739370865650716e-08, "loss": -0.0135, "num_tokens": 36028746.0, "reward": 0.2361714243888855, "reward_std": 0.12348444759845734, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1721349060535431, "rewards/logprob_reward/std": 0.24297897517681122, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 664.71875, "completions/mean_terminated_length": 613.3928833007812, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 4.040123456790123, "grad_norm": 1.5289335444225096, "kl": 0.62451171875, "learning_rate": 4.710123084866602e-08, "loss": 0.0587, "num_tokens": 36056785.0, "reward": 0.37501078844070435, "reward_std": 0.13581761717796326, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.31945642828941345, "rewards/logprob_reward/std": 0.2692832052707672, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 694.125, "completions/mean_terminated_length": 601.760009765625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 4.04320987654321, "grad_norm": 1.6044974504910015, "kl": 0.64453125, "learning_rate": 4.6809564425205286e-08, "loss": 0.0465, "num_tokens": 36085553.0, "reward": 0.30243799090385437, "reward_std": 0.10716494917869568, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2527088522911072, "rewards/logprob_reward/std": 0.3134048581123352, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 682.8125, "completions/mean_terminated_length": 587.2799682617188, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 4.046296296296297, "grad_norm": 1.8898988583912188, "kl": 0.64013671875, "learning_rate": 4.6518710552487796e-08, "loss": 0.0456, "num_tokens": 36113703.0, "reward": 0.4060022234916687, "reward_std": 0.19832013547420502, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3643079996109009, "rewards/logprob_reward/std": 0.3305397927761078, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 675.59375, "completions/mean_terminated_length": 625.8214721679688, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 4.049382716049383, "grad_norm": 1.3270567562649145, "kl": 0.60888671875, "learning_rate": 4.6228670393627014e-08, "loss": -0.0861, "num_tokens": 36141926.0, "reward": 0.47588175535202026, "reward_std": 0.2443270981311798, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4350075125694275, "rewards/logprob_reward/std": 0.3911864161491394, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 598.375, "completions/mean_terminated_length": 570.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 4.052469135802469, "grad_norm": 1.8518505180637388, "kl": 0.62646484375, "learning_rate": 4.5939445108482466e-08, "loss": -0.0929, "num_tokens": 36167534.0, "reward": 0.42044419050216675, "reward_std": 0.07666278630495071, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3699380159378052, "rewards/logprob_reward/std": 0.3580783009529114, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 702.3125, "completions/mean_terminated_length": 656.357177734375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 4.055555555555555, "grad_norm": 1.491128454938137, "kl": 0.577392578125, "learning_rate": 4.565103585365479e-08, "loss": 0.008, "num_tokens": 36196744.0, "reward": 0.30036577582359314, "reward_std": 0.14361661672592163, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.236517533659935, "rewards/logprob_reward/std": 0.3222830891609192, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 616.59375, "completions/mean_terminated_length": 558.3928833007812, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 4.058641975308642, "grad_norm": 1.6231262957549002, "kl": 0.6669921875, "learning_rate": 4.536344378248161e-08, "loss": -0.0135, "num_tokens": 36222991.0, "reward": 0.3266756534576416, "reward_std": 0.11032585054636002, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2692229747772217, "rewards/logprob_reward/std": 0.31764644384384155, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 633.75, "completions/mean_terminated_length": 561.4815063476562, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 4.061728395061729, "grad_norm": 1.6751534066542102, "kl": 0.6806640625, "learning_rate": 4.50766700450326e-08, "loss": -0.0768, "num_tokens": 36250115.0, "reward": 0.3772864043712616, "reward_std": 0.19395627081394196, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3324015736579895, "rewards/logprob_reward/std": 0.29262951016426086, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 652.46875, "completions/mean_terminated_length": 566.7307739257812, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 4.064814814814815, "grad_norm": 1.496668050801702, "kl": 0.6728515625, "learning_rate": 4.479071578810481e-08, "loss": 0.0355, "num_tokens": 36277762.0, "reward": 0.2756267189979553, "reward_std": 0.13266964256763458, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.21597416698932648, "rewards/logprob_reward/std": 0.24168932437896729, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 680.4375, "completions/mean_terminated_length": 616.8148193359375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 4.067901234567901, "grad_norm": 2.0497179674716204, "kl": 0.669677734375, "learning_rate": 4.450558215521838e-08, "loss": -0.0858, "num_tokens": 36306492.0, "reward": 0.2577815055847168, "reward_std": 0.15916094183921814, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.19961833953857422, "rewards/logprob_reward/std": 0.2491302639245987, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 617.71875, "completions/mean_terminated_length": 590.6333618164062, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 4.070987654320987, "grad_norm": 1.5377436879206923, "kl": 0.642578125, "learning_rate": 4.4221270286611765e-08, "loss": -0.0041, "num_tokens": 36333155.0, "reward": 0.3324400782585144, "reward_std": 0.0790560245513916, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.26521119475364685, "rewards/logprob_reward/std": 0.3123054802417755, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 731.21875, "completions/mean_terminated_length": 633.625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 4.074074074074074, "grad_norm": 1.483807278070257, "kl": 0.581298828125, "learning_rate": 4.3937781319237175e-08, "loss": 0.0794, "num_tokens": 36363618.0, "reward": 0.34884485602378845, "reward_std": 0.2717476785182953, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.3112165033817291, "rewards/logprob_reward/std": 0.37505224347114563, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 702.09375, "completions/mean_terminated_length": 611.9599609375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 4.077160493827161, "grad_norm": 1.5798295991314093, "kl": 0.7138671875, "learning_rate": 4.365511638675612e-08, "loss": 0.0934, "num_tokens": 36393069.0, "reward": 0.3154732882976532, "reward_std": 0.12789447605609894, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.267192542552948, "rewards/logprob_reward/std": 0.3412717580795288, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 597.125, "completions/mean_terminated_length": 552.9655151367188, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 4.080246913580247, "grad_norm": 1.5450631669246297, "kl": 0.59521484375, "learning_rate": 4.337327661953477e-08, "loss": -0.0395, "num_tokens": 36419685.0, "reward": 0.3300023674964905, "reward_std": 0.05808283016085625, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.26944708824157715, "rewards/logprob_reward/std": 0.2965429127216339, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 665.65625, "completions/mean_terminated_length": 582.9615478515625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 4.083333333333333, "grad_norm": 1.6753259748941252, "kl": 0.61962890625, "learning_rate": 4.3092263144639565e-08, "loss": 0.073, "num_tokens": 36447890.0, "reward": 0.3855285048484802, "reward_std": 0.18475160002708435, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.33808720111846924, "rewards/logprob_reward/std": 0.2972589433193207, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 642.46875, "completions/mean_terminated_length": 571.8148193359375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 4.08641975308642, "grad_norm": 1.5876851682439899, "kl": 0.64404296875, "learning_rate": 4.281207708583256e-08, "loss": -0.1955, "num_tokens": 36474705.0, "reward": 0.4412694573402405, "reward_std": 0.18509413301944733, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.40349385142326355, "rewards/logprob_reward/std": 0.36072438955307007, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 575.09375, "completions/mean_terminated_length": 528.6551513671875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.089506172839506, "grad_norm": 1.9707921044127323, "kl": 0.7392578125, "learning_rate": 4.253271956356713e-08, "loss": -0.0357, "num_tokens": 36499484.0, "reward": 0.48010218143463135, "reward_std": 0.13595843315124512, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4396968483924866, "rewards/logprob_reward/std": 0.32903075218200684, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 657.90625, "completions/mean_terminated_length": 555.3999633789062, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 4.092592592592593, "grad_norm": 1.5646900025718429, "kl": 0.6513671875, "learning_rate": 4.2254191694983096e-08, "loss": -0.1481, "num_tokens": 36527057.0, "reward": 0.3108939528465271, "reward_std": 0.19064980745315552, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.2690488398075104, "rewards/logprob_reward/std": 0.3669106066226959, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 579.59375, "completions/mean_terminated_length": 533.6206665039062, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 4.095679012345679, "grad_norm": 1.4971796628598608, "kl": 0.63427734375, "learning_rate": 4.197649459390287e-08, "loss": -0.0259, "num_tokens": 36551912.0, "reward": 0.3146355450153351, "reward_std": 0.11491416394710541, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24890059232711792, "rewards/logprob_reward/std": 0.26203465461730957, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 706.21875, "completions/mean_terminated_length": 647.370361328125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 4.098765432098766, "grad_norm": 1.4971476673760125, "kl": 0.5927734375, "learning_rate": 4.169962937082635e-08, "loss": 0.0148, "num_tokens": 36580659.0, "reward": 0.3895792067050934, "reward_std": 0.1733841896057129, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.34258800745010376, "rewards/logprob_reward/std": 0.3329865038394928, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 550.28125, "completions/mean_terminated_length": 518.7000122070312, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 4.101851851851852, "grad_norm": 2.0507354873919823, "kl": 0.650390625, "learning_rate": 4.142359713292698e-08, "loss": 0.0799, "num_tokens": 36604516.0, "reward": 0.5065993070602417, "reward_std": 0.10919086635112762, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.45872145891189575, "rewards/logprob_reward/std": 0.3376200497150421, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 657.8125, "completions/mean_terminated_length": 633.4000244140625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 4.104938271604938, "grad_norm": 1.3893357621137066, "kl": 0.61669921875, "learning_rate": 4.11483989840471e-08, "loss": 0.0111, "num_tokens": 36631802.0, "reward": 0.38589394092559814, "reward_std": 0.06279394030570984, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3280766010284424, "rewards/logprob_reward/std": 0.3292454779148102, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 608.125, "completions/mean_terminated_length": 580.4000244140625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 4.1080246913580245, "grad_norm": 1.4749404193021296, "kl": 0.65673828125, "learning_rate": 4.087403602469347e-08, "loss": -0.0991, "num_tokens": 36657742.0, "reward": 0.5973392724990845, "reward_std": 0.09277798235416412, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5595435500144958, "rewards/logprob_reward/std": 0.3143359124660492, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 671.28125, "completions/mean_terminated_length": 572.5199584960938, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 4.111111111111111, "grad_norm": 1.6366678292132466, "kl": 0.64453125, "learning_rate": 4.060050935203307e-08, "loss": -0.0758, "num_tokens": 36686295.0, "reward": 0.3553878366947174, "reward_std": 0.20019590854644775, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3080698251724243, "rewards/logprob_reward/std": 0.31260445713996887, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 589.75, "completions/mean_terminated_length": 544.8275756835938, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 4.114197530864198, "grad_norm": 1.5159615322732232, "kl": 0.6767578125, "learning_rate": 4.032782005988861e-08, "loss": -0.0728, "num_tokens": 36710951.0, "reward": 0.34867554903030396, "reward_std": 0.11131349205970764, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2901950478553772, "rewards/logprob_reward/std": 0.31018462777137756, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 646.1875, "completions/mean_terminated_length": 607.1034545898438, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 4.117283950617284, "grad_norm": 1.6612441097120558, "kl": 0.608642578125, "learning_rate": 4.0055969238733945e-08, "loss": 0.0767, "num_tokens": 36737965.0, "reward": 0.3234352469444275, "reward_std": 0.09884031116962433, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2621503174304962, "rewards/logprob_reward/std": 0.25975921750068665, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 592.84375, "completions/mean_terminated_length": 531.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 4.12037037037037, "grad_norm": 1.5944747332565865, "kl": 0.7392578125, "learning_rate": 3.978495797569012e-08, "loss": -0.2071, "num_tokens": 36763344.0, "reward": 0.29600828886032104, "reward_std": 0.13704751431941986, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2316758632659912, "rewards/logprob_reward/std": 0.29483363032341003, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 706.65625, "completions/mean_terminated_length": 633.423095703125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 4.1234567901234565, "grad_norm": 1.5694985730481983, "kl": 0.58154296875, "learning_rate": 3.95147873545208e-08, "loss": 0.0402, "num_tokens": 36792885.0, "reward": 0.33170008659362793, "reward_std": 0.20611174404621124, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2817500829696655, "rewards/logprob_reward/std": 0.28610438108444214, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 568.5, "completions/mean_terminated_length": 521.3793334960938, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 4.1265432098765435, "grad_norm": 1.541595918963298, "kl": 0.66455078125, "learning_rate": 3.924545845562791e-08, "loss": -0.1692, "num_tokens": 36817861.0, "reward": 0.43395471572875977, "reward_std": 0.11640407145023346, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.388421893119812, "rewards/logprob_reward/std": 0.3468867540359497, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 621.46875, "completions/mean_terminated_length": 546.9259033203125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 4.12962962962963, "grad_norm": 1.4892648914051632, "kl": 0.62060546875, "learning_rate": 3.8976972356047325e-08, "loss": 0.0208, "num_tokens": 36844328.0, "reward": 0.4055551290512085, "reward_std": 0.1065315306186676, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.36033904552459717, "rewards/logprob_reward/std": 0.3353331387042999, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 633.65625, "completions/mean_terminated_length": 607.6333618164062, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 4.132716049382716, "grad_norm": 1.3941960666260809, "kl": 0.61962890625, "learning_rate": 3.870933012944472e-08, "loss": 0.0568, "num_tokens": 36871081.0, "reward": 0.34025588631629944, "reward_std": 0.08744353801012039, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2738954424858093, "rewards/logprob_reward/std": 0.3389797806739807, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 675.21875, "completions/mean_terminated_length": 577.5599975585938, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 4.135802469135802, "grad_norm": 1.3821747418731818, "kl": 0.63427734375, "learning_rate": 3.844253284611096e-08, "loss": -0.0673, "num_tokens": 36899504.0, "reward": 0.2853347659111023, "reward_std": 0.04446445405483246, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.2371775507926941, "rewards/logprob_reward/std": 0.32698848843574524, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 654.59375, "completions/mean_terminated_length": 616.3793334960938, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 4.138888888888889, "grad_norm": 1.6202684325550096, "kl": 0.62060546875, "learning_rate": 3.817658157295819e-08, "loss": -0.005, "num_tokens": 36927047.0, "reward": 0.5239607095718384, "reward_std": 0.11529737710952759, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.48148420453071594, "rewards/logprob_reward/std": 0.32663071155548096, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 605.21875, "completions/mean_terminated_length": 561.8965454101562, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 4.1419753086419755, "grad_norm": 1.5956806587479613, "kl": 0.64990234375, "learning_rate": 3.791147737351541e-08, "loss": -0.0701, "num_tokens": 36953282.0, "reward": 0.41034865379333496, "reward_std": 0.16449731588363647, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35872071981430054, "rewards/logprob_reward/std": 0.3067430555820465, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 615.875, "completions/mean_terminated_length": 573.6551513671875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 4.145061728395062, "grad_norm": 1.5033476748454058, "kl": 0.614013671875, "learning_rate": 3.7647221307923946e-08, "loss": 0.0052, "num_tokens": 36979730.0, "reward": 0.4161939024925232, "reward_std": 0.1644056886434555, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.36868762969970703, "rewards/logprob_reward/std": 0.3746192455291748, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 540.5625, "completions/mean_terminated_length": 540.5625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 4.148148148148148, "grad_norm": 1.6819597342974015, "kl": 0.662109375, "learning_rate": 3.738381443293376e-08, "loss": -0.0, "num_tokens": 37003336.0, "reward": 0.4274485111236572, "reward_std": 0.09740712493658066, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.37424832582473755, "rewards/logprob_reward/std": 0.3180454671382904, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 636.875, "completions/mean_terminated_length": 596.8275756835938, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 4.151234567901234, "grad_norm": 1.6109518360722137, "kl": 0.64501953125, "learning_rate": 3.7121257801898814e-08, "loss": -0.1398, "num_tokens": 37030360.0, "reward": 0.3617846369743347, "reward_std": 0.11701994389295578, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3082329034805298, "rewards/logprob_reward/std": 0.3324814438819885, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 684.6875, "completions/mean_terminated_length": 621.8518676757812, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 4.154320987654321, "grad_norm": 1.735554331186347, "kl": 0.598388671875, "learning_rate": 3.685955246477296e-08, "loss": 0.1195, "num_tokens": 37059086.0, "reward": 0.3180617690086365, "reward_std": 0.10269822180271149, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.25965195894241333, "rewards/logprob_reward/std": 0.3237040042877197, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 661.4375, "completions/mean_terminated_length": 594.2963256835938, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 4.157407407407407, "grad_norm": 1.5330208469460933, "kl": 0.66552734375, "learning_rate": 3.659869946810581e-08, "loss": 0.0243, "num_tokens": 37086608.0, "reward": 0.42412102222442627, "reward_std": 0.09178125858306885, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.37749555706977844, "rewards/logprob_reward/std": 0.3124769628047943, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 656.90625, "completions/mean_terminated_length": 554.1199951171875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 4.160493827160494, "grad_norm": 1.8623764196934725, "kl": 0.69580078125, "learning_rate": 3.6338699855038486e-08, "loss": -0.0103, "num_tokens": 37113601.0, "reward": 0.22179235517978668, "reward_std": 0.13995513319969177, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.1631026268005371, "rewards/logprob_reward/std": 0.21094097197055817, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 630.53125, "completions/mean_terminated_length": 557.6666870117188, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 4.16358024691358, "grad_norm": 1.3363094044328883, "kl": 0.7431640625, "learning_rate": 3.6079554665299414e-08, "loss": -0.0869, "num_tokens": 37140246.0, "reward": 0.41502583026885986, "reward_std": 0.08433513343334198, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.36738982796669006, "rewards/logprob_reward/std": 0.396855890750885, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 641.125, "completions/mean_terminated_length": 601.5172119140625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 4.166666666666667, "grad_norm": 1.7536756980264903, "kl": 0.634521484375, "learning_rate": 3.5821264935200294e-08, "loss": -0.1397, "num_tokens": 37167618.0, "reward": 0.38471630215644836, "reward_std": 0.1666683852672577, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3406570553779602, "rewards/logprob_reward/std": 0.3555237352848053, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 628.3125, "completions/mean_terminated_length": 571.7857666015625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 4.169753086419753, "grad_norm": 1.6885624947626152, "kl": 0.69482421875, "learning_rate": 3.5563831697631776e-08, "loss": -0.0474, "num_tokens": 37194536.0, "reward": 0.28741455078125, "reward_std": 0.13933715224266052, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2255995273590088, "rewards/logprob_reward/std": 0.3592804968357086, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 649.65625, "completions/mean_terminated_length": 580.3333129882812, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 4.172839506172839, "grad_norm": 1.5201239236885258, "kl": 0.63427734375, "learning_rate": 3.53072559820595e-08, "loss": -0.0305, "num_tokens": 37222145.0, "reward": 0.35959386825561523, "reward_std": 0.13059291243553162, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3092709481716156, "rewards/logprob_reward/std": 0.3507290780544281, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 721.4375, "completions/mean_terminated_length": 651.6154174804688, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 4.175925925925926, "grad_norm": 1.5811650058089066, "kl": 0.63623046875, "learning_rate": 3.505153881451997e-08, "loss": 0.0118, "num_tokens": 37251627.0, "reward": 0.35419201850891113, "reward_std": 0.11879801750183105, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3032689094543457, "rewards/logprob_reward/std": 0.3512733280658722, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 622.1875, "completions/mean_terminated_length": 564.7857666015625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 4.179012345679013, "grad_norm": 1.6753314712522067, "kl": 0.62060546875, "learning_rate": 3.479668121761617e-08, "loss": -0.0001, "num_tokens": 37277573.0, "reward": 0.44235318899154663, "reward_std": 0.20973920822143555, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3942812979221344, "rewards/logprob_reward/std": 0.32860952615737915, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 749.40625, "completions/mean_terminated_length": 641.95654296875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.182098765432099, "grad_norm": 1.6267127589700037, "kl": 0.6220703125, "learning_rate": 3.45426842105139e-08, "loss": -0.1449, "num_tokens": 37308510.0, "reward": 0.35964417457580566, "reward_std": 0.15717020630836487, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3197435140609741, "rewards/logprob_reward/std": 0.33039528131484985, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 637.65625, "completions/mean_terminated_length": 582.4642944335938, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 4.185185185185185, "grad_norm": 1.4788902712264298, "kl": 0.61572265625, "learning_rate": 3.428954880893745e-08, "loss": -0.0472, "num_tokens": 37335219.0, "reward": 0.37489181756973267, "reward_std": 0.11857884377241135, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.319324254989624, "rewards/logprob_reward/std": 0.34332260489463806, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 626.3125, "completions/mean_terminated_length": 569.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 4.188271604938271, "grad_norm": 1.587280742007304, "kl": 0.629150390625, "learning_rate": 3.403727602516554e-08, "loss": 0.0029, "num_tokens": 37361961.0, "reward": 0.3260752558708191, "reward_std": 0.1265375167131424, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2720280587673187, "rewards/logprob_reward/std": 0.2705236077308655, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 602.03125, "completions/mean_terminated_length": 523.888916015625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 4.191358024691358, "grad_norm": 1.581954492806186, "kl": 0.72509765625, "learning_rate": 3.3785866868027426e-08, "loss": -0.0691, "num_tokens": 37387710.0, "reward": 0.408158540725708, "reward_std": 0.0937754362821579, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.35975944995880127, "rewards/logprob_reward/std": 0.34979769587516785, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 642.125, "completions/mean_terminated_length": 554.0, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 4.194444444444445, "grad_norm": 1.4991664876292448, "kl": 0.603515625, "learning_rate": 3.353532234289849e-08, "loss": 0.027, "num_tokens": 37414810.0, "reward": 0.35399478673934937, "reward_std": 0.1981583535671234, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3099942207336426, "rewards/logprob_reward/std": 0.2870131731033325, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 653.625, "completions/mean_terminated_length": 628.933349609375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 4.197530864197531, "grad_norm": 1.6408540971780785, "kl": 0.576904296875, "learning_rate": 3.3285643451696796e-08, "loss": -0.0497, "num_tokens": 37442762.0, "reward": 0.349179208278656, "reward_std": 0.1391916275024414, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.28728246688842773, "rewards/logprob_reward/std": 0.3085601031780243, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 627.0, "completions/mean_terminated_length": 585.9310302734375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 4.200617283950617, "grad_norm": 1.679522100977789, "kl": 0.64990234375, "learning_rate": 3.303683119287859e-08, "loss": -0.131, "num_tokens": 37469034.0, "reward": 0.5176540613174438, "reward_std": 0.08004452288150787, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.47794896364212036, "rewards/logprob_reward/std": 0.33830147981643677, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 667.0, "completions/mean_terminated_length": 584.6154174804688, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 4.203703703703703, "grad_norm": 1.4837890064044594, "kl": 0.53759765625, "learning_rate": 3.278888656143453e-08, "loss": -0.0442, "num_tokens": 37496750.0, "reward": 0.4469582438468933, "reward_std": 0.1378178596496582, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4063425064086914, "rewards/logprob_reward/std": 0.3309935927391052, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 642.78125, "completions/mean_terminated_length": 630.4838256835938, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 4.20679012345679, "grad_norm": 1.523689266694728, "kl": 0.6376953125, "learning_rate": 3.254181054888569e-08, "loss": 0.0133, "num_tokens": 37523627.0, "reward": 0.3734282851219177, "reward_std": 0.07577695697546005, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3107537031173706, "rewards/logprob_reward/std": 0.30401891469955444, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 622.75, "completions/mean_terminated_length": 530.1538696289062, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 4.209876543209877, "grad_norm": 1.890216808565296, "kl": 0.71044921875, "learning_rate": 3.2295604143279534e-08, "loss": -0.2123, "num_tokens": 37549871.0, "reward": 0.3495715856552124, "reward_std": 0.15447676181793213, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3050795793533325, "rewards/logprob_reward/std": 0.31768471002578735, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 677.375, "completions/mean_terminated_length": 613.1851806640625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 4.212962962962963, "grad_norm": 1.5540210919634874, "kl": 0.67236328125, "learning_rate": 3.205026832918606e-08, "loss": -0.1549, "num_tokens": 37577947.0, "reward": 0.4311966001987457, "reward_std": 0.1718214452266693, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3888295888900757, "rewards/logprob_reward/std": 0.300140380859375, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 623.75, "completions/mean_terminated_length": 610.8386840820312, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 4.216049382716049, "grad_norm": 1.5200140946683396, "kl": 0.650146484375, "learning_rate": 3.1805804087693676e-08, "loss": 0.0107, "num_tokens": 37604419.0, "reward": 0.4300002157688141, "reward_std": 0.06730598211288452, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.37361136078834534, "rewards/logprob_reward/std": 0.35251715779304504, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 655.34375, "completions/mean_terminated_length": 630.7667236328125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 4.219135802469136, "grad_norm": 1.5083610618300893, "kl": 0.62939453125, "learning_rate": 3.156221239640558e-08, "loss": -0.0521, "num_tokens": 37631970.0, "reward": 0.25494372844696045, "reward_std": 0.10720371454954147, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.18952082097530365, "rewards/logprob_reward/std": 0.27343955636024475, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 622.3125, "completions/mean_terminated_length": 547.9259033203125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 4.222222222222222, "grad_norm": 1.5759263677765665, "kl": 0.64111328125, "learning_rate": 3.13194942294355e-08, "loss": -0.1077, "num_tokens": 37658884.0, "reward": 0.4254962205886841, "reward_std": 0.27615147829055786, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.38596799969673157, "rewards/logprob_reward/std": 0.3840332329273224, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 684.03125, "completions/mean_terminated_length": 621.0740966796875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 4.2253086419753085, "grad_norm": 1.4218560141897025, "kl": 0.61083984375, "learning_rate": 3.1077650557404076e-08, "loss": -0.1814, "num_tokens": 37687021.0, "reward": 0.35567009449005127, "reward_std": 0.2137615978717804, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.31185564398765564, "rewards/logprob_reward/std": 0.3091282248497009, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 671.25, "completions/mean_terminated_length": 589.84619140625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 4.228395061728395, "grad_norm": 1.6172909666255815, "kl": 0.63671875, "learning_rate": 3.083668234743489e-08, "loss": -0.0298, "num_tokens": 37715221.0, "reward": 0.37884873151779175, "reward_std": 0.26947689056396484, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3376097083091736, "rewards/logprob_reward/std": 0.3191835582256317, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 627.28125, "completions/mean_terminated_length": 553.8148193359375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 4.231481481481482, "grad_norm": 1.8223065066147794, "kl": 0.619140625, "learning_rate": 3.059659056315053e-08, "loss": -0.1866, "num_tokens": 37742418.0, "reward": 0.25542011857032776, "reward_std": 0.1844119429588318, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1969945728778839, "rewards/logprob_reward/std": 0.2848254144191742, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 648.9375, "completions/mean_terminated_length": 623.933349609375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 4.234567901234568, "grad_norm": 1.3875324383837295, "kl": 0.61767578125, "learning_rate": 3.035737616466885e-08, "loss": -0.0122, "num_tokens": 37770012.0, "reward": 0.3363264799118042, "reward_std": 0.12189370393753052, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27647385001182556, "rewards/logprob_reward/std": 0.38182875514030457, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 689.03125, "completions/mean_terminated_length": 595.239990234375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.237654320987654, "grad_norm": 1.6058121580147213, "kl": 0.66064453125, "learning_rate": 3.0119040108598974e-08, "loss": -0.0399, "num_tokens": 37798617.0, "reward": 0.2187155932188034, "reward_std": 0.13899683952331543, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.15968401730060577, "rewards/logprob_reward/std": 0.2800329625606537, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 624.1875, "completions/mean_terminated_length": 611.290283203125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 4.2407407407407405, "grad_norm": 1.623741177146682, "kl": 0.601806640625, "learning_rate": 2.98815833480377e-08, "loss": -0.1022, "num_tokens": 37824667.0, "reward": 0.3622738718986511, "reward_std": 0.09437457472085953, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.29835987091064453, "rewards/logprob_reward/std": 0.2528286278247833, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 604.96875, "completions/mean_terminated_length": 577.0333862304688, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 4.243827160493828, "grad_norm": 1.439218429754961, "kl": 0.649169921875, "learning_rate": 2.964500683256549e-08, "loss": -0.1988, "num_tokens": 37850490.0, "reward": 0.39608994126319885, "reward_std": 0.10990118980407715, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.34634989500045776, "rewards/logprob_reward/std": 0.3451257646083832, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 681.03125, "completions/mean_terminated_length": 632.0357666015625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 4.246913580246914, "grad_norm": 1.5308159675352069, "kl": 0.66796875, "learning_rate": 2.9409311508242663e-08, "loss": -0.1013, "num_tokens": 37878747.0, "reward": 0.5211547017097473, "reward_std": 0.17722265422344208, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.48531073331832886, "rewards/logprob_reward/std": 0.3311658203601837, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 656.8125, "completions/mean_terminated_length": 554.0, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.25, "grad_norm": 1.581783465061149, "kl": 0.64697265625, "learning_rate": 2.9174498317605794e-08, "loss": -0.0679, "num_tokens": 37906673.0, "reward": 0.4326964020729065, "reward_std": 0.20204457640647888, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.39744046330451965, "rewards/logprob_reward/std": 0.38023439049720764, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 534.125, "completions/mean_terminated_length": 518.3225708007812, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 4.253086419753086, "grad_norm": 1.6148219991574648, "kl": 0.6923828125, "learning_rate": 2.894056819966384e-08, "loss": -0.0605, "num_tokens": 37930481.0, "reward": 0.3732694089412689, "reward_std": 0.07587297260761261, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3105771243572235, "rewards/logprob_reward/std": 0.27508068084716797, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 563.75, "completions/mean_terminated_length": 533.0667114257812, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 4.256172839506172, "grad_norm": 1.3846719432893355, "kl": 0.7197265625, "learning_rate": 2.8707522089894354e-08, "loss": -0.151, "num_tokens": 37954581.0, "reward": 0.3336465656757355, "reward_std": 0.09854701161384583, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.27002397179603577, "rewards/logprob_reward/std": 0.2611810266971588, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 640.375, "completions/mean_terminated_length": 614.800048828125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 4.2592592592592595, "grad_norm": 1.736095703003275, "kl": 0.59375, "learning_rate": 2.8475360920239723e-08, "loss": -0.1622, "num_tokens": 37981745.0, "reward": 0.3136301636695862, "reward_std": 0.029620688408613205, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.24431127309799194, "rewards/logprob_reward/std": 0.29254472255706787, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 710.0, "completions/mean_terminated_length": 622.0799560546875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 4.262345679012346, "grad_norm": 1.6312893783308913, "kl": 0.58935546875, "learning_rate": 2.8244085619103546e-08, "loss": -0.1197, "num_tokens": 38010537.0, "reward": 0.1927075982093811, "reward_std": 0.13496747612953186, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.1307862102985382, "rewards/logprob_reward/std": 0.2342350333929062, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 627.8125, "completions/mean_terminated_length": 586.8275756835938, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 4.265432098765432, "grad_norm": 1.7391155593123053, "kl": 0.62890625, "learning_rate": 2.8013697111346906e-08, "loss": -0.077, "num_tokens": 38036983.0, "reward": 0.3147032856941223, "reward_std": 0.14718939363956451, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2559203505516052, "rewards/logprob_reward/std": 0.2773653566837311, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 693.8125, "completions/mean_terminated_length": 617.6154174804688, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 4.268518518518518, "grad_norm": 1.645937760860454, "kl": 0.66015625, "learning_rate": 2.778419631828463e-08, "loss": -0.2052, "num_tokens": 38065537.0, "reward": 0.2891005575656891, "reward_std": 0.16821599006652832, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.23441728949546814, "rewards/logprob_reward/std": 0.2742525637149811, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 600.4375, "completions/mean_terminated_length": 586.774169921875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 4.271604938271605, "grad_norm": 1.3519970469219857, "kl": 0.7431640625, "learning_rate": 2.755558415768147e-08, "loss": -0.1557, "num_tokens": 38090851.0, "reward": 0.40229713916778564, "reward_std": 0.08253340423107147, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3463023900985718, "rewards/logprob_reward/std": 0.32646703720092773, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 662.90625, "completions/mean_terminated_length": 625.5516967773438, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 4.2746913580246915, "grad_norm": 1.2902168015606412, "kl": 0.62548828125, "learning_rate": 2.732786154374869e-08, "loss": 0.0059, "num_tokens": 38118680.0, "reward": 0.34256306290626526, "reward_std": 0.13456498086452484, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2868756353855133, "rewards/logprob_reward/std": 0.28898119926452637, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 642.28125, "completions/mean_terminated_length": 571.5925903320312, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 4.277777777777778, "grad_norm": 1.5592732261040207, "kl": 0.63671875, "learning_rate": 2.7101029387140318e-08, "loss": -0.0307, "num_tokens": 38145333.0, "reward": 0.5048613548278809, "reward_std": 0.13133981823921204, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.46720704436302185, "rewards/logprob_reward/std": 0.32200944423675537, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 718.6875, "completions/mean_terminated_length": 616.9166870117188, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 4.280864197530864, "grad_norm": 1.6260416941061422, "kl": 0.626220703125, "learning_rate": 2.6875088594949387e-08, "loss": -0.0995, "num_tokens": 38174931.0, "reward": 0.38631224632263184, "reward_std": 0.20459020137786865, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3493746817111969, "rewards/logprob_reward/std": 0.3740261495113373, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 674.625, "completions/mean_terminated_length": 594.0, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 4.283950617283951, "grad_norm": 1.597683486835926, "kl": 0.596923828125, "learning_rate": 2.6650040070704484e-08, "loss": -0.0554, "num_tokens": 38202603.0, "reward": 0.2998139560222626, "reward_std": 0.12855127453804016, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.24284884333610535, "rewards/logprob_reward/std": 0.3164720833301544, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 576.90625, "completions/mean_terminated_length": 562.4838256835938, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 4.287037037037037, "grad_norm": 1.5669869355721846, "kl": 0.633544921875, "learning_rate": 2.6425884714365966e-08, "loss": 0.0141, "num_tokens": 38227456.0, "reward": 0.4674997925758362, "reward_std": 0.12463802099227905, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.41180533170700073, "rewards/logprob_reward/std": 0.26980501413345337, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 633.375, "completions/mean_terminated_length": 577.5714721679688, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 4.290123456790123, "grad_norm": 1.4707780589553814, "kl": 0.658203125, "learning_rate": 2.6202623422322546e-08, "loss": -0.0715, "num_tokens": 38253912.0, "reward": 0.33673280477523804, "reward_std": 0.08742868900299072, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27692532539367676, "rewards/logprob_reward/std": 0.27778366208076477, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 714.9375, "completions/mean_terminated_length": 643.6154174804688, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 4.29320987654321, "grad_norm": 1.516439880848733, "kl": 0.59912109375, "learning_rate": 2.5980257087387546e-08, "loss": 0.084, "num_tokens": 38283190.0, "reward": 0.12478144466876984, "reward_std": 0.10350757837295532, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.05878493934869766, "rewards/logprob_reward/std": 0.17412590980529785, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 634.59375, "completions/mean_terminated_length": 594.3103637695312, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 4.296296296296296, "grad_norm": 1.6209933560206873, "kl": 0.67578125, "learning_rate": 2.5758786598795325e-08, "loss": 0.0294, "num_tokens": 38310473.0, "reward": 0.31104576587677, "reward_std": 0.10978768765926361, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.24491198360919952, "rewards/logprob_reward/std": 0.34314480423927307, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 629.21875, "completions/mean_terminated_length": 556.1111450195312, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 4.299382716049383, "grad_norm": 1.6161869164656069, "kl": 0.68896484375, "learning_rate": 2.5538212842197926e-08, "loss": -0.1052, "num_tokens": 38336732.0, "reward": 0.33567798137664795, "reward_std": 0.14641478657722473, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.28269776701927185, "rewards/logprob_reward/std": 0.3387736678123474, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 632.5625, "completions/mean_terminated_length": 576.6428833007812, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 4.302469135802469, "grad_norm": 1.54338718787921, "kl": 0.6259765625, "learning_rate": 2.5318536699661246e-08, "loss": -0.0584, "num_tokens": 38363938.0, "reward": 0.23274093866348267, "reward_std": 0.139927476644516, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.17526772618293762, "rewards/logprob_reward/std": 0.20763304829597473, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 667.625, "completions/mean_terminated_length": 616.7142944335938, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.305555555555555, "grad_norm": 1.4739517092209788, "kl": 0.60986328125, "learning_rate": 2.5099759049661802e-08, "loss": -0.1081, "num_tokens": 38392490.0, "reward": 0.43327102065086365, "reward_std": 0.16694355010986328, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.39460667967796326, "rewards/logprob_reward/std": 0.4047378599643707, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 711.625, "completions/mean_terminated_length": 667.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 4.308641975308642, "grad_norm": 1.6117469455620026, "kl": 0.59716796875, "learning_rate": 2.4881880767083002e-08, "loss": -0.0202, "num_tokens": 38421366.0, "reward": 0.2981303632259369, "reward_std": 0.18587583303451538, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2479226440191269, "rewards/logprob_reward/std": 0.2802438735961914, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 580.65625, "completions/mean_terminated_length": 566.3547973632812, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 4.311728395061729, "grad_norm": 1.7079211328025252, "kl": 0.7080078125, "learning_rate": 2.4664902723211674e-08, "loss": 0.1291, "num_tokens": 38446147.0, "reward": 0.42157208919525146, "reward_std": 0.10281509160995483, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3711911737918854, "rewards/logprob_reward/std": 0.3157845735549927, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 591.75, "completions/mean_terminated_length": 547.0344848632812, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 4.314814814814815, "grad_norm": 1.5547162292779448, "kl": 0.642578125, "learning_rate": 2.444882578573476e-08, "loss": -0.1379, "num_tokens": 38472043.0, "reward": 0.35350149869918823, "reward_std": 0.17105308175086975, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3025016784667969, "rewards/logprob_reward/std": 0.2927432954311371, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 600.8125, "completions/mean_terminated_length": 540.357177734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 4.317901234567901, "grad_norm": 1.5821409172643321, "kl": 0.6630859375, "learning_rate": 2.4233650818735573e-08, "loss": -0.2464, "num_tokens": 38497681.0, "reward": 0.394947350025177, "reward_std": 0.11928538978099823, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3450804352760315, "rewards/logprob_reward/std": 0.34904709458351135, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 591.96875, "completions/mean_terminated_length": 591.96875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 4.320987654320987, "grad_norm": 1.551631095635871, "kl": 0.579345703125, "learning_rate": 2.401937868269058e-08, "loss": -0.0589, "num_tokens": 38523016.0, "reward": 0.269132524728775, "reward_std": 0.023596247658133507, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.19139724969863892, "rewards/logprob_reward/std": 0.296143501996994, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 640.71875, "completions/mean_terminated_length": 569.74072265625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 4.324074074074074, "grad_norm": 1.6017279172400938, "kl": 0.6416015625, "learning_rate": 2.380601023446577e-08, "loss": -0.0146, "num_tokens": 38550323.0, "reward": 0.3822787404060364, "reward_std": 0.22489285469055176, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.34142082929611206, "rewards/logprob_reward/std": 0.33156728744506836, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 592.1875, "completions/mean_terminated_length": 563.4000244140625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 4.327160493827161, "grad_norm": 1.6231689453684504, "kl": 0.6767578125, "learning_rate": 2.3593546327313364e-08, "loss": 0.0031, "num_tokens": 38575201.0, "reward": 0.4894312024116516, "reward_std": 0.1254369616508484, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4396457374095917, "rewards/logprob_reward/std": 0.29297271370887756, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 684.9375, "completions/mean_terminated_length": 590.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 4.330246913580247, "grad_norm": 1.5433418576619862, "kl": 0.6630859375, "learning_rate": 2.338198781086842e-08, "loss": -0.0089, "num_tokens": 38603871.0, "reward": 0.3927851617336273, "reward_std": 0.1600060760974884, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.35309460759162903, "rewards/logprob_reward/std": 0.33122506737709045, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 625.15625, "completions/mean_terminated_length": 598.5667114257812, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 4.333333333333333, "grad_norm": 1.5958390504526463, "kl": 0.650390625, "learning_rate": 2.317133553114525e-08, "loss": 0.0176, "num_tokens": 38630140.0, "reward": 0.4800199866294861, "reward_std": 0.10142524540424347, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.43266111612319946, "rewards/logprob_reward/std": 0.35416263341903687, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 594.0625, "completions/mean_terminated_length": 565.4000244140625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.33641975308642, "grad_norm": 1.5110064547282884, "kl": 0.67529296875, "learning_rate": 2.2961590330534298e-08, "loss": -0.098, "num_tokens": 38655830.0, "reward": 0.25580596923828125, "reward_std": 0.022158486768603325, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.1835343837738037, "rewards/logprob_reward/std": 0.30898454785346985, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 600.71875, "completions/mean_terminated_length": 572.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 4.339506172839506, "grad_norm": 1.6403663514964097, "kl": 0.744140625, "learning_rate": 2.2752753047798502e-08, "loss": -0.0943, "num_tokens": 38681257.0, "reward": 0.48660001158714294, "reward_std": 0.207355797290802, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.44344449043273926, "rewards/logprob_reward/std": 0.32075974345207214, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 584.84375, "completions/mean_terminated_length": 584.84375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 4.342592592592593, "grad_norm": 1.3665270806436993, "kl": 0.66259765625, "learning_rate": 2.2544824518070104e-08, "loss": -0.0899, "num_tokens": 38706316.0, "reward": 0.2797723412513733, "reward_std": 0.0670623704791069, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20669148862361908, "rewards/logprob_reward/std": 0.3208942413330078, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 654.78125, "completions/mean_terminated_length": 602.0357666015625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 4.345679012345679, "grad_norm": 1.4939202872362642, "kl": 0.65673828125, "learning_rate": 2.2337805572847425e-08, "loss": -0.2194, "num_tokens": 38734125.0, "reward": 0.26954182982444763, "reward_std": 0.11470353603363037, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20574089884757996, "rewards/logprob_reward/std": 0.25512751936912537, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 632.84375, "completions/mean_terminated_length": 523.3200073242188, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 4.348765432098766, "grad_norm": 1.690504561065554, "kl": 0.73828125, "learning_rate": 2.2131697039991127e-08, "loss": -0.0113, "num_tokens": 38760908.0, "reward": 0.30826133489608765, "reward_std": 0.19438190758228302, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2557069957256317, "rewards/logprob_reward/std": 0.2905013859272003, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 687.34375, "completions/mean_terminated_length": 575.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 4.351851851851852, "grad_norm": 1.5626511266621352, "kl": 0.6318359375, "learning_rate": 2.1926499743721405e-08, "loss": -0.0784, "num_tokens": 38789695.0, "reward": 0.38657933473587036, "reward_std": 0.2794950306415558, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.3496714234352112, "rewards/logprob_reward/std": 0.36823344230651855, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 589.625, "completions/mean_terminated_length": 544.6896362304688, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 4.354938271604938, "grad_norm": 1.3672460857885216, "kl": 0.69287109375, "learning_rate": 2.1722214504614313e-08, "loss": -0.1974, "num_tokens": 38814755.0, "reward": 0.2970818281173706, "reward_std": 0.06510772556066513, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.23286868631839752, "rewards/logprob_reward/std": 0.2989687919616699, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 600.90625, "completions/mean_terminated_length": 540.4642944335938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 4.3580246913580245, "grad_norm": 1.7651333569622694, "kl": 0.6455078125, "learning_rate": 2.1518842139598674e-08, "loss": -0.2377, "num_tokens": 38840452.0, "reward": 0.3639712929725647, "reward_std": 0.12565699219703674, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.31413477659225464, "rewards/logprob_reward/std": 0.3177318572998047, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 562.625, "completions/mean_terminated_length": 547.741943359375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.361111111111111, "grad_norm": 1.6151024854067229, "kl": 0.72705078125, "learning_rate": 2.1316383461952804e-08, "loss": 0.0069, "num_tokens": 38865076.0, "reward": 0.27323460578918457, "reward_std": 0.01288335956633091, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.19595514237880707, "rewards/logprob_reward/std": 0.31818312406539917, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 681.90625, "completions/mean_terminated_length": 659.1000366210938, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 4.364197530864198, "grad_norm": 1.6334716369464566, "kl": 0.7607421875, "learning_rate": 2.1114839281301143e-08, "loss": -0.039, "num_tokens": 38893369.0, "reward": 0.2659817039966583, "reward_std": 0.03890148177742958, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.19484078884124756, "rewards/logprob_reward/std": 0.3001561462879181, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 652.78125, "completions/mean_terminated_length": 628.0333862304688, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 4.367283950617284, "grad_norm": 1.4638056743567855, "kl": 0.65771484375, "learning_rate": 2.0914210403611132e-08, "loss": -0.0758, "num_tokens": 38920898.0, "reward": 0.5437999963760376, "reward_std": 0.09784190356731415, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.5000555515289307, "rewards/logprob_reward/std": 0.3222992718219757, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 601.375, "completions/mean_terminated_length": 557.6551513671875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 4.37037037037037, "grad_norm": 1.669582695448279, "kl": 0.68017578125, "learning_rate": 2.071449763118993e-08, "loss": -0.1, "num_tokens": 38946618.0, "reward": 0.4955701529979706, "reward_std": 0.12067276239395142, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4568835496902466, "rewards/logprob_reward/std": 0.34119415283203125, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 652.9375, "completions/mean_terminated_length": 529.25, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 4.3734567901234565, "grad_norm": 1.7087922347543532, "kl": 0.662109375, "learning_rate": 2.0515701762681304e-08, "loss": -0.0641, "num_tokens": 38974096.0, "reward": 0.3160213828086853, "reward_std": 0.19536183774471283, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.27127373218536377, "rewards/logprob_reward/std": 0.31931325793266296, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 584.0625, "completions/mean_terminated_length": 554.7333374023438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 4.3765432098765435, "grad_norm": 1.7277344723013388, "kl": 0.70654296875, "learning_rate": 2.0317823593062165e-08, "loss": -0.0234, "num_tokens": 38998966.0, "reward": 0.377634733915329, "reward_std": 0.14999543130397797, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3188996911048889, "rewards/logprob_reward/std": 0.2865521311759949, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 656.25, "completions/mean_terminated_length": 618.2069091796875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 4.37962962962963, "grad_norm": 1.538661677031588, "kl": 0.59521484375, "learning_rate": 2.0120863913639874e-08, "loss": -0.0641, "num_tokens": 39026990.0, "reward": 0.29881781339645386, "reward_std": 0.08298064768314362, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.24521425366401672, "rewards/logprob_reward/std": 0.31323689222335815, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 627.75, "completions/mean_terminated_length": 536.3077392578125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 4.382716049382716, "grad_norm": 1.5530261192586938, "kl": 0.61572265625, "learning_rate": 1.9924823512048438e-08, "loss": 0.0041, "num_tokens": 39053310.0, "reward": 0.3412266969680786, "reward_std": 0.1497499942779541, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.295807421207428, "rewards/logprob_reward/std": 0.33093565702438354, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 627.78125, "completions/mean_terminated_length": 516.8399658203125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 4.385802469135802, "grad_norm": 1.5581788170683073, "kl": 0.6484375, "learning_rate": 1.972970317224601e-08, "loss": -0.0302, "num_tokens": 39079623.0, "reward": 0.5770539045333862, "reward_std": 0.1951889991760254, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.5543653964996338, "rewards/logprob_reward/std": 0.37904947996139526, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 710.40625, "completions/mean_terminated_length": 622.5999755859375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 4.388888888888889, "grad_norm": 1.390637172200798, "kl": 0.60595703125, "learning_rate": 1.9535503674511263e-08, "loss": -0.0114, "num_tokens": 39109160.0, "reward": 0.3172440826892853, "reward_std": 0.0695582777261734, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.26568785309791565, "rewards/logprob_reward/std": 0.3645308315753937, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 666.375, "completions/mean_terminated_length": 600.1481323242188, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 4.3919753086419755, "grad_norm": 1.3485838629743747, "kl": 0.68896484375, "learning_rate": 1.934222579544059e-08, "loss": -0.1466, "num_tokens": 39137264.0, "reward": 0.2786564826965332, "reward_std": 0.16349688172340393, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.21934053301811218, "rewards/logprob_reward/std": 0.3034880757331848, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 617.4375, "completions/mean_terminated_length": 604.3225708007812, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 4.395061728395062, "grad_norm": 1.5235010537147877, "kl": 0.630859375, "learning_rate": 1.9149870307944765e-08, "loss": 0.002, "num_tokens": 39163758.0, "reward": 0.5359416007995605, "reward_std": 0.07059718668460846, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4878517985343933, "rewards/logprob_reward/std": 0.399873286485672, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 596.03125, "completions/mean_terminated_length": 567.5, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 4.398148148148148, "grad_norm": 1.6830257141117775, "kl": 0.70263671875, "learning_rate": 1.895843798124605e-08, "loss": 0.0232, "num_tokens": 39188991.0, "reward": 0.3727891445159912, "reward_std": 0.093578040599823, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.31698793172836304, "rewards/logprob_reward/std": 0.28925272822380066, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 609.875, "completions/mean_terminated_length": 582.2667236328125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 4.401234567901234, "grad_norm": 1.3079796731751159, "kl": 0.594970703125, "learning_rate": 1.8767929580874863e-08, "loss": -0.0811, "num_tokens": 39215547.0, "reward": 0.5941882729530334, "reward_std": 0.12742620706558228, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5595147609710693, "rewards/logprob_reward/std": 0.34080734848976135, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 595.84375, "completions/mean_terminated_length": 595.84375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 4.404320987654321, "grad_norm": 1.7595536124226225, "kl": 0.6552734375, "learning_rate": 1.8578345868666996e-08, "loss": 0.0509, "num_tokens": 39240662.0, "reward": 0.36699390411376953, "reward_std": 0.06276129186153412, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.2966598868370056, "rewards/logprob_reward/std": 0.27181482315063477, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 660.40625, "completions/mean_terminated_length": 593.0740966796875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.407407407407407, "grad_norm": 1.6648684722615061, "kl": 0.610107421875, "learning_rate": 1.8389687602760495e-08, "loss": -0.2302, "num_tokens": 39268075.0, "reward": 0.3408641517162323, "reward_std": 0.13295355439186096, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2954046130180359, "rewards/logprob_reward/std": 0.3235439956188202, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 638.9375, "completions/mean_terminated_length": 599.1034545898438, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 4.410493827160494, "grad_norm": 1.680700096136354, "kl": 0.67822265625, "learning_rate": 1.820195553759246e-08, "loss": -0.1749, "num_tokens": 39294797.0, "reward": 0.2861012816429138, "reward_std": 0.1157999113202095, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.22414033114910126, "rewards/logprob_reward/std": 0.27730125188827515, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 559.1875, "completions/mean_terminated_length": 511.10345458984375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 4.41358024691358, "grad_norm": 1.619968085675831, "kl": 0.640625, "learning_rate": 1.8015150423896203e-08, "loss": 0.0007, "num_tokens": 39318907.0, "reward": 0.5091406106948853, "reward_std": 0.17619368433952332, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.46848955750465393, "rewards/logprob_reward/std": 0.3359174132347107, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 648.25, "completions/mean_terminated_length": 594.5714721679688, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 4.416666666666667, "grad_norm": 1.4526950649081616, "kl": 0.5947265625, "learning_rate": 1.782927300869827e-08, "loss": -0.0224, "num_tokens": 39346475.0, "reward": 0.3410267233848572, "reward_std": 0.08913139998912811, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2816963791847229, "rewards/logprob_reward/std": 0.32327455282211304, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 697.5625, "completions/mean_terminated_length": 606.1599731445312, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 4.419753086419753, "grad_norm": 2.5737197170213513, "kl": 0.6572265625, "learning_rate": 1.7644324035315212e-08, "loss": -0.1265, "num_tokens": 39375741.0, "reward": 0.2594638466835022, "reward_std": 0.2152375429868698, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.21537649631500244, "rewards/logprob_reward/std": 0.29154425859451294, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 541.75, "completions/mean_terminated_length": 541.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 4.422839506172839, "grad_norm": 1.4732592481963702, "kl": 0.67431640625, "learning_rate": 1.746030424335093e-08, "loss": -0.0353, "num_tokens": 39400037.0, "reward": 0.42542213201522827, "reward_std": 0.048348747193813324, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.36852455139160156, "rewards/logprob_reward/std": 0.30832815170288086, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 602.28125, "completions/mean_terminated_length": 588.6774291992188, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.425925925925926, "grad_norm": 1.5742166608162922, "kl": 0.71533203125, "learning_rate": 1.7277214368693423e-08, "loss": -0.1021, "num_tokens": 39425786.0, "reward": 0.27637040615081787, "reward_std": 0.020439274609088898, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20291155576705933, "rewards/logprob_reward/std": 0.3113110363483429, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 650.1875, "completions/mean_terminated_length": 611.5172119140625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 4.429012345679013, "grad_norm": 1.540019323360415, "kl": 0.6298828125, "learning_rate": 1.7095055143512117e-08, "loss": -0.0331, "num_tokens": 39453096.0, "reward": 0.3047996163368225, "reward_std": 0.05103153735399246, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.23797178268432617, "rewards/logprob_reward/std": 0.318691611289978, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 502.4000244140625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 4.432098765432099, "grad_norm": 1.3681184622134288, "kl": 0.68505859375, "learning_rate": 1.6913827296254736e-08, "loss": -0.1569, "num_tokens": 39476808.0, "reward": 0.3895966112613678, "reward_std": 0.07732086628675461, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.33219069242477417, "rewards/logprob_reward/std": 0.3277379870414734, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 651.46875, "completions/mean_terminated_length": 582.4815063476562, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 4.435185185185185, "grad_norm": 1.6987658675392066, "kl": 0.6318359375, "learning_rate": 1.6733531551644503e-08, "loss": -0.0784, "num_tokens": 39504215.0, "reward": 0.23735493421554565, "reward_std": 0.11567119508981705, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.16997767984867096, "rewards/logprob_reward/std": 0.2777082026004791, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 683.96875, "completions/mean_terminated_length": 661.300048828125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 4.438271604938271, "grad_norm": 1.4590120891628433, "kl": 0.60107421875, "learning_rate": 1.655416863067713e-08, "loss": -0.052, "num_tokens": 39532622.0, "reward": 0.4986763298511505, "reward_std": 0.08661604672670364, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.44991812109947205, "rewards/logprob_reward/std": 0.302733451128006, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 684.0625, "completions/mean_terminated_length": 570.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 4.441358024691358, "grad_norm": 1.63130441454606, "kl": 0.650390625, "learning_rate": 1.637573925061808e-08, "loss": -0.0509, "num_tokens": 39560856.0, "reward": 0.24830611050128937, "reward_std": 0.12573489546775818, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.19256234169006348, "rewards/logprob_reward/std": 0.280526727437973, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 645.65625, "completions/mean_terminated_length": 575.5925903320312, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 4.444444444444445, "grad_norm": 1.5795738662883245, "kl": 0.634521484375, "learning_rate": 1.6198244124999592e-08, "loss": -0.1078, "num_tokens": 39587989.0, "reward": 0.2595534920692444, "reward_std": 0.11400261521339417, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19464275240898132, "rewards/logprob_reward/std": 0.26848214864730835, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 633.34375, "completions/mean_terminated_length": 620.741943359375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 4.447530864197531, "grad_norm": 1.4620778712753577, "kl": 0.66162109375, "learning_rate": 1.6021683963617805e-08, "loss": 0.0102, "num_tokens": 39615216.0, "reward": 0.327922523021698, "reward_std": 0.046899352222681046, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2636638879776001, "rewards/logprob_reward/std": 0.33484527468681335, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 613.84375, "completions/mean_terminated_length": 571.413818359375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 4.450617283950617, "grad_norm": 1.578287319952463, "kl": 0.666015625, "learning_rate": 1.5846059472530122e-08, "loss": -0.1551, "num_tokens": 39640947.0, "reward": 0.2955945134162903, "reward_std": 0.12406639009714127, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23468834161758423, "rewards/logprob_reward/std": 0.29501873254776, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 664.75, "completions/mean_terminated_length": 627.586181640625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.453703703703704, "grad_norm": 1.481964698247583, "kl": 0.58740234375, "learning_rate": 1.5671371354051997e-08, "loss": -0.0252, "num_tokens": 39669131.0, "reward": 0.3443082869052887, "reward_std": 0.05769061669707298, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2818703353404999, "rewards/logprob_reward/std": 0.33280321955680847, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 616.0, "completions/mean_terminated_length": 573.7930908203125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 4.45679012345679, "grad_norm": 1.5677605870581266, "kl": 0.61572265625, "learning_rate": 1.5497620306754582e-08, "loss": -0.0615, "num_tokens": 39695283.0, "reward": 0.46026384830474854, "reward_std": 0.05997993052005768, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.41765424609184265, "rewards/logprob_reward/std": 0.37784701585769653, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 569.1875, "completions/mean_terminated_length": 554.51611328125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.459876543209877, "grad_norm": 1.7153355010580593, "kl": 0.6298828125, "learning_rate": 1.5324807025461656e-08, "loss": 0.0775, "num_tokens": 39719937.0, "reward": 0.5448050498962402, "reward_std": 0.04012700915336609, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.49770012497901917, "rewards/logprob_reward/std": 0.36130595207214355, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 643.78125, "completions/mean_terminated_length": 556.0384521484375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 4.462962962962963, "grad_norm": 1.6027967648561052, "kl": 0.64404296875, "learning_rate": 1.515293220124683e-08, "loss": 0.1283, "num_tokens": 39746962.0, "reward": 0.3585260212421417, "reward_std": 0.18744279444217682, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30808448791503906, "rewards/logprob_reward/std": 0.3110165596008301, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 654.59375, "completions/mean_terminated_length": 551.1599731445312, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 4.466049382716049, "grad_norm": 1.9394159798223032, "kl": 0.791015625, "learning_rate": 1.498199652143092e-08, "loss": -0.0562, "num_tokens": 39774305.0, "reward": 0.16566982865333557, "reward_std": 0.12868653237819672, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.10421647131443024, "rewards/logprob_reward/std": 0.2274024784564972, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 662.78125, "completions/mean_terminated_length": 611.1785888671875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 4.469135802469136, "grad_norm": 1.7302902417868338, "kl": 0.63232421875, "learning_rate": 1.4812000669579188e-08, "loss": 0.0107, "num_tokens": 39802830.0, "reward": 0.25501397252082825, "reward_std": 0.1418566256761551, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.18959885835647583, "rewards/logprob_reward/std": 0.25934895873069763, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 656.21875, "completions/mean_terminated_length": 603.6785888671875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 4.472222222222222, "grad_norm": 1.5282817465240168, "kl": 0.65283203125, "learning_rate": 1.4642945325498507e-08, "loss": -0.1056, "num_tokens": 39830121.0, "reward": 0.38242828845977783, "reward_std": 0.17101013660430908, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.33117032051086426, "rewards/logprob_reward/std": 0.3122538924217224, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 636.0625, "completions/mean_terminated_length": 595.9310302734375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 4.4753086419753085, "grad_norm": 1.449275111811373, "kl": 0.72998046875, "learning_rate": 1.4474831165234707e-08, "loss": -0.033, "num_tokens": 39856579.0, "reward": 0.454275518655777, "reward_std": 0.14744937419891357, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4075283706188202, "rewards/logprob_reward/std": 0.36907437443733215, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 638.625, "completions/mean_terminated_length": 598.7586059570312, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 4.478395061728395, "grad_norm": 1.4635226156714998, "kl": 0.6298828125, "learning_rate": 1.4307658861069799e-08, "loss": -0.0985, "num_tokens": 39883539.0, "reward": 0.30600759387016296, "reward_std": 0.024707134813070297, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2393139898777008, "rewards/logprob_reward/std": 0.30501946806907654, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 582.5625, "completions/mean_terminated_length": 553.1333618164062, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 4.481481481481482, "grad_norm": 1.8721722841922075, "kl": 0.68212890625, "learning_rate": 1.414142908151944e-08, "loss": -0.0497, "num_tokens": 39908405.0, "reward": 0.3947259187698364, "reward_std": 0.03768985718488693, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3344177007675171, "rewards/logprob_reward/std": 0.2959597408771515, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 677.46875, "completions/mean_terminated_length": 580.4400024414062, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 4.484567901234568, "grad_norm": 1.8204827408489457, "kl": 0.676025390625, "learning_rate": 1.3976142491330111e-08, "loss": 0.0058, "num_tokens": 39936360.0, "reward": 0.24476593732833862, "reward_std": 0.1203933134675026, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.18515661358833313, "rewards/logprob_reward/std": 0.24509762227535248, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 549.21875, "completions/mean_terminated_length": 533.9031982421875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.487654320987654, "grad_norm": 1.6353167312406216, "kl": 0.697265625, "learning_rate": 1.3811799751476588e-08, "loss": -0.2397, "num_tokens": 39960355.0, "reward": 0.4445737600326538, "reward_std": 0.15822428464889526, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4002208113670349, "rewards/logprob_reward/std": 0.3139244616031647, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 612.65625, "completions/mean_terminated_length": 553.8928833007812, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 4.4907407407407405, "grad_norm": 1.5338248262537426, "kl": 0.673828125, "learning_rate": 1.3648401519159109e-08, "loss": -0.0708, "num_tokens": 39986564.0, "reward": 0.5310378670692444, "reward_std": 0.13132703304290771, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4928198456764221, "rewards/logprob_reward/std": 0.35706114768981934, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 682.28125, "completions/mean_terminated_length": 659.5000610351562, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 4.493827160493828, "grad_norm": 1.4482207680342003, "kl": 0.615234375, "learning_rate": 1.348594844780096e-08, "loss": -0.0945, "num_tokens": 40014997.0, "reward": 0.41279909014701843, "reward_std": 0.05003764480352402, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3544990122318268, "rewards/logprob_reward/std": 0.3055894672870636, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 635.3125, "completions/mean_terminated_length": 545.6154174804688, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 4.496913580246914, "grad_norm": 1.4554270072937898, "kl": 0.6171875, "learning_rate": 1.332444118704576e-08, "loss": -0.0602, "num_tokens": 40042055.0, "reward": 0.2669292986392975, "reward_std": 0.1884836107492447, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.21325477957725525, "rewards/logprob_reward/std": 0.26297545433044434, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 628.6875, "completions/mean_terminated_length": 602.3333740234375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 4.5, "grad_norm": 1.4547988698420797, "kl": 0.5830078125, "learning_rate": 1.3163880382754761e-08, "loss": 0.0268, "num_tokens": 40069025.0, "reward": 0.40306556224823, "reward_std": 0.10927714407444, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3471561670303345, "rewards/logprob_reward/std": 0.32214680314064026, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 671.5625, "completions/mean_terminated_length": 572.8800048828125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 4.503086419753086, "grad_norm": 1.649903224361063, "kl": 0.6708984375, "learning_rate": 1.3004266677004522e-08, "loss": 0.0568, "num_tokens": 40096815.0, "reward": 0.2798728346824646, "reward_std": 0.20942535996437073, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.23110871016979218, "rewards/logprob_reward/std": 0.3210515081882477, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 709.09375, "completions/mean_terminated_length": 636.423095703125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 4.506172839506172, "grad_norm": 1.6251758564772683, "kl": 0.618896484375, "learning_rate": 1.2845600708084076e-08, "loss": -0.34, "num_tokens": 40125910.0, "reward": 0.2657831907272339, "reward_std": 0.1709056794643402, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.21198131144046783, "rewards/logprob_reward/std": 0.2678815424442291, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 706.625, "completions/mean_terminated_length": 600.8333740234375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 4.5092592592592595, "grad_norm": 1.6386169050385644, "kl": 0.65771484375, "learning_rate": 1.2687883110492515e-08, "loss": 0.0527, "num_tokens": 40155286.0, "reward": 0.25394248962402344, "reward_std": 0.17277218401432037, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.19882500171661377, "rewards/logprob_reward/std": 0.29511821269989014, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 603.9375, "completions/mean_terminated_length": 575.933349609375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.512345679012346, "grad_norm": 1.6238240833603697, "kl": 0.6474609375, "learning_rate": 1.2531114514936491e-08, "loss": -0.0043, "num_tokens": 40180956.0, "reward": 0.3982352614402771, "reward_std": 0.08852382749319077, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.338316947221756, "rewards/logprob_reward/std": 0.26259905099868774, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 669.90625, "completions/mean_terminated_length": 604.3333129882812, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 4.515432098765432, "grad_norm": 1.5396780715280456, "kl": 0.6025390625, "learning_rate": 1.2375295548327557e-08, "loss": 0.0158, "num_tokens": 40208585.0, "reward": 0.2664243280887604, "reward_std": 0.15152683854103088, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2092214673757553, "rewards/logprob_reward/std": 0.2909022867679596, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 579.21875, "completions/mean_terminated_length": 564.8709716796875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 4.518518518518518, "grad_norm": 1.686287693509352, "kl": 0.673583984375, "learning_rate": 1.222042683377983e-08, "loss": -0.0036, "num_tokens": 40233920.0, "reward": 0.4524105489253998, "reward_std": 0.052459392696619034, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.39503949880599976, "rewards/logprob_reward/std": 0.3559662699699402, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 628.1875, "completions/mean_terminated_length": 554.888916015625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 4.521604938271605, "grad_norm": 1.6583503576834586, "kl": 0.67919921875, "learning_rate": 1.2066508990607293e-08, "loss": -0.1213, "num_tokens": 40260406.0, "reward": 0.25734296441078186, "reward_std": 0.148172527551651, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1956588327884674, "rewards/logprob_reward/std": 0.2716088891029358, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 656.875, "completions/mean_terminated_length": 604.4285888671875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 4.5246913580246915, "grad_norm": 1.6205398596623362, "kl": 0.603515625, "learning_rate": 1.1913542634321538e-08, "loss": -0.0888, "num_tokens": 40287762.0, "reward": 0.3753162622451782, "reward_std": 0.10541736334562302, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3267402648925781, "rewards/logprob_reward/std": 0.29656100273132324, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 592.5625, "completions/mean_terminated_length": 530.9285888671875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 4.527777777777778, "grad_norm": 1.6382420004200269, "kl": 0.6953125, "learning_rate": 1.1761528376629137e-08, "loss": -0.036, "num_tokens": 40312976.0, "reward": 0.4683385491371155, "reward_std": 0.22587673366069794, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.43009835481643677, "rewards/logprob_reward/std": 0.32454535365104675, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 606.9375, "completions/mean_terminated_length": 563.7930908203125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 4.530864197530864, "grad_norm": 1.721679074090868, "kl": 0.6787109375, "learning_rate": 1.1610466825429182e-08, "loss": -0.1346, "num_tokens": 40339118.0, "reward": 0.2605217397212982, "reward_std": 0.04365590959787369, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.18877416849136353, "rewards/logprob_reward/std": 0.3021732270717621, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 589.0, "completions/mean_terminated_length": 526.857177734375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 4.533950617283951, "grad_norm": 1.59029928456581, "kl": 0.68310546875, "learning_rate": 1.1460358584811091e-08, "loss": -0.0738, "num_tokens": 40364138.0, "reward": 0.2629494369029999, "reward_std": 0.056922849267721176, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.19841602444648743, "rewards/logprob_reward/std": 0.3139144480228424, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 576.28125, "completions/mean_terminated_length": 561.8386840820312, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 4.537037037037037, "grad_norm": 1.5029616563128902, "kl": 0.64599609375, "learning_rate": 1.1311204255051942e-08, "loss": -0.0274, "num_tokens": 40389135.0, "reward": 0.3788328170776367, "reward_std": 0.028946975246071815, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3167586922645569, "rewards/logprob_reward/std": 0.3223121166229248, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 656.15625, "completions/mean_terminated_length": 588.0370483398438, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 4.540123456790123, "grad_norm": 1.7001095177411158, "kl": 0.689453125, "learning_rate": 1.116300443261417e-08, "loss": 0.022, "num_tokens": 40416856.0, "reward": 0.2785152792930603, "reward_std": 0.138594388961792, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.21918366849422455, "rewards/logprob_reward/std": 0.23866917192935944, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 607.90625, "completions/mean_terminated_length": 594.4838256835938, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 4.54320987654321, "grad_norm": 1.6851136429923046, "kl": 0.6875, "learning_rate": 1.1015759710143124e-08, "loss": -0.048, "num_tokens": 40443057.0, "reward": 0.49608516693115234, "reward_std": 0.12946563959121704, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.45051127672195435, "rewards/logprob_reward/std": 0.3570079803466797, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 654.15625, "completions/mean_terminated_length": 568.8077392578125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 4.546296296296296, "grad_norm": 1.5468746486666658, "kl": 0.68798828125, "learning_rate": 1.0869470676464848e-08, "loss": -0.1093, "num_tokens": 40470186.0, "reward": 0.28684502840042114, "reward_std": 0.1367895007133484, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.23538337647914886, "rewards/logprob_reward/std": 0.25621992349624634, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 636.15625, "completions/mean_terminated_length": 596.0344848632812, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 4.549382716049383, "grad_norm": 1.6062610792981664, "kl": 0.65869140625, "learning_rate": 1.0724137916583525e-08, "loss": -0.0991, "num_tokens": 40496851.0, "reward": 0.26507773995399475, "reward_std": 0.07717886567115784, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.20078080892562866, "rewards/logprob_reward/std": 0.32264024019241333, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 677.21875, "completions/mean_terminated_length": 580.1199951171875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 4.552469135802469, "grad_norm": 1.4429451034499112, "kl": 0.6328125, "learning_rate": 1.0579762011679317e-08, "loss": -0.012, "num_tokens": 40525694.0, "reward": 0.2544757127761841, "reward_std": 0.12014258652925491, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.19941745698451996, "rewards/logprob_reward/std": 0.2637307643890381, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 560.5625, "completions/mean_terminated_length": 545.6129150390625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 4.555555555555555, "grad_norm": 1.51147388368161, "kl": 0.6591796875, "learning_rate": 1.0436343539105857e-08, "loss": -0.2095, "num_tokens": 40550016.0, "reward": 0.3771514892578125, "reward_std": 0.11396949738264084, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3253071904182434, "rewards/logprob_reward/std": 0.3318803012371063, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 608.125, "completions/mean_terminated_length": 594.7096557617188, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 4.5586419753086425, "grad_norm": 1.5392469824211903, "kl": 0.65478515625, "learning_rate": 1.0293883072388154e-08, "loss": -0.1751, "num_tokens": 40575920.0, "reward": 0.2826491892337799, "reward_std": 0.11687320470809937, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2203046828508377, "rewards/logprob_reward/std": 0.3166985511779785, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 583.90625, "completions/mean_terminated_length": 554.5667114257812, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 4.561728395061729, "grad_norm": 1.7499520833565863, "kl": 0.69189453125, "learning_rate": 1.015238118122011e-08, "loss": -0.0772, "num_tokens": 40601233.0, "reward": 0.5278899669647217, "reward_std": 0.07041719555854797, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4893221855163574, "rewards/logprob_reward/std": 0.40953290462493896, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 636.9375, "completions/mean_terminated_length": 581.6428833007812, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 4.564814814814815, "grad_norm": 1.7072666031927128, "kl": 0.658203125, "learning_rate": 1.0011838431462389e-08, "loss": -0.2309, "num_tokens": 40627519.0, "reward": 0.29464104771614075, "reward_std": 0.1486499309539795, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.23362894356250763, "rewards/logprob_reward/std": 0.27124112844467163, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 620.625, "completions/mean_terminated_length": 545.9259033203125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 4.567901234567901, "grad_norm": 1.6061249128648942, "kl": 0.6572265625, "learning_rate": 9.872255385140027e-09, "loss": -0.0098, "num_tokens": 40653807.0, "reward": 0.41907644271850586, "reward_std": 0.04790935292840004, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.37536269426345825, "rewards/logprob_reward/std": 0.3345555067062378, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 674.09375, "completions/mean_terminated_length": 609.2963256835938, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 4.570987654320987, "grad_norm": 1.6075455872323177, "kl": 0.623046875, "learning_rate": 9.733632600440245e-09, "loss": -0.1436, "num_tokens": 40682434.0, "reward": 0.44423210620880127, "reward_std": 0.1672268956899643, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.4033135175704956, "rewards/logprob_reward/std": 0.3367186188697815, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 614.53125, "completions/mean_terminated_length": 572.1724243164062, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 4.574074074074074, "grad_norm": 1.5296954279606865, "kl": 0.66455078125, "learning_rate": 9.595970631710248e-09, "loss": -0.0168, "num_tokens": 40708923.0, "reward": 0.2956441044807434, "reward_std": 0.06040128692984581, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2312711924314499, "rewards/logprob_reward/std": 0.27165836095809937, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 571.21875, "completions/mean_terminated_length": 571.21875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 4.577160493827161, "grad_norm": 1.6499513956095813, "kl": 0.638671875, "learning_rate": 9.459270029454986e-09, "loss": -0.1232, "num_tokens": 40734058.0, "reward": 0.5047092437744141, "reward_std": 0.06176898628473282, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.4566214084625244, "rewards/logprob_reward/std": 0.30148598551750183, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 646.125, "completions/mean_terminated_length": 607.0344848632812, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 4.580246913580247, "grad_norm": 1.5107721215192704, "kl": 0.63232421875, "learning_rate": 9.323531340334868e-09, "loss": -0.0342, "num_tokens": 40760866.0, "reward": 0.5694421529769897, "reward_std": 0.181858628988266, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5320190191268921, "rewards/logprob_reward/std": 0.32316121459007263, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 578.3125, "completions/mean_terminated_length": 563.9354858398438, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 4.583333333333333, "grad_norm": 1.3967409574999627, "kl": 0.6806640625, "learning_rate": 9.188755107163743e-09, "loss": -0.1643, "num_tokens": 40785148.0, "reward": 0.4511399269104004, "reward_std": 0.07510495185852051, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.39709991216659546, "rewards/logprob_reward/std": 0.3301622271537781, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 664.8125, "completions/mean_terminated_length": 581.923095703125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 4.58641975308642, "grad_norm": 1.4731476567317443, "kl": 0.623046875, "learning_rate": 9.054941868906513e-09, "loss": -0.033, "num_tokens": 40812738.0, "reward": 0.25546595454216003, "reward_std": 0.09555266797542572, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.20051774382591248, "rewards/logprob_reward/std": 0.30628281831741333, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 595.03125, "completions/mean_terminated_length": 533.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 4.589506172839506, "grad_norm": 1.4326004003352668, "kl": 0.671875, "learning_rate": 8.922092160677242e-09, "loss": 0.0086, "num_tokens": 40837971.0, "reward": 0.4335731267929077, "reward_std": 0.10900871455669403, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.39147013425827026, "rewards/logprob_reward/std": 0.34734514355659485, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 619.09375, "completions/mean_terminated_length": 561.25, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 4.592592592592593, "grad_norm": 1.819844215076157, "kl": 0.71435546875, "learning_rate": 8.79020651373677e-09, "loss": -0.0461, "num_tokens": 40863990.0, "reward": 0.4046160578727722, "reward_std": 0.12718471884727478, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3558233976364136, "rewards/logprob_reward/std": 0.3138044476509094, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 622.78125, "completions/mean_terminated_length": 565.4642944335938, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 4.595679012345679, "grad_norm": 1.7731322596412393, "kl": 0.66015625, "learning_rate": 8.659285455490745e-09, "loss": -0.1396, "num_tokens": 40890791.0, "reward": 0.4081113934516907, "reward_std": 0.14348728954792023, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3597070872783661, "rewards/logprob_reward/std": 0.3183004558086395, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 642.96875, "completions/mean_terminated_length": 588.5357666015625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 4.598765432098766, "grad_norm": 1.7212812139320637, "kl": 0.6875, "learning_rate": 8.529329509487455e-09, "loss": -0.1004, "num_tokens": 40917726.0, "reward": 0.3084024488925934, "reward_std": 0.07413250207901001, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.24544718861579895, "rewards/logprob_reward/std": 0.3176335394382477, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 601.1875, "completions/mean_terminated_length": 601.1875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 4.601851851851852, "grad_norm": 1.7211416414533496, "kl": 0.6611328125, "learning_rate": 8.400339195415718e-09, "loss": 0.0017, "num_tokens": 40943456.0, "reward": 0.41962742805480957, "reward_std": 0.012808471918106079, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.35514160990715027, "rewards/logprob_reward/std": 0.32331448793411255, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 558.6875, "completions/mean_terminated_length": 510.5517272949219, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 4.604938271604938, "grad_norm": 1.6319037803649934, "kl": 0.71533203125, "learning_rate": 8.272315029102888e-09, "loss": -0.1367, "num_tokens": 40968482.0, "reward": 0.30874210596084595, "reward_std": 0.1298351287841797, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24929681420326233, "rewards/logprob_reward/std": 0.2748928666114807, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 588.78125, "completions/mean_terminated_length": 559.7667236328125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 4.6080246913580245, "grad_norm": 1.8231492478850215, "kl": 0.7060546875, "learning_rate": 8.145257522512606e-09, "loss": -0.0893, "num_tokens": 40994227.0, "reward": 0.3646899163722992, "reward_std": 0.03592357039451599, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.30104437470436096, "rewards/logprob_reward/std": 0.3198344111442566, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 667.8125, "completions/mean_terminated_length": 601.8518676757812, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 4.611111111111111, "grad_norm": 1.4531836831954894, "kl": 0.638671875, "learning_rate": 8.019167183743041e-09, "loss": -0.0868, "num_tokens": 41022701.0, "reward": 0.3338848948478699, "reward_std": 0.08131659030914307, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.27376097440719604, "rewards/logprob_reward/std": 0.2541194558143616, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 621.125, "completions/mean_terminated_length": 608.1290283203125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.614197530864198, "grad_norm": 1.493420581233994, "kl": 0.66162109375, "learning_rate": 7.89404451702455e-09, "loss": -0.0387, "num_tokens": 41049549.0, "reward": 0.3793020248413086, "reward_std": 0.07093176245689392, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.31380778551101685, "rewards/logprob_reward/std": 0.2836196720600128, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 607.59375, "completions/mean_terminated_length": 564.5172119140625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 4.617283950617284, "grad_norm": 1.4318825707060316, "kl": 0.58154296875, "learning_rate": 7.769890022717884e-09, "loss": -0.05, "num_tokens": 41076596.0, "reward": 0.500262439250946, "reward_std": 0.15359503030776978, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4586249589920044, "rewards/logprob_reward/std": 0.3225168287754059, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 691.96875, "completions/mean_terminated_length": 615.34619140625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 4.62037037037037, "grad_norm": 1.7618698300989906, "kl": 0.616943359375, "learning_rate": 7.646704197312143e-09, "loss": -0.0126, "num_tokens": 41105423.0, "reward": 0.3334507346153259, "reward_std": 0.11642857640981674, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.28022301197052, "rewards/logprob_reward/std": 0.3062838613986969, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 586.875, "completions/mean_terminated_length": 557.7333374023438, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 4.6234567901234565, "grad_norm": 1.6493739280585693, "kl": 0.6337890625, "learning_rate": 7.524487533422635e-09, "loss": -0.0357, "num_tokens": 41130555.0, "reward": 0.3066996932029724, "reward_std": 0.056417424231767654, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2366107702255249, "rewards/logprob_reward/std": 0.30135732889175415, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 654.25, "completions/mean_terminated_length": 585.7777709960938, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 4.6265432098765435, "grad_norm": 1.5876229534703301, "kl": 0.631103515625, "learning_rate": 7.403240519789161e-09, "loss": -0.1163, "num_tokens": 41157907.0, "reward": 0.344829797744751, "reward_std": 0.10202877223491669, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.2963387072086334, "rewards/logprob_reward/std": 0.30353811383247375, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 624.5, "completions/mean_terminated_length": 597.86669921875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 4.62962962962963, "grad_norm": 1.4245396355140885, "kl": 0.64404296875, "learning_rate": 7.282963641273842e-09, "loss": -0.0925, "num_tokens": 41185179.0, "reward": 0.4962606430053711, "reward_std": 0.0863039568066597, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.45070627331733704, "rewards/logprob_reward/std": 0.31232988834381104, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 573.53125, "completions/mean_terminated_length": 490.1111145019531, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.632716049382716, "grad_norm": 1.8012320147283722, "kl": 0.67822265625, "learning_rate": 7.163657378859267e-09, "loss": 0.1265, "num_tokens": 41209540.0, "reward": 0.2950032651424408, "reward_std": 0.07470333576202393, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.23750363290309906, "rewards/logprob_reward/std": 0.30755895376205444, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 616.1875, "completions/mean_terminated_length": 557.9285888671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 4.635802469135802, "grad_norm": 1.6075067476327032, "kl": 0.644287109375, "learning_rate": 7.045322209646654e-09, "loss": -0.128, "num_tokens": 41235710.0, "reward": 0.3259797990322113, "reward_std": 0.16804982721805573, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2684497833251953, "rewards/logprob_reward/std": 0.3268306255340576, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 692.875, "completions/mean_terminated_length": 616.4615478515625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 4.638888888888889, "grad_norm": 1.483801868573726, "kl": 0.599365234375, "learning_rate": 6.927958606853746e-09, "loss": -0.1378, "num_tokens": 41264774.0, "reward": 0.21084165573120117, "reward_std": 0.05331305414438248, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.15093518793582916, "rewards/logprob_reward/std": 0.31149551272392273, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 565.09375, "completions/mean_terminated_length": 550.290283203125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 4.6419753086419755, "grad_norm": 1.4154145026386424, "kl": 0.640625, "learning_rate": 6.811567039813087e-09, "loss": -0.0364, "num_tokens": 41289409.0, "reward": 0.4211810231208801, "reward_std": 0.02735307440161705, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.36034005880355835, "rewards/logprob_reward/std": 0.3731338381767273, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 663.46875, "completions/mean_terminated_length": 626.1724243164062, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.645061728395062, "grad_norm": 1.4389618916919193, "kl": 0.61572265625, "learning_rate": 6.696147973970112e-09, "loss": 0.0047, "num_tokens": 41316932.0, "reward": 0.3250003159046173, "reward_std": 0.12398744374513626, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.26736146211624146, "rewards/logprob_reward/std": 0.3024098873138428, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 632.4375, "completions/mean_terminated_length": 591.9310302734375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 4.648148148148148, "grad_norm": 1.6689090368324258, "kl": 0.66455078125, "learning_rate": 6.581701870881196e-09, "loss": 0.0303, "num_tokens": 41343502.0, "reward": 0.3835676908493042, "reward_std": 0.13179218769073486, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.33243632316589355, "rewards/logprob_reward/std": 0.34144821763038635, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 680.09375, "completions/mean_terminated_length": 600.7307739257812, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 4.651234567901234, "grad_norm": 1.5133017455409972, "kl": 0.66943359375, "learning_rate": 6.4682291882119375e-09, "loss": -0.0526, "num_tokens": 41372173.0, "reward": 0.3423236608505249, "reward_std": 0.09250175952911377, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.29008185863494873, "rewards/logprob_reward/std": 0.2897256314754486, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 600.25, "completions/mean_terminated_length": 572.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 4.654320987654321, "grad_norm": 1.5996844194535533, "kl": 0.6162109375, "learning_rate": 6.355730379735219e-09, "loss": 0.0431, "num_tokens": 41397797.0, "reward": 0.36124861240386963, "reward_std": 0.04813811928033829, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.29722070693969727, "rewards/logprob_reward/std": 0.2970229685306549, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 636.21875, "completions/mean_terminated_length": 546.7307739257812, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 4.657407407407407, "grad_norm": 1.733294398952414, "kl": 0.6630859375, "learning_rate": 6.244205895329452e-09, "loss": -0.028, "num_tokens": 41424848.0, "reward": 0.24957111477851868, "reward_std": 0.11390677839517593, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.18702346086502075, "rewards/logprob_reward/std": 0.2827208936214447, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 605.90625, "completions/mean_terminated_length": 578.0333862304688, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 4.660493827160494, "grad_norm": 6.168337890513164, "kl": 3.60595703125, "learning_rate": 6.133656180976776e-09, "loss": -0.0627, "num_tokens": 41450921.0, "reward": 0.3672724664211273, "reward_std": 0.1061067208647728, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3073860704898834, "rewards/logprob_reward/std": 0.3036520481109619, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 649.1875, "completions/mean_terminated_length": 595.6428833007812, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 4.66358024691358, "grad_norm": 1.4983924405899482, "kl": 0.61328125, "learning_rate": 6.024081678761228e-09, "loss": -0.0951, "num_tokens": 41478115.0, "reward": 0.3306097984313965, "reward_std": 0.09728977084159851, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2701220214366913, "rewards/logprob_reward/std": 0.30373623967170715, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 614.40625, "completions/mean_terminated_length": 587.1000366210938, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 4.666666666666667, "grad_norm": 1.3684899654745197, "kl": 0.6044921875, "learning_rate": 5.915482826867047e-09, "loss": -0.1285, "num_tokens": 41503884.0, "reward": 0.47588223218917847, "reward_std": 0.11659595370292664, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4280635714530945, "rewards/logprob_reward/std": 0.27764904499053955, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 608.9375, "completions/mean_terminated_length": 595.54833984375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.669753086419753, "grad_norm": 1.3335124833517444, "kl": 0.68798828125, "learning_rate": 5.807860059576841e-09, "loss": -0.1538, "num_tokens": 41530466.0, "reward": 0.32042714953422546, "reward_std": 0.10971537232398987, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2588079571723938, "rewards/logprob_reward/std": 0.2939283549785614, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 598.34375, "completions/mean_terminated_length": 500.11541748046875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 4.672839506172839, "grad_norm": 1.4844994149405657, "kl": 0.607666015625, "learning_rate": 5.701213807269956e-09, "loss": -0.0105, "num_tokens": 41556245.0, "reward": 0.3553958535194397, "reward_std": 0.22128954529762268, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.30807870626449585, "rewards/logprob_reward/std": 0.34947848320007324, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 654.375, "completions/mean_terminated_length": 569.0769653320312, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 4.675925925925926, "grad_norm": 1.7105351418205397, "kl": 0.65478515625, "learning_rate": 5.5955444964206345e-09, "loss": 0.0398, "num_tokens": 41583873.0, "reward": 0.41713327169418335, "reward_std": 0.1725083589553833, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3766758441925049, "rewards/logprob_reward/std": 0.344534307718277, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 576.0625, "completions/mean_terminated_length": 546.2000122070312, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 4.679012345679013, "grad_norm": 1.7510031457974278, "kl": 0.62744140625, "learning_rate": 5.490852549596387e-09, "loss": -0.0344, "num_tokens": 41608935.0, "reward": 0.3712571859359741, "reward_std": 0.09945842623710632, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.30834129452705383, "rewards/logprob_reward/std": 0.2861432731151581, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 618.1875, "completions/mean_terminated_length": 560.2142944335938, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 4.682098765432099, "grad_norm": 1.6714282006825838, "kl": 0.6826171875, "learning_rate": 5.387138385456319e-09, "loss": -0.057, "num_tokens": 41635229.0, "reward": 0.2647513151168823, "reward_std": 0.0642758160829544, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.196945920586586, "rewards/logprob_reward/std": 0.29207462072372437, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 660.1875, "completions/mean_terminated_length": 608.2142944335938, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 4.685185185185185, "grad_norm": 1.5885516970104256, "kl": 0.670654296875, "learning_rate": 5.284402418749362e-09, "loss": -0.0135, "num_tokens": 41662259.0, "reward": 0.39571478962898254, "reward_std": 0.05865626409649849, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.35287758708000183, "rewards/logprob_reward/std": 0.3080309331417084, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 635.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 4.688271604938271, "grad_norm": 1.622107423044986, "kl": 0.6044921875, "learning_rate": 5.182645060312685e-09, "loss": -0.0373, "num_tokens": 41690755.0, "reward": 0.3516530394554138, "reward_std": 0.12066973745822906, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3004477918148041, "rewards/logprob_reward/std": 0.3123582899570465, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 711.71875, "completions/mean_terminated_length": 624.2799682617188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 4.6913580246913575, "grad_norm": 1.5621313025535155, "kl": 0.61865234375, "learning_rate": 5.081866717070088e-09, "loss": -0.1356, "num_tokens": 41720126.0, "reward": 0.25725945830345154, "reward_std": 0.07388924807310104, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2025105208158493, "rewards/logprob_reward/std": 0.307766854763031, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 582.71875, "completions/mean_terminated_length": 568.4838256835938, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 4.694444444444445, "grad_norm": 1.5873027569302531, "kl": 0.62939453125, "learning_rate": 4.9820677920302534e-09, "loss": -0.0442, "num_tokens": 41745081.0, "reward": 0.35769394040107727, "reward_std": 0.0397963672876358, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.2932710349559784, "rewards/logprob_reward/std": 0.30971822142601013, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 647.96875, "completions/mean_terminated_length": 578.3333129882812, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 4.697530864197531, "grad_norm": 1.5099399423152358, "kl": 0.673828125, "learning_rate": 4.883248684285302e-09, "loss": -0.2014, "num_tokens": 41772216.0, "reward": 0.2648606598377228, "reward_std": 0.11817566305398941, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.20401184260845184, "rewards/logprob_reward/std": 0.26747146248817444, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 631.9375, "completions/mean_terminated_length": 541.4615478515625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 4.700617283950617, "grad_norm": 1.625273532127586, "kl": 0.66064453125, "learning_rate": 4.785409789008988e-09, "loss": -0.0151, "num_tokens": 41798994.0, "reward": 0.3513917922973633, "reward_std": 0.13913173973560333, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2966853082180023, "rewards/logprob_reward/std": 0.25738972425460815, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 646.53125, "completions/mean_terminated_length": 576.629638671875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 4.703703703703704, "grad_norm": 1.5725672889336315, "kl": 0.6669921875, "learning_rate": 4.68855149745534e-09, "loss": 0.0099, "num_tokens": 41826343.0, "reward": 0.3599196970462799, "reward_std": 0.15665298700332642, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3061607778072357, "rewards/logprob_reward/std": 0.2610715925693512, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 568.78125, "completions/mean_terminated_length": 554.0967407226562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 4.70679012345679, "grad_norm": 1.675956471985648, "kl": 0.68310546875, "learning_rate": 4.592674196956914e-09, "loss": -0.1422, "num_tokens": 41850500.0, "reward": 0.46250051259994507, "reward_std": 0.09865938127040863, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.4131949841976166, "rewards/logprob_reward/std": 0.33078107237815857, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 560.125, "completions/mean_terminated_length": 560.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 4.709876543209877, "grad_norm": 1.6249253642803203, "kl": 0.6572265625, "learning_rate": 4.497778270923374e-09, "loss": -0.1287, "num_tokens": 41875660.0, "reward": 0.5072196125984192, "reward_std": 0.09360355138778687, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4559384286403656, "rewards/logprob_reward/std": 0.3151721656322479, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 638.75, "completions/mean_terminated_length": 583.7142944335938, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 4.712962962962963, "grad_norm": 1.7184145585782873, "kl": 0.6845703125, "learning_rate": 4.403864098839833e-09, "loss": -0.0992, "num_tokens": 41902268.0, "reward": 0.4586626887321472, "reward_std": 0.13698400557041168, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4158751964569092, "rewards/logprob_reward/std": 0.3525381088256836, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 650.46875, "completions/mean_terminated_length": 581.2963256835938, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 4.716049382716049, "grad_norm": 1.7471511019566524, "kl": 0.638427734375, "learning_rate": 4.31093205626551e-09, "loss": -0.0263, "num_tokens": 41929683.0, "reward": 0.3591461777687073, "reward_std": 0.10843110829591751, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30877357721328735, "rewards/logprob_reward/std": 0.31344476342201233, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 655.71875, "completions/mean_terminated_length": 587.5184936523438, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 4.719135802469136, "grad_norm": 1.7391929447779988, "kl": 0.75, "learning_rate": 4.218982514832048e-09, "loss": 0.0147, "num_tokens": 41957106.0, "reward": 0.3091885447502136, "reward_std": 0.09400251507759094, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.24979284405708313, "rewards/logprob_reward/std": 0.31749820709228516, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 656.03125, "completions/mean_terminated_length": 571.1154174804688, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 4.722222222222222, "grad_norm": 1.5899278237604269, "kl": 0.628662109375, "learning_rate": 4.128015842242122e-09, "loss": -0.1412, "num_tokens": 41984275.0, "reward": 0.21665401756763458, "reward_std": 0.14132598042488098, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.15392112731933594, "rewards/logprob_reward/std": 0.2848603129386902, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 677.875, "completions/mean_terminated_length": 628.4285888671875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 4.7253086419753085, "grad_norm": 1.4715311674593028, "kl": 0.62353515625, "learning_rate": 4.0380324022679935e-09, "loss": -0.1363, "num_tokens": 42012439.0, "reward": 0.3621186316013336, "reward_std": 0.12233404070138931, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.30513185262680054, "rewards/logprob_reward/std": 0.3452848792076111, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 692.3125, "completions/mean_terminated_length": 599.4400024414062, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 4.728395061728395, "grad_norm": 1.5051708284228094, "kl": 0.6455078125, "learning_rate": 3.9490325547499316e-09, "loss": -0.0164, "num_tokens": 42041213.0, "reward": 0.5136197805404663, "reward_std": 0.18795272707939148, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4838831424713135, "rewards/logprob_reward/std": 0.4486639201641083, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 653.65625, "completions/mean_terminated_length": 600.75, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 4.731481481481482, "grad_norm": 1.586156201252431, "kl": 0.64501953125, "learning_rate": 3.861016655594962e-09, "loss": -0.0179, "num_tokens": 42068766.0, "reward": 0.32601824402809143, "reward_std": 0.14114752411842346, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2719647288322449, "rewards/logprob_reward/std": 0.2896358370780945, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 656.65625, "completions/mean_terminated_length": 618.6551513671875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 4.734567901234568, "grad_norm": 1.704712197631911, "kl": 0.646484375, "learning_rate": 3.773985056775258e-09, "loss": -0.0531, "num_tokens": 42096511.0, "reward": 0.4290062189102173, "reward_std": 0.1015857681632042, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.38292354345321655, "rewards/logprob_reward/std": 0.37177774310112, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 604.59375, "completions/mean_terminated_length": 604.59375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 4.737654320987654, "grad_norm": 1.556511398555781, "kl": 0.6826171875, "learning_rate": 3.68793810632681e-09, "loss": -0.0159, "num_tokens": 42122022.0, "reward": 0.39002320170402527, "reward_std": 0.03621188551187515, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.32224804162979126, "rewards/logprob_reward/std": 0.30544522404670715, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 664.78125, "completions/mean_terminated_length": 545.0416870117188, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 4.7407407407407405, "grad_norm": 1.7227840574259345, "kl": 0.68310546875, "learning_rate": 3.602876148348116e-09, "loss": -0.0403, "num_tokens": 42149539.0, "reward": 0.371837317943573, "reward_std": 0.19306811690330505, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3298192024230957, "rewards/logprob_reward/std": 0.3151124119758606, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 609.96875, "completions/mean_terminated_length": 567.137939453125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 4.743827160493828, "grad_norm": 1.4319134046741784, "kl": 0.640380859375, "learning_rate": 3.518799522998661e-09, "loss": 0.0042, "num_tokens": 42175934.0, "reward": 0.34102851152420044, "reward_std": 0.10027420520782471, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2816983461380005, "rewards/logprob_reward/std": 0.2825404703617096, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 547.875, "completions/mean_terminated_length": 532.51611328125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 4.746913580246914, "grad_norm": 1.894347891841536, "kl": 0.72021484375, "learning_rate": 3.435708566497608e-09, "loss": -0.104, "num_tokens": 42199430.0, "reward": 0.38753411173820496, "reward_std": 0.06984757632017136, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.33337122201919556, "rewards/logprob_reward/std": 0.30069461464881897, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 622.34375, "completions/mean_terminated_length": 564.9642944335938, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 4.75, "grad_norm": 1.755280682776025, "kl": 0.66357421875, "learning_rate": 3.353603611122524e-09, "loss": -0.0631, "num_tokens": 42225365.0, "reward": 0.40157532691955566, "reward_std": 0.14443372189998627, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3559170365333557, "rewards/logprob_reward/std": 0.35512664914131165, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 620.71875, "completions/mean_terminated_length": 593.8333740234375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 4.753086419753086, "grad_norm": 1.775404524839957, "kl": 0.651611328125, "learning_rate": 3.2724849852079628e-09, "loss": -0.1086, "num_tokens": 42252056.0, "reward": 0.5054649114608765, "reward_std": 0.1092282384634018, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.45746105909347534, "rewards/logprob_reward/std": 0.32872462272644043, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 577.40625, "completions/mean_terminated_length": 494.7037048339844, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 4.756172839506172, "grad_norm": 1.7565976314302731, "kl": 0.693359375, "learning_rate": 3.192353013144189e-09, "loss": -0.0522, "num_tokens": 42277069.0, "reward": 0.3600865304470062, "reward_std": 0.15828199684619904, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3063461482524872, "rewards/logprob_reward/std": 0.37334975600242615, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 628.9375, "completions/mean_terminated_length": 572.5, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 4.7592592592592595, "grad_norm": 1.4190252730715356, "kl": 0.60400390625, "learning_rate": 3.113208015375901e-09, "loss": -0.0443, "num_tokens": 42303675.0, "reward": 0.3730033040046692, "reward_std": 0.09320180863142014, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3172259032726288, "rewards/logprob_reward/std": 0.3665897250175476, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 572.96875, "completions/mean_terminated_length": 526.3103637695312, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 4.762345679012346, "grad_norm": 1.7238888838960205, "kl": 0.684326171875, "learning_rate": 3.0350503084008995e-09, "loss": -0.0729, "num_tokens": 42328422.0, "reward": 0.4364106357097626, "reward_std": 0.0629199892282486, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.38420623540878296, "rewards/logprob_reward/std": 0.3362131118774414, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 655.90625, "completions/mean_terminated_length": 603.3214721679688, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.765432098765432, "grad_norm": 1.5349157218287044, "kl": 0.630615234375, "learning_rate": 2.957880204768809e-09, "loss": -0.1307, "num_tokens": 42355735.0, "reward": 0.3615562319755554, "reward_std": 0.23643046617507935, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.31839579343795776, "rewards/logprob_reward/std": 0.3221658170223236, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 669.9375, "completions/mean_terminated_length": 646.3333740234375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 4.768518518518518, "grad_norm": 1.5516454218044233, "kl": 0.6533203125, "learning_rate": 2.8816980130799418e-09, "loss": 0.0084, "num_tokens": 42383457.0, "reward": 0.4233872890472412, "reward_std": 0.09984374046325684, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3697358965873718, "rewards/logprob_reward/std": 0.3061535656452179, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 533.9375, "completions/mean_terminated_length": 501.2666931152344, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 4.771604938271605, "grad_norm": 1.8070875305658787, "kl": 0.6767578125, "learning_rate": 2.806504037983992e-09, "loss": -0.0291, "num_tokens": 42406855.0, "reward": 0.44650810956954956, "reward_std": 0.09535175561904907, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3919534981250763, "rewards/logprob_reward/std": 0.37066373229026794, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 692.125, "completions/mean_terminated_length": 615.5385131835938, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 4.7746913580246915, "grad_norm": 1.797432036767165, "kl": 0.65771484375, "learning_rate": 2.7322985801787046e-09, "loss": 0.0474, "num_tokens": 42435667.0, "reward": 0.3551892638206482, "reward_std": 0.1964063048362732, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3043769896030426, "rewards/logprob_reward/std": 0.32449349761009216, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 597.8125, "completions/mean_terminated_length": 553.72412109375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.777777777777778, "grad_norm": 1.4772157903734902, "kl": 0.60595703125, "learning_rate": 2.6590819364088746e-09, "loss": 0.0078, "num_tokens": 42460889.0, "reward": 0.33964604139328003, "reward_std": 0.103782519698143, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2766900658607483, "rewards/logprob_reward/std": 0.2778541147708893, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 603.40625, "completions/mean_terminated_length": 575.36669921875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 4.780864197530864, "grad_norm": 1.4791205584414882, "kl": 0.704833984375, "learning_rate": 2.5868543994650993e-09, "loss": -0.0986, "num_tokens": 42486510.0, "reward": 0.4038945734500885, "reward_std": 0.08974959701299667, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.35154953598976135, "rewards/logprob_reward/std": 0.3251228928565979, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 656.6875, "completions/mean_terminated_length": 571.923095703125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 4.783950617283951, "grad_norm": 1.493193476441432, "kl": 0.5927734375, "learning_rate": 2.5156162581824736e-09, "loss": -0.0793, "num_tokens": 42514416.0, "reward": 0.3684358298778534, "reward_std": 0.0907871425151825, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.3260398507118225, "rewards/logprob_reward/std": 0.36973345279693604, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 614.875, "completions/mean_terminated_length": 601.6774291992188, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 4.787037037037037, "grad_norm": 1.530597463148744, "kl": 0.632568359375, "learning_rate": 2.44536779743959e-09, "loss": -0.1787, "num_tokens": 42540476.0, "reward": 0.36482325196266174, "reward_std": 0.13654784858226776, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3046647310256958, "rewards/logprob_reward/std": 0.3189173638820648, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 676.4375, "completions/mean_terminated_length": 612.0740966796875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 4.790123456790123, "grad_norm": 1.3305308912800542, "kl": 0.60400390625, "learning_rate": 2.376109298157347e-09, "loss": -0.0958, "num_tokens": 42568730.0, "reward": 0.4558490216732025, "reward_std": 0.2048996090888977, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.41622111201286316, "rewards/logprob_reward/std": 0.3511371910572052, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 653.25, "completions/mean_terminated_length": 614.8965454101562, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 4.79320987654321, "grad_norm": 1.5757238777961142, "kl": 0.63134765625, "learning_rate": 2.3078410372978084e-09, "loss": -0.136, "num_tokens": 42596846.0, "reward": 0.3413069546222687, "reward_std": 0.08404748886823654, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.28547996282577515, "rewards/logprob_reward/std": 0.3061901032924652, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 567.125, "completions/mean_terminated_length": 501.857177734375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 4.796296296296296, "grad_norm": 1.7136641501300027, "kl": 0.648193359375, "learning_rate": 2.240563287863151e-09, "loss": -0.1047, "num_tokens": 42621226.0, "reward": 0.47114601731300354, "reward_std": 0.1227567121386528, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.42974555492401123, "rewards/logprob_reward/std": 0.33213141560554504, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 591.9375, "completions/mean_terminated_length": 530.2142944335938, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 4.799382716049383, "grad_norm": 1.4953795149690972, "kl": 0.6552734375, "learning_rate": 2.174276318894497e-09, "loss": -0.0618, "num_tokens": 42646844.0, "reward": 0.3497818410396576, "reward_std": 0.15614914894104004, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2983686923980713, "rewards/logprob_reward/std": 0.2982739210128784, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 627.1875, "completions/mean_terminated_length": 570.5, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 4.802469135802469, "grad_norm": 1.6246308474843663, "kl": 0.62646484375, "learning_rate": 2.1089803954708884e-09, "loss": -0.0758, "num_tokens": 42673182.0, "reward": 0.2838028073310852, "reward_std": 0.15158604085445404, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.2250586450099945, "rewards/logprob_reward/std": 0.3231425881385803, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 640.28125, "completions/mean_terminated_length": 569.2222290039062, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 4.805555555555555, "grad_norm": 1.6494319724381075, "kl": 0.666748046875, "learning_rate": 2.0446757787082324e-09, "loss": 0.0053, "num_tokens": 42699731.0, "reward": 0.23637640476226807, "reward_std": 0.09682808816432953, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.16889044642448425, "rewards/logprob_reward/std": 0.27291402220726013, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 635.28125, "completions/mean_terminated_length": 609.36669921875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 4.8086419753086425, "grad_norm": 1.5753589042425373, "kl": 0.68994140625, "learning_rate": 1.98136272575819e-09, "loss": 0.0186, "num_tokens": 42726376.0, "reward": 0.3483109474182129, "reward_std": 0.08099912106990814, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.28284549713134766, "rewards/logprob_reward/std": 0.3294100761413574, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 688.125, "completions/mean_terminated_length": 594.0800170898438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 4.811728395061729, "grad_norm": 1.565992571681475, "kl": 0.671875, "learning_rate": 1.919041489807233e-09, "loss": -0.1171, "num_tokens": 42754976.0, "reward": 0.2515946328639984, "reward_std": 0.13924452662467957, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.19621628522872925, "rewards/logprob_reward/std": 0.25123703479766846, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 689.21875, "completions/mean_terminated_length": 611.9615478515625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 4.814814814814815, "grad_norm": 1.4581199482028246, "kl": 0.589111328125, "learning_rate": 1.857712320075616e-09, "loss": -0.0631, "num_tokens": 42783563.0, "reward": 0.34005945920944214, "reward_std": 0.23700174689292908, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.29798272252082825, "rewards/logprob_reward/std": 0.35764217376708984, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 664.75, "completions/mean_terminated_length": 627.586181640625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 4.817901234567901, "grad_norm": 1.4876748602985774, "kl": 0.640625, "learning_rate": 1.7973754618162972e-09, "loss": -0.1111, "num_tokens": 42811279.0, "reward": 0.32135793566703796, "reward_std": 0.13363397121429443, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2633143663406372, "rewards/logprob_reward/std": 0.2964593470096588, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 695.40625, "completions/mean_terminated_length": 619.5769653320312, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 4.820987654320987, "grad_norm": 1.5989340172254636, "kl": 0.64306640625, "learning_rate": 1.7380311563140737e-09, "loss": -0.0844, "num_tokens": 42840592.0, "reward": 0.45801758766174316, "reward_std": 0.13174057006835938, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.4221028685569763, "rewards/logprob_reward/std": 0.3762233853340149, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 685.40625, "completions/mean_terminated_length": 650.3793334960938, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 4.824074074074074, "grad_norm": 1.440066890426954, "kl": 0.605712890625, "learning_rate": 1.6796796408845292e-09, "loss": -0.1295, "num_tokens": 42869561.0, "reward": 0.31659525632858276, "reward_std": 0.0983721986413002, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.2545502781867981, "rewards/logprob_reward/std": 0.3353313207626343, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 661.78125, "completions/mean_terminated_length": 610.0357666015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 4.827160493827161, "grad_norm": 1.6091801071320333, "kl": 0.65576171875, "learning_rate": 1.622321148873146e-09, "loss": -0.0971, "num_tokens": 42897586.0, "reward": 0.364992618560791, "reward_std": 0.20443648099899292, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3152695894241333, "rewards/logprob_reward/std": 0.3166876435279846, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 562.21875, "completions/mean_terminated_length": 562.21875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 4.830246913580247, "grad_norm": 1.5469842917984746, "kl": 0.64794921875, "learning_rate": 1.5659559096543318e-09, "loss": -0.0884, "num_tokens": 42921913.0, "reward": 0.461954802274704, "reward_std": 0.1011238843202591, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4160609245300293, "rewards/logprob_reward/std": 0.33176761865615845, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 623.9375, "completions/mean_terminated_length": 549.8518676757812, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 4.833333333333333, "grad_norm": 1.6938354439633045, "kl": 0.7265625, "learning_rate": 1.5105841486304783e-09, "loss": -0.0493, "num_tokens": 42947987.0, "reward": 0.5095148086547852, "reward_std": 0.20620262622833252, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.47237759828567505, "rewards/logprob_reward/std": 0.33899182081222534, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 613.8125, "completions/mean_terminated_length": 571.3793334960938, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 4.83641975308642, "grad_norm": 1.578695114841943, "kl": 0.63623046875, "learning_rate": 1.456206087231182e-09, "loss": -0.0941, "num_tokens": 42974325.0, "reward": 0.3753373324871063, "reward_std": 0.08581893891096115, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.32329148054122925, "rewards/logprob_reward/std": 0.2929462790489197, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 639.03125, "completions/mean_terminated_length": 584.0357666015625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.839506172839506, "grad_norm": 1.6118878290446765, "kl": 0.61328125, "learning_rate": 1.4028219429121912e-09, "loss": 0.0539, "num_tokens": 43001154.0, "reward": 0.44094985723495483, "reward_std": 0.14123091101646423, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.39272207021713257, "rewards/logprob_reward/std": 0.3036029636859894, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 649.71875, "completions/mean_terminated_length": 563.34619140625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 4.842592592592593, "grad_norm": 1.5022852782036442, "kl": 0.67724609375, "learning_rate": 1.350431929154655e-09, "loss": -0.1103, "num_tokens": 43029117.0, "reward": 0.2336864024400711, "reward_std": 0.05452674254775047, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.1693737953901291, "rewards/logprob_reward/std": 0.31718847155570984, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 632.6875, "completions/mean_terminated_length": 606.6000366210938, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 4.845679012345679, "grad_norm": 1.7937547557142355, "kl": 0.65283203125, "learning_rate": 1.2990362554642087e-09, "loss": -0.0304, "num_tokens": 43056071.0, "reward": 0.42322784662246704, "reward_std": 0.14505761861801147, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.36955875158309937, "rewards/logprob_reward/std": 0.3047729730606079, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 604.625, "completions/mean_terminated_length": 544.7142944335938, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 4.848765432098766, "grad_norm": 1.5379545617999795, "kl": 0.65234375, "learning_rate": 1.2486351273701678e-09, "loss": -0.0687, "num_tokens": 43082047.0, "reward": 0.13546256721019745, "reward_std": 0.0847892314195633, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.06023617833852768, "rewards/logprob_reward/std": 0.17540471255779266, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 574.71875, "completions/mean_terminated_length": 544.7667236328125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 4.851851851851852, "grad_norm": 1.5500797251587841, "kl": 0.69970703125, "learning_rate": 1.199228746424752e-09, "loss": -0.0841, "num_tokens": 43106506.0, "reward": 0.3590214252471924, "reward_std": 0.07751736044883728, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.2982182502746582, "rewards/logprob_reward/std": 0.3076047897338867, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 678.375, "completions/mean_terminated_length": 614.370361328125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 4.854938271604938, "grad_norm": 1.8036527803414355, "kl": 0.6904296875, "learning_rate": 1.1508173102021402e-09, "loss": -0.1509, "num_tokens": 43134502.0, "reward": 0.38690486550331116, "reward_std": 0.1738266795873642, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3430887460708618, "rewards/logprob_reward/std": 0.32528313994407654, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 643.90625, "completions/mean_terminated_length": 556.1923217773438, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 4.8580246913580245, "grad_norm": 1.5874004671027895, "kl": 0.6416015625, "learning_rate": 1.1034010122978332e-09, "loss": 0.0164, "num_tokens": 43161171.0, "reward": 0.23865287005901337, "reward_std": 0.1317855268716812, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.17836427688598633, "rewards/logprob_reward/std": 0.2758690416812897, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 672.5625, "completions/mean_terminated_length": 607.4815063476562, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 4.861111111111111, "grad_norm": 1.5245526834347425, "kl": 0.63037109375, "learning_rate": 1.0569800423277652e-09, "loss": -0.0144, "num_tokens": 43189205.0, "reward": 0.42569297552108765, "reward_std": 0.24454718828201294, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3827143907546997, "rewards/logprob_reward/std": 0.38084083795547485, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 615.15625, "completions/mean_terminated_length": 572.862060546875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 4.864197530864198, "grad_norm": 1.5853154613451157, "kl": 0.626953125, "learning_rate": 1.0115545859276098e-09, "loss": 0.0824, "num_tokens": 43215578.0, "reward": 0.5475718975067139, "reward_std": 0.11802814900875092, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.5077188014984131, "rewards/logprob_reward/std": 0.4070405662059784, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 645.09375, "completions/mean_terminated_length": 590.9642944335938, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 4.867283950617284, "grad_norm": 1.6206578126048359, "kl": 0.645751953125, "learning_rate": 9.67124824752058e-10, "loss": -0.0734, "num_tokens": 43242721.0, "reward": 0.3543761372566223, "reward_std": 0.16935056447982788, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30347347259521484, "rewards/logprob_reward/std": 0.28233858942985535, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 550.59375, "completions/mean_terminated_length": 519.0333862304688, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 4.87037037037037, "grad_norm": 1.8127164826436724, "kl": 0.68896484375, "learning_rate": 9.236909364739587e-10, "loss": -0.1437, "num_tokens": 43266716.0, "reward": 0.4635693430900574, "reward_std": 0.10048481822013855, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.41785484552383423, "rewards/logprob_reward/std": 0.32631415128707886, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 655.6875, "completions/mean_terminated_length": 603.0714721679688, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 4.8734567901234565, "grad_norm": 1.4292844619262444, "kl": 0.62353515625, "learning_rate": 8.812530947837904e-10, "loss": -0.0206, "num_tokens": 43293958.0, "reward": 0.40818583965301514, "reward_std": 0.12459313869476318, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3563176095485687, "rewards/logprob_reward/std": 0.2976675033569336, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 655.1875, "completions/mean_terminated_length": 570.0769653320312, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 4.8765432098765435, "grad_norm": 1.4669433900969446, "kl": 0.6103515625, "learning_rate": 8.39811469388857e-10, "loss": 0.0475, "num_tokens": 43321492.0, "reward": 0.3215346932411194, "reward_std": 0.1559082716703415, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.2739274501800537, "rewards/logprob_reward/std": 0.3152759075164795, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 637.96875, "completions/mean_terminated_length": 529.8800048828125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 4.87962962962963, "grad_norm": 1.5496771163584628, "kl": 0.66357421875, "learning_rate": 7.99366226012621e-10, "loss": 0.0142, "num_tokens": 43348535.0, "reward": 0.40902048349380493, "reward_std": 0.16599100828170776, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.37113386392593384, "rewards/logprob_reward/std": 0.3711528480052948, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 618.625, "completions/mean_terminated_length": 525.0769653320312, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 4.882716049382716, "grad_norm": 1.6958840218707805, "kl": 0.67724609375, "learning_rate": 7.59917526394066e-10, "loss": -0.0372, "num_tokens": 43375195.0, "reward": 0.34240061044692993, "reward_std": 0.11557981371879578, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.29016733169555664, "rewards/logprob_reward/std": 0.31696975231170654, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 643.3125, "completions/mean_terminated_length": 588.9285888671875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 4.885802469135802, "grad_norm": 1.3434155137125463, "kl": 0.63232421875, "learning_rate": 7.214655282870019e-10, "loss": -0.1683, "num_tokens": 43402017.0, "reward": 0.3140681982040405, "reward_std": 0.08269229531288147, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.255214661359787, "rewards/logprob_reward/std": 0.3142188787460327, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 648.21875, "completions/mean_terminated_length": 623.1666870117188, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 4.888888888888889, "grad_norm": 1.4595380084838694, "kl": 0.62109375, "learning_rate": 6.840103854595103e-10, "loss": -0.0571, "num_tokens": 43429052.0, "reward": 0.4436105191707611, "reward_std": 0.09996222704648972, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.3887338936328888, "rewards/logprob_reward/std": 0.3119935691356659, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 654.0625, "completions/mean_terminated_length": 601.2142944335938, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 4.8919753086419755, "grad_norm": 1.6989362938947257, "kl": 0.67041015625, "learning_rate": 6.475522476932504e-10, "loss": -0.0112, "num_tokens": 43456010.0, "reward": 0.37004151940345764, "reward_std": 0.14243920147418976, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3174072206020355, "rewards/logprob_reward/std": 0.30746331810951233, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 585.90625, "completions/mean_terminated_length": 523.3214721679688, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 4.895061728395062, "grad_norm": 1.592373687075735, "kl": 0.63525390625, "learning_rate": 6.120912607829598e-10, "loss": 0.0707, "num_tokens": 43481195.0, "reward": 0.34905314445495605, "reward_std": 0.12103253602981567, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.29061466455459595, "rewards/logprob_reward/std": 0.2826617956161499, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 718.34375, "completions/mean_terminated_length": 616.4583740234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 4.898148148148148, "grad_norm": 1.6040463182401234, "kl": 0.60791015625, "learning_rate": 5.776275665357045e-10, "loss": -0.0022, "num_tokens": 43511298.0, "reward": 0.33553004264831543, "reward_std": 0.13964848220348358, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.29642224311828613, "rewards/logprob_reward/std": 0.36444607377052307, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 618.9375, "completions/mean_terminated_length": 591.933349609375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 4.901234567901234, "grad_norm": 1.4315355621919283, "kl": 0.66162109375, "learning_rate": 5.441613027704905e-10, "loss": -0.008, "num_tokens": 43537864.0, "reward": 0.28007739782333374, "reward_std": 0.042530350387096405, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.20703044533729553, "rewards/logprob_reward/std": 0.2963643968105316, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 656.09375, "completions/mean_terminated_length": 587.9629516601562, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 4.904320987654321, "grad_norm": 1.745062753117265, "kl": 0.681640625, "learning_rate": 5.116926033176261e-10, "loss": 0.068, "num_tokens": 43565539.0, "reward": 0.4590321183204651, "reward_std": 0.11497743427753448, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.4162856936454773, "rewards/logprob_reward/std": 0.3267732560634613, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 690.375, "completions/mean_terminated_length": 613.3846435546875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 4.907407407407407, "grad_norm": 1.4743197075146945, "kl": 0.585693359375, "learning_rate": 4.802215980182212e-10, "loss": 0.0526, "num_tokens": 43594047.0, "reward": 0.4591360092163086, "reward_std": 0.13060882687568665, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.41987329721450806, "rewards/logprob_reward/std": 0.3725893497467041, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 683.0, "completions/mean_terminated_length": 634.2857666015625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 4.910493827160494, "grad_norm": 1.4890919889001153, "kl": 0.632080078125, "learning_rate": 4.4974841272357734e-10, "loss": -0.0254, "num_tokens": 43622235.0, "reward": 0.21072512865066528, "reward_std": 0.1100379079580307, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.14386126399040222, "rewards/logprob_reward/std": 0.2837538421154022, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 600.3125, "completions/mean_terminated_length": 556.4827270507812, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 4.91358024691358, "grad_norm": 1.414772685953773, "kl": 0.67724609375, "learning_rate": 4.2027316929479916e-10, "loss": 0.0045, "num_tokens": 43648045.0, "reward": 0.3801284432411194, "reward_std": 0.08835794776678085, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3320871591567993, "rewards/logprob_reward/std": 0.3552050292491913, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 595.75, "completions/mean_terminated_length": 551.4483032226562, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 4.916666666666667, "grad_norm": 1.4375056492204379, "kl": 0.65869140625, "learning_rate": 3.917959856022668e-10, "loss": -0.0496, "num_tokens": 43673501.0, "reward": 0.3692110776901245, "reward_std": 0.1579667031764984, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3130123019218445, "rewards/logprob_reward/std": 0.3046607971191406, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 663.6875, "completions/mean_terminated_length": 639.6666870117188, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 4.919753086419753, "grad_norm": 1.4572720149057106, "kl": 0.580322265625, "learning_rate": 3.6431697552510853e-10, "loss": -0.0843, "num_tokens": 43700935.0, "reward": 0.359041690826416, "reward_std": 0.15622547268867493, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3017130196094513, "rewards/logprob_reward/std": 0.33933356404304504, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 614.40625, "completions/mean_terminated_length": 572.0344848632812, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 4.922839506172839, "grad_norm": 1.589438142924742, "kl": 0.6328125, "learning_rate": 3.3783624895086795e-10, "loss": -0.0584, "num_tokens": 43727492.0, "reward": 0.3861275315284729, "reward_std": 0.17345941066741943, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3318083882331848, "rewards/logprob_reward/std": 0.3070046901702881, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 605.0625, "completions/mean_terminated_length": 561.72412109375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 4.925925925925926, "grad_norm": 1.6417989537422075, "kl": 0.64208984375, "learning_rate": 3.123539117749485e-10, "loss": -0.0412, "num_tokens": 43753762.0, "reward": 0.4776015281677246, "reward_std": 0.15066763758659363, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.43344616889953613, "rewards/logprob_reward/std": 0.43388843536376953, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 552.28125, "completions/mean_terminated_length": 537.0645141601562, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 4.929012345679013, "grad_norm": 1.695157338783966, "kl": 0.63330078125, "learning_rate": 2.8787006590022535e-10, "loss": -0.0756, "num_tokens": 43777775.0, "reward": 0.4344272315502167, "reward_std": 0.1286439448595047, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.37853026390075684, "rewards/logprob_reward/std": 0.29206913709640503, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 603.28125, "completions/mean_terminated_length": 575.2333374023438, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 4.932098765432099, "grad_norm": 1.4374319978525636, "kl": 0.66064453125, "learning_rate": 2.6438480923665627e-10, "loss": -0.0798, "num_tokens": 43802988.0, "reward": 0.4467771053314209, "reward_std": 0.08427700400352478, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3957245945930481, "rewards/logprob_reward/std": 0.3235898017883301, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 611.8125, "completions/mean_terminated_length": 569.1724243164062, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 4.935185185185185, "grad_norm": 1.545658658318958, "kl": 0.70751953125, "learning_rate": 2.418982357008936e-10, "loss": -0.1859, "num_tokens": 43829030.0, "reward": 0.4129653573036194, "reward_std": 0.15249526500701904, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.3651004135608673, "rewards/logprob_reward/std": 0.35472726821899414, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 634.75, "completions/mean_terminated_length": 579.1428833007812, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 4.938271604938271, "grad_norm": 1.383844536815906, "kl": 0.62841796875, "learning_rate": 2.2041043521586756e-10, "loss": -0.0731, "num_tokens": 43855934.0, "reward": 0.46378204226493835, "reward_std": 0.12477526068687439, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4180911183357239, "rewards/logprob_reward/std": 0.3182862401008606, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 642.84375, "completions/mean_terminated_length": 603.413818359375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 4.9413580246913575, "grad_norm": 1.6761068927767602, "kl": 0.6220703125, "learning_rate": 1.999214937104532e-10, "loss": -0.0886, "num_tokens": 43883177.0, "reward": 0.3118339776992798, "reward_std": 0.11317771673202515, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.25620439648628235, "rewards/logprob_reward/std": 0.2917734682559967, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 632.09375, "completions/mean_terminated_length": 576.107177734375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 4.944444444444445, "grad_norm": 1.4340000731178144, "kl": 0.606201171875, "learning_rate": 1.8043149311916529e-10, "loss": -0.0498, "num_tokens": 43909992.0, "reward": 0.35696110129356384, "reward_std": 0.13570229709148407, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.30634570121765137, "rewards/logprob_reward/std": 0.323677122592926, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 621.625, "completions/mean_terminated_length": 580.0, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 4.947530864197531, "grad_norm": 1.5356607149485457, "kl": 0.601806640625, "learning_rate": 1.6194051138176955e-10, "loss": -0.1287, "num_tokens": 43936780.0, "reward": 0.318225622177124, "reward_std": 0.16314074397087097, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.25983402132987976, "rewards/logprob_reward/std": 0.28212153911590576, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 619.03125, "completions/mean_terminated_length": 561.1785888671875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 4.950617283950617, "grad_norm": 1.5155947528722653, "kl": 0.60205078125, "learning_rate": 1.444486224429775e-10, "loss": 0.0098, "num_tokens": 43963437.0, "reward": 0.3745138645172119, "reward_std": 0.20273840427398682, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.3293209671974182, "rewards/logprob_reward/std": 0.3095782399177551, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 655.625, "completions/mean_terminated_length": 603.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 4.953703703703704, "grad_norm": 1.3865851718991942, "kl": 0.68603515625, "learning_rate": 1.2795589625216875e-10, "loss": -0.188, "num_tokens": 43991049.0, "reward": 0.4455931782722473, "reward_std": 0.1334584802389145, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.40135353803634644, "rewards/logprob_reward/std": 0.3162521719932556, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 572.625, "completions/mean_terminated_length": 525.9310302734375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 4.95679012345679, "grad_norm": 1.6959759191656727, "kl": 0.70947265625, "learning_rate": 1.1246239876316899e-10, "loss": -0.0328, "num_tokens": 44015933.0, "reward": 0.256577730178833, "reward_std": 0.07936423271894455, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.18786412477493286, "rewards/logprob_reward/std": 0.2755887806415558, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 649.6875, "completions/mean_terminated_length": 637.6128540039062, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 4.959876543209877, "grad_norm": 1.5092013028309084, "kl": 0.64111328125, "learning_rate": 9.796819193383376e-11, "loss": -0.139, "num_tokens": 44043287.0, "reward": 0.3289586901664734, "reward_std": 0.03399747610092163, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.26134300231933594, "rewards/logprob_reward/std": 0.2843376100063324, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 646.4375, "completions/mean_terminated_length": 592.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 4.962962962962963, "grad_norm": 1.5612554489374904, "kl": 0.62060546875, "learning_rate": 8.447333372593735e-11, "loss": -0.184, "num_tokens": 44070857.0, "reward": 0.4210704267024994, "reward_std": 0.19198304414749146, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.3775782585144043, "rewards/logprob_reward/std": 0.31015920639038086, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 606.46875, "completions/mean_terminated_length": 563.27587890625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 4.966049382716049, "grad_norm": 1.52092630232937, "kl": 0.6650390625, "learning_rate": 7.197787810492295e-11, "loss": -0.0837, "num_tokens": 44096652.0, "reward": 0.4705764055252075, "reward_std": 0.16531717777252197, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.4256404638290405, "rewards/logprob_reward/std": 0.33756667375564575, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 617.15625, "completions/mean_terminated_length": 559.0357666015625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 4.969135802469136, "grad_norm": 1.755388546982618, "kl": 0.8466796875, "learning_rate": 6.04818750396252e-11, "loss": 0.019, "num_tokens": 44122981.0, "reward": 0.2172410488128662, "reward_std": 0.09098158776760101, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.14762896299362183, "rewards/logprob_reward/std": 0.2767450511455536, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 652.90625, "completions/mean_terminated_length": 584.1851806640625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 4.972222222222222, "grad_norm": 1.4559497643413157, "kl": 0.618896484375, "learning_rate": 4.9985370502131366e-11, "loss": -0.0408, "num_tokens": 44150278.0, "reward": 0.24504435062408447, "reward_std": 0.1104205995798111, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.17852148413658142, "rewards/logprob_reward/std": 0.2081945240497589, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 676.53125, "completions/mean_terminated_length": 596.34619140625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 4.9753086419753085, "grad_norm": 1.570783471811436, "kl": 0.688720703125, "learning_rate": 4.0488406467559245e-11, "loss": -0.083, "num_tokens": 44178619.0, "reward": 0.21804635226726532, "reward_std": 0.10495812445878983, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.1554681658744812, "rewards/logprob_reward/std": 0.26869282126426697, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 664.65625, "completions/mean_terminated_length": 598.1111450195312, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 4.978395061728395, "grad_norm": 1.5314840536686318, "kl": 0.58984375, "learning_rate": 3.1991020913890723e-11, "loss": 0.0049, "num_tokens": 44206364.0, "reward": 0.42047351598739624, "reward_std": 0.1077742725610733, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.37344276905059814, "rewards/logprob_reward/std": 0.35037484765052795, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 598.875, "completions/mean_terminated_length": 538.1428833007812, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 4.981481481481482, "grad_norm": 1.8243381962913001, "kl": 0.71435546875, "learning_rate": 2.449324782183293e-11, "loss": 0.0902, "num_tokens": 44231788.0, "reward": 0.31465286016464233, "reward_std": 0.14625564217567444, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.2558642625808716, "rewards/logprob_reward/std": 0.31594035029411316, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 623.875, "completions/mean_terminated_length": 566.7142944335938, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 4.984567901234568, "grad_norm": 1.5637611155469633, "kl": 0.71337890625, "learning_rate": 1.799511717470725e-11, "loss": -0.0492, "num_tokens": 44257944.0, "reward": 0.426765501499176, "reward_std": 0.11681216955184937, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.3769616484642029, "rewards/logprob_reward/std": 0.31651821732521057, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 597.46875, "completions/mean_terminated_length": 536.5357666015625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 4.987654320987654, "grad_norm": 1.6456561336689777, "kl": 0.658203125, "learning_rate": 1.2496654958310537e-11, "loss": 0.0612, "num_tokens": 44283759.0, "reward": 0.4005149304866791, "reward_std": 0.16650116443634033, "rewards/format_reward_func/mean": 0.8125, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.35473883152008057, "rewards/logprob_reward/std": 0.2880318760871887, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 572.125, "completions/mean_terminated_length": 557.5484008789062, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 4.9907407407407405, "grad_norm": 1.5818542918317833, "kl": 0.63525390625, "learning_rate": 7.997883160748563e-12, "loss": 0.1119, "num_tokens": 44308575.0, "reward": 0.480800062417984, "reward_std": 0.059931229799985886, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.4265834391117096, "rewards/logprob_reward/std": 0.3407739996910095, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 663.125, "completions/mean_terminated_length": 625.7930908203125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 4.993827160493828, "grad_norm": 1.5961673005737749, "kl": 0.64892578125, "learning_rate": 4.4988197724360465e-12, "loss": 0.025, "num_tokens": 44336223.0, "reward": 0.4012889266014099, "reward_std": 0.07446186244487762, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.3451821208000183, "rewards/logprob_reward/std": 0.3220919072628021, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 603.3125, "completions/mean_terminated_length": 575.2667236328125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 4.996913580246914, "grad_norm": 1.61927799545433, "kl": 0.66748046875, "learning_rate": 1.9994787860133646e-12, "loss": -0.0263, "num_tokens": 44361769.0, "reward": 0.39280831813812256, "reward_std": 0.03557324409484863, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.332287073135376, "rewards/logprob_reward/std": 0.2970695197582245, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 586.0625, "completions/mean_terminated_length": 540.7586059570312, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 5.0, "grad_norm": 1.457381094634309, "kl": 0.64599609375, "learning_rate": 4.998701962355412e-13, "loss": -0.0937, "num_tokens": 44386955.0, "reward": 0.29000014066696167, "reward_std": 0.09594706445932388, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.22500015795230865, "rewards/logprob_reward/std": 0.2622636556625366, "step": 1620 }, { "epoch": 5.0, "step": 1620, "total_flos": 0.0, "train_loss": -0.10387079785937259, "train_runtime": 20092.6949, "train_samples_per_second": 0.645, "train_steps_per_second": 0.081 } ], "logging_steps": 1, "max_steps": 1620, "num_input_tokens_seen": 44386955, "num_train_epochs": 5, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }