{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 547.625, "completions/mean_terminated_length": 547.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.0030864197530864196, "grad_norm": 4.879990798151839, "kl": NaN, "learning_rate": 0.0, "loss": -0.1871, "num_tokens": 24340.0, "reward": 0.003167829941958189, "reward_std": 0.006335659883916378, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 4.7588691813871264e-05, "rewards/logprob_reward/std": 0.0002692022826522589, "step": 1 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 523.3125, "completions/mean_terminated_length": 507.1612854003906, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.006172839506172839, "grad_norm": 6.645700899096121, "kl": NaN, "learning_rate": 1.020408163265306e-08, "loss": -0.3743, "num_tokens": 47462.0, "reward": 0.0063501279801130295, "reward_std": 0.012700255960226059, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00011125343735329807, "rewards/logprob_reward/std": 0.0004910404095426202, "step": 2 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 551.0, "completions/mean_terminated_length": 535.741943359375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.009259259259259259, "grad_norm": 0.00038052978130953183, "kl": NaN, "learning_rate": 2.040816326530612e-08, "loss": 0.0, "num_tokens": 71866.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 532.75, "completions/mean_terminated_length": 500.0000305175781, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.012345679012345678, "grad_norm": 0.0005572360154994931, "kl": NaN, "learning_rate": 3.0612244897959183e-08, "loss": 0.0, "num_tokens": 95878.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 4 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 536.03125, "completions/mean_terminated_length": 536.03125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.015432098765432098, "grad_norm": 10.61183558134393, "kl": NaN, "learning_rate": 4.081632653061224e-08, "loss": -0.3743, "num_tokens": 119719.0, "reward": 0.0064407894387841225, "reward_std": 0.012881578877568245, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00021198844478931278, "rewards/logprob_reward/std": 0.0008489831234328449, "step": 5 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 544.4375, "completions/mean_terminated_length": 544.4375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.018518518518518517, "grad_norm": 0.0, "kl": NaN, "learning_rate": 5.1020408163265303e-08, "loss": 0.0, "num_tokens": 143649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 502.5, "completions/mean_terminated_length": 502.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.021604938271604937, "grad_norm": 7.27543371690155, "kl": NaN, "learning_rate": 6.122448979591837e-08, "loss": -0.3743, "num_tokens": 166089.0, "reward": 0.006321371532976627, "reward_std": 0.012642743065953255, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 7.93015060480684e-05, "rewards/logprob_reward/std": 0.0003123948990833014, "step": 7 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 524.78125, "completions/mean_terminated_length": 508.6773986816406, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.024691358024691357, "grad_norm": 9.211364593756338, "kl": NaN, "learning_rate": 7.142857142857142e-08, "loss": -0.5506, "num_tokens": 189406.0, "reward": 0.006642765365540981, "reward_std": 0.013285531662404537, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00043640637886710465, "rewards/logprob_reward/std": 0.0014902930706739426, "step": 8 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 463.84375, "completions/mean_terminated_length": 463.84375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.027777777777777776, "grad_norm": 4.750598646530187, "kl": NaN, "learning_rate": 8.163265306122448e-08, "loss": -0.1871, "num_tokens": 210493.0, "reward": 0.0031913958955556154, "reward_std": 0.0063827913254499435, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.377310248557478e-05, "rewards/logprob_reward/std": 0.00041732366662472486, "step": 9 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 551.4375, "completions/mean_terminated_length": 551.4375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.030864197530864196, "grad_norm": 0.0, "kl": NaN, "learning_rate": 9.183673469387755e-08, "loss": 0.0, "num_tokens": 234415.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 512.84375, "completions/mean_terminated_length": 512.84375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.033950617283950615, "grad_norm": 5.585638064770051, "kl": NaN, "learning_rate": 1.0204081632653061e-07, "loss": -0.1871, "num_tokens": 257178.0, "reward": 0.003199605271220207, "reward_std": 0.006399210542440414, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 8.289470861200243e-05, "rewards/logprob_reward/std": 0.0004689233028329909, "step": 11 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 503.375, "completions/mean_terminated_length": 468.66668701171875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.037037037037037035, "grad_norm": 0.0008072613180261669, "kl": NaN, "learning_rate": 1.1224489795918366e-07, "loss": 0.0, "num_tokens": 279898.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 570.625, "completions/mean_terminated_length": 556.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.040123456790123455, "grad_norm": 0.00046137066571460666, "kl": NaN, "learning_rate": 1.2244897959183673e-07, "loss": 0.0, "num_tokens": 304566.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 598.75, "completions/mean_terminated_length": 585.0322265625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.043209876543209874, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.326530612244898e-07, "loss": 0.0, "num_tokens": 330450.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 565.15625, "completions/mean_terminated_length": 534.5667114257812, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.046296296296296294, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "num_tokens": 355123.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 542.25, "completions/mean_terminated_length": 526.7096557617188, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.04938271604938271, "grad_norm": 4.36675829873075, "kl": NaN, "learning_rate": 1.5306122448979592e-07, "loss": -0.275, "num_tokens": 378955.0, "reward": 0.003193480661138892, "reward_std": 0.006386961322277784, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.608939631609246e-05, "rewards/logprob_reward/std": 0.0003962118935305625, "step": 16 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 553.59375, "completions/mean_terminated_length": 522.2333374023438, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.05246913580246913, "grad_norm": 4.919935265122631, "kl": NaN, "learning_rate": 1.6326530612244896e-07, "loss": -0.1871, "num_tokens": 403382.0, "reward": 0.0031280089169740677, "reward_std": 0.006256017833948135, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 3.3431635984015884e-06, "rewards/logprob_reward/std": 1.8911789084086195e-05, "step": 17 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 594.3125, "completions/mean_terminated_length": 549.862060546875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.05555555555555555, "grad_norm": 7.433026661740928, "kl": NaN, "learning_rate": 1.7346938775510203e-07, "loss": -0.1871, "num_tokens": 428960.0, "reward": 0.0031758369877934456, "reward_std": 0.006351673975586891, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 5.648556907544844e-05, "rewards/logprob_reward/std": 0.0003195306344423443, "step": 18 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 523.5, "completions/mean_terminated_length": 490.13336181640625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.05864197530864197, "grad_norm": 4.74937187483194, "kl": NaN, "learning_rate": 1.836734693877551e-07, "loss": -0.1871, "num_tokens": 452024.0, "reward": 0.003166038077324629, "reward_std": 0.006332076154649258, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 4.559782973956317e-05, "rewards/logprob_reward/std": 0.00025794029352255166, "step": 19 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 495.86669921875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.06172839506172839, "grad_norm": 0.0, "kl": NaN, "learning_rate": 1.9387755102040814e-07, "loss": 0.0, "num_tokens": 475340.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 553.65625, "completions/mean_terminated_length": 522.300048828125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.06481481481481481, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.0408163265306121e-07, "loss": 0.0, "num_tokens": 499329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 511.375, "completions/mean_terminated_length": 494.83868408203125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06790123456790123, "grad_norm": 5.039991769823507, "kl": NaN, "learning_rate": 2.1428571428571426e-07, "loss": -0.1871, "num_tokens": 521813.0, "reward": 0.003222013358026743, "reward_std": 0.006444026716053486, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00010779264266602695, "rewards/logprob_reward/std": 0.0006097672739997506, "step": 22 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 515.0, "completions/mean_terminated_length": 515.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.07098765432098765, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.2448979591836733e-07, "loss": 0.0, "num_tokens": 545069.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 494.125, "completions/mean_terminated_length": 477.0322570800781, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.07407407407407407, "grad_norm": 10.389645085374694, "kl": NaN, "learning_rate": 2.346938775510204e-07, "loss": -0.3743, "num_tokens": 567493.0, "reward": 0.006342649459838867, "reward_std": 0.012685298919677734, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00010294403182342649, "rewards/logprob_reward/std": 0.0004805905919056386, "step": 24 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 584.75, "completions/mean_terminated_length": 483.3846435546875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.07716049382716049, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.4489795918367347e-07, "loss": 0.0, "num_tokens": 593321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 537.15625, "completions/mean_terminated_length": 521.4515991210938, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.08024691358024691, "grad_norm": 2.673643403308418, "kl": NaN, "learning_rate": 2.551020408163265e-07, "loss": -0.0624, "num_tokens": 617334.0, "reward": 0.00964600034058094, "reward_std": 0.006430668756365776, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00030111195519566536, "rewards/logprob_reward/std": 0.0009512502583675086, "step": 26 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 496.03125, "completions/mean_terminated_length": 479.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.08333333333333333, "grad_norm": 5.735671573618509, "kl": NaN, "learning_rate": 2.653061224489796e-07, "loss": -0.1871, "num_tokens": 639619.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 27 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 537.75, "completions/mean_terminated_length": 522.0645141601562, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.08641975308641975, "grad_norm": 9.310152593066146, "kl": NaN, "learning_rate": 2.755102040816326e-07, "loss": -0.3743, "num_tokens": 663439.0, "reward": 0.006380573846399784, "reward_std": 0.012761147692799568, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00014508160529658198, "rewards/logprob_reward/std": 0.0007550088339485228, "step": 28 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 494.53125, "completions/mean_terminated_length": 477.45159912109375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.08950617283950617, "grad_norm": 4.434418341643034, "kl": NaN, "learning_rate": 2.857142857142857e-07, "loss": -0.1871, "num_tokens": 685388.0, "reward": 0.003202674211934209, "reward_std": 0.006405348423868418, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 8.63045861478895e-05, "rewards/logprob_reward/std": 0.00048821247764863074, "step": 29 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 608.9375, "completions/mean_terminated_length": 595.54833984375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.09259259259259259, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.9591836734693874e-07, "loss": 0.0, "num_tokens": 711710.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 531.59375, "completions/mean_terminated_length": 531.59375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.09567901234567901, "grad_norm": 4.5768042220847605, "kl": NaN, "learning_rate": 3.0612244897959183e-07, "loss": -0.1871, "num_tokens": 735193.0, "reward": 0.0031982893124222755, "reward_std": 0.006396578624844551, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 8.14325176179409e-05, "rewards/logprob_reward/std": 0.00046065187780186534, "step": 31 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 492.46875, "completions/mean_terminated_length": 492.46875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09876543209876543, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.163265306122449e-07, "loss": 0.0, "num_tokens": 757452.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 552.40625, "completions/mean_terminated_length": 520.9666748046875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10185185185185185, "grad_norm": 5.73460587413631, "kl": NaN, "learning_rate": 3.265306122448979e-07, "loss": -0.1871, "num_tokens": 781945.0, "reward": 0.0032463932875543833, "reward_std": 0.0064927865751087666, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00013488148397300392, "rewards/logprob_reward/std": 0.0007630048785358667, "step": 33 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 501.4375, "completions/mean_terminated_length": 484.58062744140625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.10493827160493827, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.3673469387755096e-07, "loss": 0.0, "num_tokens": 804735.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 514.40625, "completions/mean_terminated_length": 514.40625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.10802469135802469, "grad_norm": 6.87957308311849, "kl": NaN, "learning_rate": 3.4693877551020406e-07, "loss": -0.3743, "num_tokens": 827436.0, "reward": 0.006365813780575991, "reward_std": 0.012731627561151981, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00012868200428783894, "rewards/logprob_reward/std": 0.0005134922685101628, "step": 35 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 638.78125, "completions/mean_terminated_length": 530.9199829101562, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.1111111111111111, "grad_norm": 5.874910559307383, "kl": NaN, "learning_rate": 3.5714285714285716e-07, "loss": -0.347, "num_tokens": 854661.0, "reward": 0.0031954434234648943, "reward_std": 0.006390886846929789, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.827053195796907e-05, "rewards/logprob_reward/std": 0.0003081193717662245, "step": 36 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 532.90625, "completions/mean_terminated_length": 500.16668701171875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.11419753086419752, "grad_norm": 7.681843992448197, "kl": NaN, "learning_rate": 3.673469387755102e-07, "loss": -0.3743, "num_tokens": 878670.0, "reward": 0.006405107211321592, "reward_std": 0.012810214422643185, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0001723413442960009, "rewards/logprob_reward/std": 0.0007948004058562219, "step": 37 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 519.6875, "completions/mean_terminated_length": 503.4193420410156, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.11728395061728394, "grad_norm": 1.9184316592124007, "kl": NaN, "learning_rate": 3.7755102040816324e-07, "loss": -0.0545, "num_tokens": 901708.0, "reward": 0.0031901041511446238, "reward_std": 0.006277103908360004, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.233808719320223e-05, "rewards/logprob_reward/std": 0.0002902286360040307, "step": 38 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 615.34375, "completions/mean_terminated_length": 573.0689697265625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.12037037037037036, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.877551020408163e-07, "loss": 0.0, "num_tokens": 928267.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 578.4375, "completions/mean_terminated_length": 495.9259338378906, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.12345679012345678, "grad_norm": 0.00354366427333239, "kl": NaN, "learning_rate": 3.979591836734694e-07, "loss": 0.0, "num_tokens": 953173.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 525.5625, "completions/mean_terminated_length": 474.0, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.12654320987654322, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.0816326530612243e-07, "loss": 0.0, "num_tokens": 976395.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 41 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 561.1875, "completions/mean_terminated_length": 530.3333740234375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.12962962962962962, "grad_norm": 5.452052063608532, "kl": NaN, "learning_rate": 4.183673469387755e-07, "loss": -0.3547, "num_tokens": 1001085.0, "reward": 0.0031829094514250755, "reward_std": 0.006365818902850151, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 6.434385431930423e-05, "rewards/logprob_reward/std": 0.0003311107575427741, "step": 42 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 524.4375, "completions/mean_terminated_length": 453.0714416503906, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.13271604938271606, "grad_norm": 8.882324796707932, "kl": NaN, "learning_rate": 4.285714285714285e-07, "loss": -0.3549, "num_tokens": 1024631.0, "reward": 0.003180807689204812, "reward_std": 0.006361615378409624, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 6.200830830493942e-05, "rewards/logprob_reward/std": 0.0003339833638165146, "step": 43 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 509.21875, "completions/mean_terminated_length": 509.21875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.13580246913580246, "grad_norm": 6.3561378563385595, "kl": NaN, "learning_rate": 4.387755102040816e-07, "loss": -0.3568, "num_tokens": 1047458.0, "reward": 0.018970444798469543, "reward_std": 0.019943157210946083, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.00024493783712387085, "rewards/logprob_reward/std": 0.000614454853348434, "step": 44 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 450.90625, "completions/mean_terminated_length": 450.90625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1388888888888889, "grad_norm": 6.723842234335574, "kl": NaN, "learning_rate": 4.4897959183673465e-07, "loss": -0.3743, "num_tokens": 1067883.0, "reward": 0.006346043664962053, "reward_std": 0.012692087329924107, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00010671508789528161, "rewards/logprob_reward/std": 0.0005800679209642112, "step": 45 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 411.90625, "completions/mean_terminated_length": 411.90625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1419753086419753, "grad_norm": 2.3938271409190297, "kl": NaN, "learning_rate": 4.5918367346938775e-07, "loss": -0.1081, "num_tokens": 1087052.0, "reward": 0.006382983643561602, "reward_std": 0.007370436564087868, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00014775953604839742, "rewards/logprob_reward/std": 0.0005816027987748384, "step": 46 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 539.53125, "completions/mean_terminated_length": 489.4137878417969, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.14506172839506173, "grad_norm": 8.741826559168924, "kl": NaN, "learning_rate": 4.693877551020408e-07, "loss": -0.5614, "num_tokens": 1110633.0, "reward": 0.009479801170527935, "reward_std": 0.01895960234105587, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00011644529149634764, "rewards/logprob_reward/std": 0.00039107364136725664, "step": 47 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 479.3125, "completions/mean_terminated_length": 479.3125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.14814814814814814, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.795918367346938e-07, "loss": 0.0, "num_tokens": 1132355.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 535.4375, "completions/mean_terminated_length": 535.4375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.15123456790123457, "grad_norm": 6.8910251613189395, "kl": NaN, "learning_rate": 4.897959183673469e-07, "loss": -0.3743, "num_tokens": 1156073.0, "reward": 0.0063034119084477425, "reward_std": 0.012606823816895485, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 5.934640648774803e-05, "rewards/logprob_reward/std": 0.00023372126452159137, "step": 49 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 470.75, "completions/mean_terminated_length": 433.86669921875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.15432098765432098, "grad_norm": 10.199205303249721, "kl": NaN, "learning_rate": 5e-07, "loss": -0.7485, "num_tokens": 1177205.0, "reward": 0.012633386999368668, "reward_std": 0.025266773998737335, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0001482078223489225, "rewards/logprob_reward/std": 0.00046398723497986794, "step": 50 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 526.71875, "completions/mean_terminated_length": 493.5666809082031, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.1574074074074074, "grad_norm": 22.069679942677325, "kl": NaN, "learning_rate": 4.999995001298037e-07, "loss": -0.4714, "num_tokens": 1200696.0, "reward": 0.009701208211481571, "reward_std": 0.014054418541491032, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00036245345836505294, "rewards/logprob_reward/std": 0.001145550631918013, "step": 51 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 528.84375, "completions/mean_terminated_length": 477.6206970214844, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.16049382716049382, "grad_norm": 14.415036849323204, "kl": NaN, "learning_rate": 4.99998000521214e-07, "loss": -0.6278, "num_tokens": 1224303.0, "reward": 0.012864849530160427, "reward_std": 0.025729699060320854, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.00040538786561228335, "rewards/logprob_reward/std": 0.00129931780975312, "step": 52 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 547.71875, "completions/mean_terminated_length": 479.6785888671875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.16358024691358025, "grad_norm": 5.236887534241916, "kl": NaN, "learning_rate": 4.999955011802275e-07, "loss": -0.2955, "num_tokens": 1248478.0, "reward": 0.010162696242332458, "reward_std": 0.014412381686270237, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0008752185967750847, "rewards/logprob_reward/std": 0.003261502366513014, "step": 53 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 497.0625, "completions/mean_terminated_length": 480.06451416015625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.16666666666666666, "grad_norm": 8.421637357342595, "kl": NaN, "learning_rate": 4.999920021168393e-07, "loss": -0.466, "num_tokens": 1271212.0, "reward": 0.009957539848983288, "reward_std": 0.019376683980226517, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0006472665118053555, "rewards/logprob_reward/std": 0.002017608843743801, "step": 54 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 475.65625, "completions/mean_terminated_length": 457.9677429199219, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1697530864197531, "grad_norm": 14.131655360271075, "kl": NaN, "learning_rate": 4.999875033450417e-07, "loss": -0.5614, "num_tokens": 1292665.0, "reward": 0.009479843080043793, "reward_std": 0.018959686160087585, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00011649253428913653, "rewards/logprob_reward/std": 0.0005409479490481317, "step": 55 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 504.25, "completions/mean_terminated_length": 469.60003662109375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.1728395061728395, "grad_norm": 9.585090906952301, "kl": NaN, "learning_rate": 4.999820048828253e-07, "loss": -0.7972, "num_tokens": 1315305.0, "reward": 0.016101833432912827, "reward_std": 0.031882334500551224, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0005298148025758564, "rewards/logprob_reward/std": 0.0013958527706563473, "step": 56 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 515.96875, "completions/mean_terminated_length": 499.58062744140625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.17592592592592593, "grad_norm": 13.522680321834711, "kl": NaN, "learning_rate": 4.999755067521781e-07, "loss": -0.7486, "num_tokens": 1338224.0, "reward": 0.013169733807444572, "reward_std": 0.026339467614889145, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0007441485649906099, "rewards/logprob_reward/std": 0.0030354137998074293, "step": 57 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 474.125, "completions/mean_terminated_length": 456.3870849609375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.17901234567901234, "grad_norm": 8.343067177520528, "kl": NaN, "learning_rate": 4.999680089790861e-07, "loss": -0.7161, "num_tokens": 1359760.0, "reward": 0.02902640402317047, "reward_std": 0.03867201507091522, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0010015605948865414, "rewards/logprob_reward/std": 0.002976895309984684, "step": 58 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 565.59375, "completions/mean_terminated_length": 500.107177734375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.18209876543209877, "grad_norm": 3.4895631441555404, "kl": NaN, "learning_rate": 4.999595115935325e-07, "loss": -0.2656, "num_tokens": 1384755.0, "reward": 0.02569853514432907, "reward_std": 0.012860770337283611, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0007761501474305987, "rewards/logprob_reward/std": 0.0013694728258997202, "step": 59 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 463.71875, "completions/mean_terminated_length": 463.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.18518518518518517, "grad_norm": 5.676083938080832, "kl": NaN, "learning_rate": 4.999500146294979e-07, "loss": -0.539, "num_tokens": 1406030.0, "reward": 0.03294990956783295, "reward_std": 0.033797409385442734, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0018887876067310572, "rewards/logprob_reward/std": 0.004393962677568197, "step": 60 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 543.5625, "completions/mean_terminated_length": 454.59259033203125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.1882716049382716, "grad_norm": 8.804184607283121, "kl": NaN, "learning_rate": 4.999395181249604e-07, "loss": -0.635, "num_tokens": 1429828.0, "reward": 0.0346556231379509, "reward_std": 0.03339875489473343, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.000311802898067981, "rewards/logprob_reward/std": 0.0005382333183661103, "step": 61 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 478.71875, "completions/mean_terminated_length": 442.36669921875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.19135802469135801, "grad_norm": 5.5555862620704595, "kl": NaN, "learning_rate": 4.99928022121895e-07, "loss": -0.4251, "num_tokens": 1451363.0, "reward": 0.02836659923195839, "reward_std": 0.03340596705675125, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.00026844331296160817, "rewards/logprob_reward/std": 0.0006385487504303455, "step": 62 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 451.0, "completions/mean_terminated_length": 451.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19444444444444445, "grad_norm": 9.370034003411387, "kl": NaN, "learning_rate": 4.99915526666274e-07, "loss": -0.948, "num_tokens": 1471991.0, "reward": 0.037867240607738495, "reward_std": 0.04707539454102516, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0004080414946656674, "rewards/logprob_reward/std": 0.0006741120596416295, "step": 63 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 584.40625, "completions/mean_terminated_length": 521.607177734375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.19753086419753085, "grad_norm": 12.851748881128868, "kl": NaN, "learning_rate": 4.999020318080661e-07, "loss": -0.9943, "num_tokens": 1497096.0, "reward": 0.02561819739639759, "reward_std": 0.045522771775722504, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0006868854979984462, "rewards/logprob_reward/std": 0.0014965421287342906, "step": 64 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 463.59375, "completions/mean_terminated_length": 445.51611328125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2006172839506173, "grad_norm": 7.583142573219291, "kl": NaN, "learning_rate": 4.998875376012368e-07, "loss": -0.6527, "num_tokens": 1518103.0, "reward": 0.034932032227516174, "reward_std": 0.03384808078408241, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0006189263658598065, "rewards/logprob_reward/std": 0.0014778337208554149, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 474.03125, "completions/mean_terminated_length": 437.36669921875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2037037037037037, "grad_norm": 11.646201626453006, "kl": 0.01688385009765625, "learning_rate": 4.998720441037479e-07, "loss": -0.9368, "num_tokens": 1539704.0, "reward": 0.03542628139257431, "reward_std": 0.041238218545913696, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011680902680382133, "rewards/logprob_reward/std": 0.0017735165311023593, "step": 66 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 451.3125, "completions/mean_terminated_length": 413.13336181640625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.20679012345679013, "grad_norm": 5.5349097190879695, "kl": NaN, "learning_rate": 4.99855551377557e-07, "loss": -0.5997, "num_tokens": 1560386.0, "reward": 0.04110191762447357, "reward_std": 0.047965094447135925, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0005299084004946053, "rewards/logprob_reward/std": 0.0007469018455594778, "step": 67 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 508.5625, "completions/mean_terminated_length": 434.9285888671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.20987654320987653, "grad_norm": 6.043545392642072, "kl": NaN, "learning_rate": 4.998380594886182e-07, "loss": -0.4584, "num_tokens": 1583228.0, "reward": 0.0408533550798893, "reward_std": 0.02602376975119114, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0002537275431677699, "rewards/logprob_reward/std": 0.0005096375825814903, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 537.4375, "completions/mean_terminated_length": 467.9285888671875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.21296296296296297, "grad_norm": 9.025643600318977, "kl": 0.01786041259765625, "learning_rate": 4.998195685068808e-07, "loss": -0.836, "num_tokens": 1606838.0, "reward": 0.03815925866365433, "reward_std": 0.05255364254117012, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007325104670599103, "rewards/logprob_reward/std": 0.001503912266343832, "step": 69 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 467.65625, "completions/mean_terminated_length": 430.5666809082031, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.21604938271604937, "grad_norm": 15.21712081526465, "kl": NaN, "learning_rate": 4.998000785062895e-07, "loss": -0.9813, "num_tokens": 1628447.0, "reward": 0.038099028170108795, "reward_std": 0.04530385509133339, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0006655875477008522, "rewards/logprob_reward/std": 0.001207419903948903, "step": 70 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 434.875, "completions/mean_terminated_length": 395.6000061035156, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2191358024691358, "grad_norm": 6.228276737398428, "kl": NaN, "learning_rate": 4.997795895647841e-07, "loss": -0.4687, "num_tokens": 1649051.0, "reward": 0.03813503682613373, "reward_std": 0.03481914848089218, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007055974565446377, "rewards/logprob_reward/std": 0.0011948143364861608, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 447.21875, "completions/mean_terminated_length": 447.21875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2222222222222222, "grad_norm": 6.123223944953748, "kl": 0.01909637451171875, "learning_rate": 4.997581017642991e-07, "loss": -0.536, "num_tokens": 1669838.0, "reward": 0.06365455687046051, "reward_std": 0.04029175266623497, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012828335165977478, "rewards/logprob_reward/std": 0.0024304192047566175, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 505.0, "completions/mean_terminated_length": 488.258056640625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.22530864197530864, "grad_norm": 4.045335637387813, "kl": 0.0219268798828125, "learning_rate": 4.997356151907633e-07, "loss": -0.3864, "num_tokens": 1692502.0, "reward": 0.05487312376499176, "reward_std": 0.04867660999298096, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019423530902713537, "rewards/logprob_reward/std": 0.0025331033393740654, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 557.15625, "completions/mean_terminated_length": 508.862060546875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.22839506172839505, "grad_norm": 6.256315480758599, "kl": 0.0246429443359375, "learning_rate": 4.997121299340997e-07, "loss": -0.6425, "num_tokens": 1716799.0, "reward": 0.054621484130620956, "reward_std": 0.05373973399400711, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0016627591103315353, "rewards/logprob_reward/std": 0.003260746132582426, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 469.5625, "completions/mean_terminated_length": 451.6773986816406, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.23148148148148148, "grad_norm": 5.3090812476367875, "kl": 0.02172088623046875, "learning_rate": 4.99687646088225e-07, "loss": -0.644, "num_tokens": 1737913.0, "reward": 0.05796758458018303, "reward_std": 0.05293092876672745, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0019084264058619738, "rewards/logprob_reward/std": 0.0030874432995915413, "step": 75 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 506.3125, "completions/mean_terminated_length": 506.3125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.2345679012345679, "grad_norm": 3.09483623935045, "kl": NaN, "learning_rate": 4.996621637510491e-07, "loss": -0.2407, "num_tokens": 1760615.0, "reward": 0.0681268572807312, "reward_std": 0.0321149118244648, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0027798402588814497, "rewards/logprob_reward/std": 0.0033658831380307674, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 479.40625, "completions/mean_terminated_length": 479.40625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.23765432098765432, "grad_norm": 6.022043154609086, "kl": 0.0325927734375, "learning_rate": 4.996356830244749e-07, "loss": -0.3437, "num_tokens": 1782048.0, "reward": 0.06664453446865082, "reward_std": 0.03348292410373688, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011328145628795028, "rewards/logprob_reward/std": 0.0022796066477894783, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 444.90625, "completions/mean_terminated_length": 406.3000183105469, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.24074074074074073, "grad_norm": 10.690746280967904, "kl": 0.029754638671875, "learning_rate": 4.996082040143977e-07, "loss": -0.4602, "num_tokens": 1802945.0, "reward": 0.07090184837579727, "reward_std": 0.03920045495033264, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0023909390438348055, "rewards/logprob_reward/std": 0.0038968671578913927, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 526.4375, "completions/mean_terminated_length": 474.96551513671875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.24382716049382716, "grad_norm": 5.942169315397655, "kl": 0.02519989013671875, "learning_rate": 4.995797268307051e-07, "loss": -0.3951, "num_tokens": 1826151.0, "reward": 0.04450929909944534, "reward_std": 0.047439657151699066, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008436637581326067, "rewards/logprob_reward/std": 0.0016849666135385633, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 609.8125, "completions/mean_terminated_length": 471.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.24691358024691357, "grad_norm": 7.553668248560938, "kl": 0.0283966064453125, "learning_rate": 4.995502515872763e-07, "loss": -0.5494, "num_tokens": 1852245.0, "reward": 0.04472878575325012, "reward_std": 0.050996411591768265, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0010875379666686058, "rewards/logprob_reward/std": 0.0020124169532209635, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 526.0625, "completions/mean_terminated_length": 510.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.25, "grad_norm": 6.008576594263075, "kl": 0.026123046875, "learning_rate": 4.995197784019818e-07, "loss": -0.2975, "num_tokens": 1875791.0, "reward": 0.060693349689245224, "reward_std": 0.04175513982772827, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0014648307114839554, "rewards/logprob_reward/std": 0.002707106526941061, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 541.75, "completions/mean_terminated_length": 509.60003662109375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.25308641975308643, "grad_norm": 5.849176959121505, "kl": 0.028839111328125, "learning_rate": 4.994883073966823e-07, "loss": -0.3656, "num_tokens": 1899519.0, "reward": 0.060538940131664276, "reward_std": 0.04685738682746887, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0012932643294334412, "rewards/logprob_reward/std": 0.0021190636325627565, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 468.125, "completions/mean_terminated_length": 450.19354248046875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.25617283950617287, "grad_norm": 4.10962518518028, "kl": 0.0347137451171875, "learning_rate": 4.994558386972295e-07, "loss": -0.1882, "num_tokens": 1920863.0, "reward": 0.05730611830949783, "reward_std": 0.04541774466633797, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011734651634469628, "rewards/logprob_reward/std": 0.002099178498610854, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 647.03125, "completions/mean_terminated_length": 608.0344848632812, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.25925925925925924, "grad_norm": 3.286319365867372, "kl": 0.02838897705078125, "learning_rate": 4.994223724334643e-07, "loss": -0.2404, "num_tokens": 1948432.0, "reward": 0.05551876127719879, "reward_std": 0.05297388881444931, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002659732010215521, "rewards/logprob_reward/std": 0.004493256099522114, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 543.53125, "completions/mean_terminated_length": 493.82757568359375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.2623456790123457, "grad_norm": 4.224829549134744, "kl": 0.0585479736328125, "learning_rate": 4.99387908739217e-07, "loss": -0.2441, "num_tokens": 1972285.0, "reward": 0.053791362792253494, "reward_std": 0.04617141932249069, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007404016796499491, "rewards/logprob_reward/std": 0.0013021408813074231, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 593.53125, "completions/mean_terminated_length": 513.8148193359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2654320987654321, "grad_norm": 3.7137649235386068, "kl": 0.063995361328125, "learning_rate": 4.993524477523067e-07, "loss": -0.1313, "num_tokens": 1998098.0, "reward": 0.06119167059659958, "reward_std": 0.054584987461566925, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002018526429310441, "rewards/logprob_reward/std": 0.004277835600078106, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 485.96875, "completions/mean_terminated_length": 468.6128845214844, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.26851851851851855, "grad_norm": 3.694604424319856, "kl": 0.0387420654296875, "learning_rate": 4.993159896145405e-07, "loss": -0.2402, "num_tokens": 2020197.0, "reward": 0.054866090416908264, "reward_std": 0.05415533855557442, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019345462787896395, "rewards/logprob_reward/std": 0.0032406121026724577, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 482.78125, "completions/mean_terminated_length": 465.32257080078125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2716049382716049, "grad_norm": 3.774088917632137, "kl": 0.041046142578125, "learning_rate": 4.99278534471713e-07, "loss": -0.2648, "num_tokens": 2042238.0, "reward": 0.06346947699785233, "reward_std": 0.04732219874858856, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0010771907400339842, "rewards/logprob_reward/std": 0.0016593115869909525, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 542.5, "completions/mean_terminated_length": 510.4000244140625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.27469135802469136, "grad_norm": 3.347485424204535, "kl": 0.0376434326171875, "learning_rate": 4.992400824736059e-07, "loss": -0.3509, "num_tokens": 2065678.0, "reward": 0.05103076249361038, "reward_std": 0.047450192272663116, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0011452909093350172, "rewards/logprob_reward/std": 0.0024503832682967186, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 502.8000183105469, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2777777777777778, "grad_norm": 7.001336144386259, "kl": 0.0478057861328125, "learning_rate": 4.992006337739874e-07, "loss": -0.4782, "num_tokens": 2089774.0, "reward": 0.05893455818295479, "reward_std": 0.043769750744104385, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0029828420374542475, "rewards/logprob_reward/std": 0.004484000615775585, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 519.96875, "completions/mean_terminated_length": 486.36669921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2808641975308642, "grad_norm": 3.097845696323117, "kl": 0.0368804931640625, "learning_rate": 4.991601885306111e-07, "loss": -0.1096, "num_tokens": 2113281.0, "reward": 0.03542055934667587, "reward_std": 0.04836319386959076, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011617304990068078, "rewards/logprob_reward/std": 0.0018826224841177464, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 545.28125, "completions/mean_terminated_length": 513.36669921875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2839506172839506, "grad_norm": 3.7743253756780084, "kl": 0.050323486328125, "learning_rate": 4.991187469052162e-07, "loss": -0.1952, "num_tokens": 2137282.0, "reward": 0.05702035129070282, "reward_std": 0.05189286172389984, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008559462148696184, "rewards/logprob_reward/std": 0.0017882605316117406, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 501.53125, "completions/mean_terminated_length": 447.4827575683594, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.28703703703703703, "grad_norm": 3.8907350246801036, "kl": 0.04400634765625, "learning_rate": 4.99076309063526e-07, "loss": -0.1082, "num_tokens": 2159795.0, "reward": 0.051053136587142944, "reward_std": 0.045405298471450806, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.00117015209980309, "rewards/logprob_reward/std": 0.002091635251417756, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 652.09375, "completions/mean_terminated_length": 583.2222290039062, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.29012345679012347, "grad_norm": 2.293708714001795, "kl": 0.0368194580078125, "learning_rate": 4.99032875175248e-07, "loss": 0.1117, "num_tokens": 2186946.0, "reward": 0.053791485726833344, "reward_std": 0.04802599549293518, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007405409123748541, "rewards/logprob_reward/std": 0.0019137536874040961, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 574.90625, "completions/mean_terminated_length": 510.7500305175781, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2932098765432099, "grad_norm": 2.951710553313073, "kl": 0.049102783203125, "learning_rate": 4.989884454140724e-07, "loss": -0.0989, "num_tokens": 2211843.0, "reward": 0.05078906565904617, "reward_std": 0.047226108610630035, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0008767400868237019, "rewards/logprob_reward/std": 0.0018862105207517743, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 570.21875, "completions/mean_terminated_length": 505.39288330078125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2962962962962963, "grad_norm": 2.706009833282272, "kl": 0.04150390625, "learning_rate": 4.989430199576722e-07, "loss": -0.008, "num_tokens": 2236422.0, "reward": 0.045017652213573456, "reward_std": 0.04995456710457802, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0014085028087720275, "rewards/logprob_reward/std": 0.003322416450828314, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 543.375, "completions/mean_terminated_length": 511.3333740234375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.2993827160493827, "grad_norm": 2.7390931663000604, "kl": 0.0435791015625, "learning_rate": 4.988965989877022e-07, "loss": -0.0619, "num_tokens": 2260418.0, "reward": 0.05442715808749199, "reward_std": 0.04363364353775978, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0014468419831246138, "rewards/logprob_reward/std": 0.002777144545689225, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 565.0625, "completions/mean_terminated_length": 534.4666748046875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.30246913580246915, "grad_norm": 2.6940412752911413, "kl": 0.0467376708984375, "learning_rate": 4.988491826897978e-07, "loss": 0.0639, "num_tokens": 2285240.0, "reward": 0.06444443762302399, "reward_std": 0.0413370281457901, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0021604890935122967, "rewards/logprob_reward/std": 0.004000400193035603, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 614.59375, "completions/mean_terminated_length": 556.107177734375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3055555555555556, "grad_norm": 2.9928673723537904, "kl": 0.047119140625, "learning_rate": 4.988007712535752e-07, "loss": -0.0496, "num_tokens": 2310995.0, "reward": 0.05372615158557892, "reward_std": 0.0532066747546196, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0006679468788206577, "rewards/logprob_reward/std": 0.0012105830246582627, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 627.28125, "completions/mean_terminated_length": 516.2000122070312, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.30864197530864196, "grad_norm": 2.5027149167555587, "kl": 0.047332763671875, "learning_rate": 4.987513648726298e-07, "loss": -0.0632, "num_tokens": 2337436.0, "reward": 0.0507701113820076, "reward_std": 0.04203411191701889, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0008556764805689454, "rewards/logprob_reward/std": 0.0016093184240162373, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 607.34375, "completions/mean_terminated_length": 579.5667114257812, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.3117283950617284, "grad_norm": 3.139962304140252, "kl": 0.052276611328125, "learning_rate": 4.987009637445358e-07, "loss": 0.0031, "num_tokens": 2363355.0, "reward": 0.04761410504579544, "reward_std": 0.05293666571378708, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0008212258107960224, "rewards/logprob_reward/std": 0.0012592091225087643, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 527.75, "completions/mean_terminated_length": 494.66668701171875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3148148148148148, "grad_norm": 2.883736141072442, "kl": 0.0460968017578125, "learning_rate": 4.986495680708453e-07, "loss": -0.0103, "num_tokens": 2386411.0, "reward": 0.06110449135303497, "reward_std": 0.05544377118349075, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0019216546788811684, "rewards/logprob_reward/std": 0.0034797852858901024, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 624.4375, "completions/mean_terminated_length": 512.5599975585938, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.31790123456790126, "grad_norm": 2.7624234295814936, "kl": 0.046630859375, "learning_rate": 4.985971780570878e-07, "loss": -0.047, "num_tokens": 2412781.0, "reward": 0.04408486187458038, "reward_std": 0.039759885519742966, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0003720703534781933, "rewards/logprob_reward/std": 0.0007752202218398452, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 590.0, "completions/mean_terminated_length": 545.1034545898438, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.32098765432098764, "grad_norm": 3.1068981475336437, "kl": 0.051055908203125, "learning_rate": 4.985437939127687e-07, "loss": -0.0894, "num_tokens": 2438325.0, "reward": 0.05655660480260849, "reward_std": 0.046917624771595, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00034067226806655526, "rewards/logprob_reward/std": 0.000890880124643445, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 570.1875, "completions/mean_terminated_length": 505.357177734375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.32407407407407407, "grad_norm": 2.4669522103941977, "kl": 0.0426177978515625, "learning_rate": 4.984894158513696e-07, "loss": -0.2237, "num_tokens": 2463147.0, "reward": 0.051290228962898254, "reward_std": 0.053184136748313904, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0014335874002426863, "rewards/logprob_reward/std": 0.003448633011430502, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 614.28125, "completions/mean_terminated_length": 571.8965454101562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3271604938271605, "grad_norm": 3.6235426676332723, "kl": 0.0612030029296875, "learning_rate": 4.984340440903456e-07, "loss": -0.1257, "num_tokens": 2489568.0, "reward": 0.05727202072739601, "reward_std": 0.0427728109061718, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011355748865753412, "rewards/logprob_reward/std": 0.002726617967709899, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 545.75, "completions/mean_terminated_length": 496.2758483886719, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.33024691358024694, "grad_norm": 3.8062614866717412, "kl": 0.04974365234375, "learning_rate": 4.983776788511268e-07, "loss": -0.161, "num_tokens": 2513680.0, "reward": 0.04481091350317001, "reward_std": 0.040245793759822845, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011787915136665106, "rewards/logprob_reward/std": 0.0018818581011146307, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 570.625, "completions/mean_terminated_length": 505.857177734375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3333333333333333, "grad_norm": 3.061970292466625, "kl": 0.062469482421875, "learning_rate": 4.983203203591154e-07, "loss": -0.1396, "num_tokens": 2538524.0, "reward": 0.053908370435237885, "reward_std": 0.055700093507766724, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0008704091887921095, "rewards/logprob_reward/std": 0.003946096636354923, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 494.8125, "completions/mean_terminated_length": 494.8125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.33641975308641975, "grad_norm": 2.7202993772031805, "kl": 0.0513916015625, "learning_rate": 4.982619688436859e-07, "loss": -0.0298, "num_tokens": 2560690.0, "reward": 0.06022677570581436, "reward_std": 0.04589129984378815, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0009464181493967772, "rewards/logprob_reward/std": 0.0018110605888068676, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 542.15625, "completions/mean_terminated_length": 526.6129150390625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.3395061728395062, "grad_norm": 3.346190718258864, "kl": 0.066192626953125, "learning_rate": 4.982026245381837e-07, "loss": -0.1211, "num_tokens": 2584415.0, "reward": 0.05337420850992203, "reward_std": 0.04603324830532074, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.000276896491413936, "rewards/logprob_reward/std": 0.0007598244701512158, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 609.75, "completions/mean_terminated_length": 533.0370483398438, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3425925925925926, "grad_norm": 2.379277616100787, "kl": 0.0584259033203125, "learning_rate": 4.981422876799244e-07, "loss": -0.0572, "num_tokens": 2610763.0, "reward": 0.04142041876912117, "reward_std": 0.04397819936275482, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0008838011417537928, "rewards/logprob_reward/std": 0.0019359972793608904, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 575.5, "completions/mean_terminated_length": 561.0322265625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.345679012345679, "grad_norm": 2.736137122162934, "kl": 0.0489501953125, "learning_rate": 4.980809585101927e-07, "loss": -0.0628, "num_tokens": 2635719.0, "reward": 0.061229631304740906, "reward_std": 0.027119195088744164, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0020606969483196735, "rewards/logprob_reward/std": 0.0034382117446511984, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 566.625, "completions/mean_terminated_length": 536.1333618164062, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3487654320987654, "grad_norm": 2.477840339001624, "kl": 0.054168701171875, "learning_rate": 4.980186372742417e-07, "loss": -0.0458, "num_tokens": 2659935.0, "reward": 0.06329086422920227, "reward_std": 0.04749561846256256, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.000878739170730114, "rewards/logprob_reward/std": 0.0020262636244297028, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 526.0, "completions/mean_terminated_length": 509.9354553222656, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.35185185185185186, "grad_norm": 3.5438811757063107, "kl": 0.09625244140625, "learning_rate": 4.979553242212917e-07, "loss": -0.1171, "num_tokens": 2683463.0, "reward": 0.05102846771478653, "reward_std": 0.04755294322967529, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0011427418794482946, "rewards/logprob_reward/std": 0.0026069299783557653, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 571.9375, "completions/mean_terminated_length": 541.800048828125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3549382716049383, "grad_norm": 2.5227547419462355, "kl": 0.049407958984375, "learning_rate": 4.978910196045291e-07, "loss": -0.1261, "num_tokens": 2708077.0, "reward": 0.06320597231388092, "reward_std": 0.04742685705423355, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007844074862077832, "rewards/logprob_reward/std": 0.002121969358995557, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 576.28125, "completions/mean_terminated_length": 546.433349609375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.35802469135802467, "grad_norm": 7.359540387286494, "kl": 0.068756103515625, "learning_rate": 4.978257236811055e-07, "loss": -0.3935, "num_tokens": 2733578.0, "reward": 0.05617111921310425, "reward_std": 0.04427188262343407, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003384580370038748, "rewards/logprob_reward/std": 0.01107009407132864, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 546.4375, "completions/mean_terminated_length": 546.4375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.3611111111111111, "grad_norm": 3.109979629121864, "kl": 0.0557861328125, "learning_rate": 4.977594367121369e-07, "loss": -0.1422, "num_tokens": 2757568.0, "reward": 0.05709470063447952, "reward_std": 0.0473439060151577, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009385535959154367, "rewards/logprob_reward/std": 0.0019288675393909216, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 586.5625, "completions/mean_terminated_length": 524.0714721679688, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.36419753086419754, "grad_norm": 3.2522664113011843, "kl": 0.05938720703125, "learning_rate": 4.976921589627021e-07, "loss": -0.1721, "num_tokens": 2782918.0, "reward": 0.052726443856954575, "reward_std": 0.0450054295361042, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.006501602008938789, "rewards/logprob_reward/std": 0.015674972906708717, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 548.25, "completions/mean_terminated_length": 499.03448486328125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.36728395061728397, "grad_norm": 3.455605600846889, "kl": 0.057098388671875, "learning_rate": 4.976238907018427e-07, "loss": -0.2375, "num_tokens": 2807090.0, "reward": 0.06296929717063904, "reward_std": 0.03989600017666817, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0005214466946199536, "rewards/logprob_reward/std": 0.0011562422150745988, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 592.9375, "completions/mean_terminated_length": 513.1111450195312, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.37037037037037035, "grad_norm": 2.5785227908166624, "kl": 0.0533447265625, "learning_rate": 4.975546322025605e-07, "loss": -0.084, "num_tokens": 2832220.0, "reward": 0.05106581747531891, "reward_std": 0.05249335616827011, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001184241147711873, "rewards/logprob_reward/std": 0.002151809399947524, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 609.90625, "completions/mean_terminated_length": 567.0689697265625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3734567901234568, "grad_norm": 4.374803977435889, "kl": 0.057281494140625, "learning_rate": 4.974843837418175e-07, "loss": -0.2325, "num_tokens": 2858537.0, "reward": 0.04909980297088623, "reward_std": 0.050683602690696716, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0024720027577131987, "rewards/logprob_reward/std": 0.0034602447412908077, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 545.53125, "completions/mean_terminated_length": 513.6333618164062, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.3765432098765432, "grad_norm": 2.8643841333726985, "kl": 0.059661865234375, "learning_rate": 4.974131456005349e-07, "loss": -0.2127, "num_tokens": 2882294.0, "reward": 0.060503534972667694, "reward_std": 0.05500777065753937, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0012539270101115108, "rewards/logprob_reward/std": 0.002895203186199069, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 536.9375, "completions/mean_terminated_length": 521.2257690429688, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.37962962962962965, "grad_norm": 2.8088641308214384, "kl": 0.0654296875, "learning_rate": 4.973409180635911e-07, "loss": -0.0703, "num_tokens": 2906556.0, "reward": 0.057338517159223557, "reward_std": 0.047599297016859055, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0012094636913388968, "rewards/logprob_reward/std": 0.002666698070243001, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 568.875, "completions/mean_terminated_length": 538.5333862304688, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.38271604938271603, "grad_norm": 2.5218944395967955, "kl": 0.062347412109375, "learning_rate": 4.972677014198213e-07, "loss": -0.1082, "num_tokens": 2931476.0, "reward": 0.06511932611465454, "reward_std": 0.04857144504785538, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0029103620909154415, "rewards/logprob_reward/std": 0.004752908833324909, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 597.65625, "completions/mean_terminated_length": 569.2333374023438, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.38580246913580246, "grad_norm": 2.6358729093991218, "kl": 0.05462646484375, "learning_rate": 4.97193495962016e-07, "loss": -0.0485, "num_tokens": 2956733.0, "reward": 0.05737147852778435, "reward_std": 0.04644714668393135, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0012460858561098576, "rewards/logprob_reward/std": 0.0020421771332621574, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 607.15625, "completions/mean_terminated_length": 579.36669921875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3888888888888889, "grad_norm": 3.516291466833583, "kl": 0.0527191162109375, "learning_rate": 4.971183019869201e-07, "loss": -0.2645, "num_tokens": 2982554.0, "reward": 0.04825485497713089, "reward_std": 0.04806240648031235, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001533173373900354, "rewards/logprob_reward/std": 0.002815558109432459, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 606.65625, "completions/mean_terminated_length": 547.0357666015625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.39197530864197533, "grad_norm": 2.712705374458109, "kl": 0.052276611328125, "learning_rate": 4.970421197952311e-07, "loss": 0.0621, "num_tokens": 3008063.0, "reward": 0.048473648726940155, "reward_std": 0.046610329300165176, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001776272663846612, "rewards/logprob_reward/std": 0.0030606482177972794, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 610.34375, "completions/mean_terminated_length": 582.7667236328125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.3950617283950617, "grad_norm": 2.536075529612535, "kl": 0.061492919921875, "learning_rate": 4.969649496915991e-07, "loss": 0.0236, "num_tokens": 3033902.0, "reward": 0.052005015313625336, "reward_std": 0.04411856085062027, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.00222779531031847, "rewards/logprob_reward/std": 0.005151921417564154, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 525.75, "completions/mean_terminated_length": 474.2069091796875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.39814814814814814, "grad_norm": 3.011332619467256, "kl": 0.074005126953125, "learning_rate": 4.96886791984624e-07, "loss": -0.3031, "num_tokens": 3057122.0, "reward": 0.05376160889863968, "reward_std": 0.053026266396045685, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007073450833559036, "rewards/logprob_reward/std": 0.0019466037629172206, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 490.75, "completions/mean_terminated_length": 455.20001220703125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4012345679012346, "grad_norm": 3.144798988669224, "kl": 0.072296142578125, "learning_rate": 4.968076469868558e-07, "loss": -0.2522, "num_tokens": 3079510.0, "reward": 0.05199963599443436, "reward_std": 0.04264131188392639, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0022218169178813696, "rewards/logprob_reward/std": 0.006449904292821884, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 519.71875, "completions/mean_terminated_length": 503.45159912109375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.404320987654321, "grad_norm": 2.779361540038843, "kl": 0.0804443359375, "learning_rate": 4.967275150147921e-07, "loss": -0.2118, "num_tokens": 3102905.0, "reward": 0.0702369213104248, "reward_std": 0.0397680439054966, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0016521394718438387, "rewards/logprob_reward/std": 0.0031119592022150755, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 623.5, "completions/mean_terminated_length": 596.800048828125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.4074074074074074, "grad_norm": 2.907337266435984, "kl": 0.25628662109375, "learning_rate": 4.966463963888775e-07, "loss": -0.0338, "num_tokens": 3129681.0, "reward": 0.07101751863956451, "reward_std": 0.02736024744808674, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0025194690097123384, "rewards/logprob_reward/std": 0.004677193239331245, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 572.78125, "completions/mean_terminated_length": 526.1034545898438, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4104938271604938, "grad_norm": 3.1986088549857206, "kl": 0.0638427734375, "learning_rate": 4.965642914335025e-07, "loss": -0.172, "num_tokens": 3154698.0, "reward": 0.04453890770673752, "reward_std": 0.04678146913647652, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008765653474256396, "rewards/logprob_reward/std": 0.0020337984897196293, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 573.28125, "completions/mean_terminated_length": 558.741943359375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.41358024691358025, "grad_norm": 2.7195080598710395, "kl": 0.071533203125, "learning_rate": 4.964812004770013e-07, "loss": -0.0662, "num_tokens": 3180315.0, "reward": 0.057729728519916534, "reward_std": 0.04083167761564255, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0016441469779238105, "rewards/logprob_reward/std": 0.003845022525638342, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 536.46875, "completions/mean_terminated_length": 503.9667053222656, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4166666666666667, "grad_norm": 3.7904174216466835, "kl": 0.079833984375, "learning_rate": 4.963971238516519e-07, "loss": -0.2033, "num_tokens": 3203954.0, "reward": 0.04193691164255142, "reward_std": 0.041769422590732574, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0014576807152479887, "rewards/logprob_reward/std": 0.0031217315699905157, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 503.0, "completions/mean_terminated_length": 503.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.41975308641975306, "grad_norm": 3.187683602245789, "kl": 0.062591552734375, "learning_rate": 4.963120618936732e-07, "loss": -0.1837, "num_tokens": 3226382.0, "reward": 0.042183056473731995, "reward_std": 0.04615149646997452, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017311744159087539, "rewards/logprob_reward/std": 0.003340093418955803, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 572.40625, "completions/mean_terminated_length": 542.300048828125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.4228395061728395, "grad_norm": 2.6112897569480227, "kl": 0.06365966796875, "learning_rate": 4.962260149432247e-07, "loss": -0.1889, "num_tokens": 3251047.0, "reward": 0.06047273054718971, "reward_std": 0.045773088932037354, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0012196984607726336, "rewards/logprob_reward/std": 0.002573272679001093, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 568.40625, "completions/mean_terminated_length": 553.7096557617188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.42592592592592593, "grad_norm": 2.469587378558464, "kl": 0.062469482421875, "learning_rate": 4.96138983344405e-07, "loss": -0.155, "num_tokens": 3275408.0, "reward": 0.06650030612945557, "reward_std": 0.04502515867352486, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0009725566487759352, "rewards/logprob_reward/std": 0.002437157789245248, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 573.4375, "completions/mean_terminated_length": 526.8275756835938, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.42901234567901236, "grad_norm": 2.328868779136458, "kl": 0.06634521484375, "learning_rate": 4.9605096744525e-07, "loss": -0.0233, "num_tokens": 3300246.0, "reward": 0.07066695392131805, "reward_std": 0.025806615129113197, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0021299482323229313, "rewards/logprob_reward/std": 0.004689618945121765, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 532.03125, "completions/mean_terminated_length": 516.1612548828125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.43209876543209874, "grad_norm": 3.333962717233176, "kl": 0.107452392578125, "learning_rate": 4.95961967597732e-07, "loss": -0.14, "num_tokens": 3323807.0, "reward": 0.05786291882395744, "reward_std": 0.04674186185002327, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017921293620020151, "rewards/logprob_reward/std": 0.0031645300332456827, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 614.34375, "completions/mean_terminated_length": 571.9655151367188, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4351851851851852, "grad_norm": 2.9684297054896156, "kl": 0.065948486328125, "learning_rate": 4.958719841577579e-07, "loss": -0.2246, "num_tokens": 3350090.0, "reward": 0.044370800256729126, "reward_std": 0.046897463500499725, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0006897771963849664, "rewards/logprob_reward/std": 0.001732326578348875, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 549.46875, "completions/mean_terminated_length": 517.8333740234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4382716049382716, "grad_norm": 3.932832666658898, "kl": 0.0792236328125, "learning_rate": 4.957810174851679e-07, "loss": -0.1534, "num_tokens": 3374729.0, "reward": 0.04233923554420471, "reward_std": 0.05221627652645111, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001904707052744925, "rewards/logprob_reward/std": 0.0034659402444958687, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 551.40625, "completions/mean_terminated_length": 551.40625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.44135802469135804, "grad_norm": 2.544294118168252, "kl": 0.081085205078125, "learning_rate": 4.956890679437345e-07, "loss": -0.0192, "num_tokens": 3398474.0, "reward": 0.06388504803180695, "reward_std": 0.0397639200091362, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015389358159154654, "rewards/logprob_reward/std": 0.0027499685529619455, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 570.96875, "completions/mean_terminated_length": 570.96875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4444444444444444, "grad_norm": 2.9359685242986684, "kl": 0.066986083984375, "learning_rate": 4.955961359011601e-07, "loss": -0.1657, "num_tokens": 3423177.0, "reward": 0.03861922770738602, "reward_std": 0.046918563544750214, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012435840908437967, "rewards/logprob_reward/std": 0.002308847848325968, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 601.78125, "completions/mean_terminated_length": 588.1612548828125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.44753086419753085, "grad_norm": 2.7023965534656407, "kl": 0.073272705078125, "learning_rate": 4.955022217290766e-07, "loss": -0.1101, "num_tokens": 3449090.0, "reward": 0.05682101473212242, "reward_std": 0.05433877557516098, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0006344596622511744, "rewards/logprob_reward/std": 0.0016948895063251257, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 560.46875, "completions/mean_terminated_length": 512.5172119140625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.4506172839506173, "grad_norm": 2.986924507376984, "kl": 0.075164794921875, "learning_rate": 4.954073258030431e-07, "loss": 0.0157, "num_tokens": 3473221.0, "reward": 0.05381591618061066, "reward_std": 0.04771006107330322, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007676836103200912, "rewards/logprob_reward/std": 0.0014980339910835028, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 553.6875, "completions/mean_terminated_length": 538.51611328125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4537037037037037, "grad_norm": 2.843594232342128, "kl": 0.07666015625, "learning_rate": 4.953114485025446e-07, "loss": -0.2028, "num_tokens": 3497243.0, "reward": 0.044478774070739746, "reward_std": 0.054053470492362976, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008097498212009668, "rewards/logprob_reward/std": 0.0022112694568932056, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 603.75, "completions/mean_terminated_length": 560.27587890625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4567901234567901, "grad_norm": 2.8092190763627793, "kl": 0.078277587890625, "learning_rate": 4.95214590210991e-07, "loss": -0.2411, "num_tokens": 3523095.0, "reward": 0.044111467897892, "reward_std": 0.04442127048969269, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0004016283492092043, "rewards/logprob_reward/std": 0.0010047410614788532, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 632.78125, "completions/mean_terminated_length": 632.78125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.45987654320987653, "grad_norm": 2.5107977402015056, "kl": 0.072479248046875, "learning_rate": 4.951167513157147e-07, "loss": -0.1486, "num_tokens": 3550028.0, "reward": 0.060559920966625214, "reward_std": 0.051685988903045654, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0013165771961212158, "rewards/logprob_reward/std": 0.004063119646161795, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 559.25, "completions/mean_terminated_length": 528.2667236328125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.46296296296296297, "grad_norm": 2.8056032826451127, "kl": 0.068939208984375, "learning_rate": 4.950179322079697e-07, "loss": -0.1228, "num_tokens": 3574460.0, "reward": 0.05674087256193161, "reward_std": 0.052086036652326584, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0005454131751321256, "rewards/logprob_reward/std": 0.001222002669237554, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 553.90625, "completions/mean_terminated_length": 505.2758483886719, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4660493827160494, "grad_norm": 3.213821541862956, "kl": 0.093719482421875, "learning_rate": 4.949181332829299e-07, "loss": -0.1993, "num_tokens": 3599049.0, "reward": 0.05664665251970291, "reward_std": 0.046812109649181366, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0004407258238643408, "rewards/logprob_reward/std": 0.001193308737128973, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 492.0, "completions/mean_terminated_length": 474.83868408203125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.4691358024691358, "grad_norm": 2.9182994246328415, "kl": 0.08349609375, "learning_rate": 4.948173549396873e-07, "loss": -0.0844, "num_tokens": 3621193.0, "reward": 0.04452269524335861, "reward_std": 0.04726994037628174, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008585481555201113, "rewards/logprob_reward/std": 0.001919503789395094, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 638.375, "completions/mean_terminated_length": 566.9629516601562, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4722222222222222, "grad_norm": 2.215355945019327, "kl": 0.07696533203125, "learning_rate": 4.947155975812506e-07, "loss": -0.1416, "num_tokens": 3648181.0, "reward": 0.050617851316928864, "reward_std": 0.03936385735869408, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0006865040631964803, "rewards/logprob_reward/std": 0.0015793000347912312, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 576.0625, "completions/mean_terminated_length": 561.6129150390625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.47530864197530864, "grad_norm": 3.4099798068408775, "kl": 0.08404541015625, "learning_rate": 4.946128616145436e-07, "loss": -0.3246, "num_tokens": 3673447.0, "reward": 0.0597553625702858, "reward_std": 0.04533667117357254, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0004226211167406291, "rewards/logprob_reward/std": 0.001463288557715714, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 551.5625, "completions/mean_terminated_length": 551.5625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4783950617283951, "grad_norm": 2.6736789485757373, "kl": 0.07171630859375, "learning_rate": 4.945091474504037e-07, "loss": -0.0012, "num_tokens": 3697693.0, "reward": 0.06379599869251251, "reward_std": 0.027524448931217194, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001439998159185052, "rewards/logprob_reward/std": 0.0025939131155610085, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 576.09375, "completions/mean_terminated_length": 576.09375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.48148148148148145, "grad_norm": 2.458439816355618, "kl": 0.08233642578125, "learning_rate": 4.944044555035793e-07, "loss": 0.0019, "num_tokens": 3723216.0, "reward": 0.06332716345787048, "reward_std": 0.03316696733236313, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0009190634591504931, "rewards/logprob_reward/std": 0.002278524450957775, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 572.71875, "completions/mean_terminated_length": 558.1612548828125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4845679012345679, "grad_norm": 2.1576719984246027, "kl": 0.076141357421875, "learning_rate": 4.9429878619273e-07, "loss": 0.0094, "num_tokens": 3747383.0, "reward": 0.04400166496634483, "reward_std": 0.03943231329321861, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0002796255284920335, "rewards/logprob_reward/std": 0.0007145073032006621, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 499.46875, "completions/mean_terminated_length": 482.5483703613281, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.4876543209876543, "grad_norm": 4.9867034473928875, "kl": 0.0755615234375, "learning_rate": 4.941921399404232e-07, "loss": -0.3322, "num_tokens": 3769338.0, "reward": 0.04752832651138306, "reward_std": 0.05379118397831917, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007259194971993566, "rewards/logprob_reward/std": 0.0020718248561024666, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 541.3125, "completions/mean_terminated_length": 525.741943359375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.49074074074074076, "grad_norm": 2.384406357181412, "kl": 0.0927734375, "learning_rate": 4.940845171731329e-07, "loss": -0.157, "num_tokens": 3793096.0, "reward": 0.07222440838813782, "reward_std": 0.03863812983036041, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.00038822778151370585, "rewards/logprob_reward/std": 0.0012460780562832952, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 530.21875, "completions/mean_terminated_length": 514.290283203125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.49382716049382713, "grad_norm": 2.677695822774252, "kl": 0.0936279296875, "learning_rate": 4.939759183212388e-07, "loss": -0.1621, "num_tokens": 3816327.0, "reward": 0.0631580650806427, "reward_std": 0.041855860501527786, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007311837980523705, "rewards/logprob_reward/std": 0.001986136194318533, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 590.59375, "completions/mean_terminated_length": 528.6785888671875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.49691358024691357, "grad_norm": 2.5034008153124625, "kl": 0.10211181640625, "learning_rate": 4.938663438190232e-07, "loss": -0.121, "num_tokens": 3841958.0, "reward": 0.054024044424295425, "reward_std": 0.03780882805585861, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0009989351965487003, "rewards/logprob_reward/std": 0.0030066664330661297, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 545.5625, "completions/mean_terminated_length": 545.5625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5, "grad_norm": 2.8793487536491207, "kl": 0.075836181640625, "learning_rate": 4.937557941046705e-07, "loss": -0.2787, "num_tokens": 3865608.0, "reward": 0.04443252086639404, "reward_std": 0.04472089558839798, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0007583519909530878, "rewards/logprob_reward/std": 0.0016288717743009329, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 600.1875, "completions/mean_terminated_length": 521.7037353515625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5030864197530864, "grad_norm": 4.3373373253550636, "kl": 0.088104248046875, "learning_rate": 4.936442696202648e-07, "loss": -0.2416, "num_tokens": 3890882.0, "reward": 0.0412057563662529, "reward_std": 0.05025993287563324, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.000645287218503654, "rewards/logprob_reward/std": 0.002340418053790927, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 687.6875, "completions/mean_terminated_length": 610.0769653320312, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5061728395061729, "grad_norm": 2.2070094386749357, "kl": 0.07977294921875, "learning_rate": 4.935317708117881e-07, "loss": -0.1125, "num_tokens": 3919648.0, "reward": 0.047586582601070404, "reward_std": 0.04772721976041794, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.000790646648965776, "rewards/logprob_reward/std": 0.0019396664574742317, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 625.34375, "completions/mean_terminated_length": 598.7667236328125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5092592592592593, "grad_norm": 2.4562094566655577, "kl": 0.100830078125, "learning_rate": 4.934182981291187e-07, "loss": -0.0291, "num_tokens": 3946491.0, "reward": 0.05210525542497635, "reward_std": 0.040608860552310944, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0023391738068312407, "rewards/logprob_reward/std": 0.00513032078742981, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 635.0, "completions/mean_terminated_length": 579.4285888671875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5123456790123457, "grad_norm": 2.615833483797188, "kl": 0.093414306640625, "learning_rate": 4.933038520260299e-07, "loss": -0.0896, "num_tokens": 3973851.0, "reward": 0.038603805005550385, "reward_std": 0.032515063881874084, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012264486867934465, "rewards/logprob_reward/std": 0.0025094689335674047, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 587.21875, "completions/mean_terminated_length": 542.0344848632812, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5154320987654321, "grad_norm": 2.715888113920899, "kl": 0.09375, "learning_rate": 4.931884329601869e-07, "loss": -0.1415, "num_tokens": 3999294.0, "reward": 0.04511518031358719, "reward_std": 0.052907492965459824, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0015168681275099516, "rewards/logprob_reward/std": 0.004005419556051493, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 597.84375, "completions/mean_terminated_length": 584.0967407226562, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5185185185185185, "grad_norm": 2.7278046327751158, "kl": 0.09930419921875, "learning_rate": 4.930720413931463e-07, "loss": -0.1705, "num_tokens": 4025385.0, "reward": 0.04705619066953659, "reward_std": 0.04059264063835144, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.00020132274948991835, "rewards/logprob_reward/std": 0.0006632709410041571, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 618.6875, "completions/mean_terminated_length": 605.6129150390625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5216049382716049, "grad_norm": 1.982411689281292, "kl": 0.09539794921875, "learning_rate": 4.929546777903534e-07, "loss": -0.2065, "num_tokens": 4051479.0, "reward": 0.06299344450235367, "reward_std": 0.04481876641511917, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0005482725682668388, "rewards/logprob_reward/std": 0.0015520042506977916, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 590.0, "completions/mean_terminated_length": 561.0667114257812, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5246913580246914, "grad_norm": 2.572582741579363, "kl": 0.0904541015625, "learning_rate": 4.928363426211407e-07, "loss": -0.1826, "num_tokens": 4076823.0, "reward": 0.04143887758255005, "reward_std": 0.04543890058994293, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0009043083409778774, "rewards/logprob_reward/std": 0.0023466269485652447, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 541.5625, "completions/mean_terminated_length": 541.5625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5277777777777778, "grad_norm": 2.731307993510179, "kl": 0.093017578125, "learning_rate": 4.927170363587262e-07, "loss": -0.1749, "num_tokens": 4100509.0, "reward": 0.050791189074516296, "reward_std": 0.039393700659275055, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0008790968568064272, "rewards/logprob_reward/std": 0.0023332065902650356, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 597.0625, "completions/mean_terminated_length": 568.6000366210938, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5308641975308642, "grad_norm": 2.1871290908855476, "kl": 0.099365234375, "learning_rate": 4.925967594802109e-07, "loss": -0.1, "num_tokens": 4125855.0, "reward": 0.05643356591463089, "reward_std": 0.0411858856678009, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00020396310719661415, "rewards/logprob_reward/std": 0.0011045490391552448, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 597.21875, "completions/mean_terminated_length": 597.21875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5339506172839507, "grad_norm": 2.44677423545142, "kl": 0.08740234375, "learning_rate": 4.924755124665774e-07, "loss": -0.1932, "num_tokens": 4151322.0, "reward": 0.05687759071588516, "reward_std": 0.03429555520415306, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0006973271956667304, "rewards/logprob_reward/std": 0.0018503706669434905, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 656.5625, "completions/mean_terminated_length": 632.0667114257812, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5370370370370371, "grad_norm": 1.9145257120214048, "kl": 0.08758544921875, "learning_rate": 4.923532958026878e-07, "loss": -0.0852, "num_tokens": 4178220.0, "reward": 0.0600404292345047, "reward_std": 0.032063812017440796, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0007393647101707757, "rewards/logprob_reward/std": 0.0033789281733334064, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 566.8125, "completions/mean_terminated_length": 552.0645141601562, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5401234567901234, "grad_norm": 3.707207888615661, "kl": 0.0899658203125, "learning_rate": 4.922301099772821e-07, "loss": -0.3079, "num_tokens": 4202470.0, "reward": 0.04734790325164795, "reward_std": 0.053314127027988434, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0005254444549791515, "rewards/logprob_reward/std": 0.001620819210074842, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 617.75, "completions/mean_terminated_length": 590.6666870117188, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5432098765432098, "grad_norm": 2.4510673949748543, "kl": 0.10162353515625, "learning_rate": 4.921059554829753e-07, "loss": -0.0792, "num_tokens": 4228838.0, "reward": 0.04168153554201126, "reward_std": 0.04795132577419281, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001173929194919765, "rewards/logprob_reward/std": 0.0027986096683889627, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 608.875, "completions/mean_terminated_length": 608.875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.5462962962962963, "grad_norm": 2.654131100571325, "kl": 0.11077880859375, "learning_rate": 4.91980832816257e-07, "loss": -0.1494, "num_tokens": 4254806.0, "reward": 0.057833198457956314, "reward_std": 0.0521986298263073, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001759108155965805, "rewards/logprob_reward/std": 0.0036081629805266857, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 533.8333740234375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5493827160493827, "grad_norm": 2.870744145847357, "kl": 0.09637451171875, "learning_rate": 4.918547424774873e-07, "loss": -0.0505, "num_tokens": 4279321.0, "reward": 0.05646614730358124, "reward_std": 0.047001827508211136, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00024016425595618784, "rewards/logprob_reward/std": 0.001035280991345644, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 572.21875, "completions/mean_terminated_length": 557.6451416015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5524691358024691, "grad_norm": 2.8791619627740213, "kl": 0.10540771484375, "learning_rate": 4.917276849708972e-07, "loss": -0.1143, "num_tokens": 4304416.0, "reward": 0.04742274433374405, "reward_std": 0.05460613593459129, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0006086068460717797, "rewards/logprob_reward/std": 0.0015793552156537771, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 567.375, "completions/mean_terminated_length": 536.933349609375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5555555555555556, "grad_norm": 2.760430399481095, "kl": 0.09259033203125, "learning_rate": 4.915996608045842e-07, "loss": -0.2382, "num_tokens": 4328968.0, "reward": 0.05662311986088753, "reward_std": 0.044914934784173965, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0004145758575759828, "rewards/logprob_reward/std": 0.0011752157006412745, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 564.5625, "completions/mean_terminated_length": 564.5625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.558641975308642, "grad_norm": 2.754564018346516, "kl": 0.087188720703125, "learning_rate": 4.914706704905125e-07, "loss": -0.1993, "num_tokens": 4353026.0, "reward": 0.04729136824607849, "reward_std": 0.05444261059165001, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0004626337031368166, "rewards/logprob_reward/std": 0.0011854376643896103, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 607.875, "completions/mean_terminated_length": 607.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5617283950617284, "grad_norm": 2.9974037878149566, "kl": 0.086883544921875, "learning_rate": 4.913407145445093e-07, "loss": -0.2072, "num_tokens": 4379070.0, "reward": 0.05068790540099144, "reward_std": 0.03280695900321007, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0007643378921784461, "rewards/logprob_reward/std": 0.0019737649708986282, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 555.0, "completions/mean_terminated_length": 523.7333374023438, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5648148148148148, "grad_norm": 2.7019698751031886, "kl": 0.09429931640625, "learning_rate": 4.912097934862632e-07, "loss": -0.2045, "num_tokens": 4403086.0, "reward": 0.05067024007439613, "reward_std": 0.051602087914943695, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0007447098614647985, "rewards/logprob_reward/std": 0.0020433899480849504, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 653.46875, "completions/mean_terminated_length": 628.7667236328125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.5679012345679012, "grad_norm": 2.8024004394039803, "kl": 0.08929443359375, "learning_rate": 4.910779078393228e-07, "loss": -0.1117, "num_tokens": 4430317.0, "reward": 0.04114678502082825, "reward_std": 0.04779033362865448, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0005797599442303181, "rewards/logprob_reward/std": 0.001542158075608313, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 598.03125, "completions/mean_terminated_length": 553.9655151367188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5709876543209876, "grad_norm": 3.139001329290585, "kl": 0.09881591796875, "learning_rate": 4.909450581310935e-07, "loss": -0.1868, "num_tokens": 4455838.0, "reward": 0.046875, "reward_std": 0.04568375647068024, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 578.53125, "completions/mean_terminated_length": 578.53125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5740740740740741, "grad_norm": 2.715790957358654, "kl": 0.09686279296875, "learning_rate": 4.908112448928363e-07, "loss": -0.1373, "num_tokens": 4480711.0, "reward": 0.053631968796253204, "reward_std": 0.04750828444957733, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0005632966640405357, "rewards/logprob_reward/std": 0.001424881280399859, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 578.09375, "completions/mean_terminated_length": 548.36669921875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5771604938271605, "grad_norm": 2.4131258394211077, "kl": 0.10272216796875, "learning_rate": 4.906764686596651e-07, "loss": -0.2549, "num_tokens": 4505642.0, "reward": 0.04702848196029663, "reward_std": 0.04592697322368622, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0001705329050309956, "rewards/logprob_reward/std": 0.0006710395682603121, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 546.40625, "completions/mean_terminated_length": 546.40625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5802469135802469, "grad_norm": 2.708093528492381, "kl": 0.0911865234375, "learning_rate": 4.90540729970545e-07, "loss": -0.0681, "num_tokens": 4529631.0, "reward": 0.05057809129357338, "reward_std": 0.04895421117544174, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0006423235754482448, "rewards/logprob_reward/std": 0.0017255189595744014, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 655.625, "completions/mean_terminated_length": 603.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5833333333333334, "grad_norm": 2.3868795689153552, "kl": 0.109130859375, "learning_rate": 4.904040293682897e-07, "loss": -0.1324, "num_tokens": 4557343.0, "reward": 0.0293910950422287, "reward_std": 0.03942374140024185, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0014067735755816102, "rewards/logprob_reward/std": 0.004575551021844149, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 580.0, "completions/mean_terminated_length": 550.4000244140625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5864197530864198, "grad_norm": 3.073421440606636, "kl": 0.11328125, "learning_rate": 4.902663673995597e-07, "loss": -0.0849, "num_tokens": 4582635.0, "reward": 0.05398586764931679, "reward_std": 0.04952556639909744, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0009565172949805856, "rewards/logprob_reward/std": 0.0022744557354599237, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 607.125, "completions/mean_terminated_length": 593.6774291992188, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.5895061728395061, "grad_norm": 2.599229833389622, "kl": 0.10601806640625, "learning_rate": 4.9012774461486e-07, "loss": -0.1981, "num_tokens": 4609015.0, "reward": 0.04128945991396904, "reward_std": 0.04696265980601311, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0007382894400507212, "rewards/logprob_reward/std": 0.0021007144823670387, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 553.3125, "completions/mean_terminated_length": 538.1290283203125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5925925925925926, "grad_norm": 3.088150746873547, "kl": 0.10693359375, "learning_rate": 4.899881615685376e-07, "loss": -0.2816, "num_tokens": 4633257.0, "reward": 0.04374999925494194, "reward_std": 0.051933757960796356, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 641.96875, "completions/mean_terminated_length": 602.4483032226562, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.595679012345679, "grad_norm": 2.4233293562287863, "kl": 0.1162109375, "learning_rate": 4.898476188187798e-07, "loss": -0.0939, "num_tokens": 4660224.0, "reward": 0.05658697336912155, "reward_std": 0.04677470028400421, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0003744146670214832, "rewards/logprob_reward/std": 0.0017363273072987795, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 588.875, "completions/mean_terminated_length": 574.8386840820312, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5987654320987654, "grad_norm": 2.024359732059673, "kl": 0.09490966796875, "learning_rate": 4.897061169276118e-07, "loss": -0.0746, "num_tokens": 4685660.0, "reward": 0.05633340775966644, "reward_std": 0.03949132561683655, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 9.267372661270201e-05, "rewards/logprob_reward/std": 0.0005242417682893574, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 535.625, "completions/mean_terminated_length": 519.8709716796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6018518518518519, "grad_norm": 2.5964712865604556, "kl": 0.09527587890625, "learning_rate": 4.895636564608942e-07, "loss": -0.2392, "num_tokens": 4709340.0, "reward": 0.04751855507493019, "reward_std": 0.04732377082109451, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007150607998482883, "rewards/logprob_reward/std": 0.0019512978615239263, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 594.09375, "completions/mean_terminated_length": 565.433349609375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6049382716049383, "grad_norm": 2.571796913593863, "kl": 0.108642578125, "learning_rate": 4.894202379883206e-07, "loss": -0.1736, "num_tokens": 4735019.0, "reward": 0.03775563836097717, "reward_std": 0.04141201078891754, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00028404415934346616, "rewards/logprob_reward/std": 0.0008146684267558157, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 600.46875, "completions/mean_terminated_length": 600.46875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6080246913580247, "grad_norm": 2.9396181866146875, "kl": 0.10418701171875, "learning_rate": 4.892758620834165e-07, "loss": -0.1014, "num_tokens": 4760726.0, "reward": 0.05058019980788231, "reward_std": 0.05364552140235901, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0006446627667173743, "rewards/logprob_reward/std": 0.001384554896503687, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 620.0, "completions/mean_terminated_length": 578.2069091796875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6111111111111112, "grad_norm": 3.4608153300475, "kl": 0.11517333984375, "learning_rate": 4.891305293235351e-07, "loss": -0.2337, "num_tokens": 4787374.0, "reward": 0.03217801824212074, "reward_std": 0.040280796587467194, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0010311321821063757, "rewards/logprob_reward/std": 0.00260757259093225, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 588.5625, "completions/mean_terminated_length": 588.5625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6141975308641975, "grad_norm": 3.293033384055398, "kl": 0.11273193359375, "learning_rate": 4.889842402898569e-07, "loss": -0.2187, "num_tokens": 4812480.0, "reward": 0.056570760905742645, "reward_std": 0.03236209228634834, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0003563996870070696, "rewards/logprob_reward/std": 0.0010288916528224945, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 556.40625, "completions/mean_terminated_length": 541.3225708007812, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6172839506172839, "grad_norm": 2.8393316824113106, "kl": 0.11126708984375, "learning_rate": 4.888369955673858e-07, "loss": -0.2211, "num_tokens": 4836969.0, "reward": 0.04425449296832085, "reward_std": 0.053492575883865356, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0005605471087619662, "rewards/logprob_reward/std": 0.0018193412106484175, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 677.90625, "completions/mean_terminated_length": 598.0385131835938, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6203703703703703, "grad_norm": 2.4586059669310236, "kl": 0.104949951171875, "learning_rate": 4.88688795744948e-07, "loss": -0.033, "num_tokens": 4865474.0, "reward": 0.04104198142886162, "reward_std": 0.04574280232191086, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00046331313205882907, "rewards/logprob_reward/std": 0.0013247487368062139, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 590.03125, "completions/mean_terminated_length": 561.1000366210938, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6234567901234568, "grad_norm": 3.0975596149761806, "kl": 0.126708984375, "learning_rate": 4.885396414151888e-07, "loss": -0.277, "num_tokens": 4891327.0, "reward": 0.05083741247653961, "reward_std": 0.05358371138572693, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0009304598206654191, "rewards/logprob_reward/std": 0.0025314248632639647, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 645.1875, "completions/mean_terminated_length": 606.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6265432098765432, "grad_norm": 2.969395557908256, "kl": 0.10601806640625, "learning_rate": 4.883895331745707e-07, "loss": -0.2939, "num_tokens": 4918445.0, "reward": 0.03783799707889557, "reward_std": 0.053672000765800476, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0003755528596229851, "rewards/logprob_reward/std": 0.0009694079053588212, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 602.09375, "completions/mean_terminated_length": 558.4483032226562, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6296296296296297, "grad_norm": 2.6446376143799233, "kl": 0.106781005859375, "learning_rate": 4.882384716233709e-07, "loss": -0.0601, "num_tokens": 4944108.0, "reward": 0.03495427593588829, "reward_std": 0.03947734832763672, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0006436366238631308, "rewards/logprob_reward/std": 0.0021368886809796095, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 593.34375, "completions/mean_terminated_length": 579.4515991210938, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6327160493827161, "grad_norm": 2.1730488753880812, "kl": 0.10675048828125, "learning_rate": 4.880864573656785e-07, "loss": -0.1947, "num_tokens": 4969399.0, "reward": 0.053266704082489014, "reward_std": 0.0473971962928772, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0001574465713929385, "rewards/logprob_reward/std": 0.0007713346858508885, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 616.90625, "completions/mean_terminated_length": 574.7930908203125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.6358024691358025, "grad_norm": 2.732093927778844, "kl": 0.10491943359375, "learning_rate": 4.879334910093926e-07, "loss": -0.1368, "num_tokens": 4995388.0, "reward": 0.04427650570869446, "reward_std": 0.044918712228536606, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.000585003406740725, "rewards/logprob_reward/std": 0.0019014464924111962, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 561.5, "completions/mean_terminated_length": 475.85186767578125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6388888888888888, "grad_norm": 2.1492116299588457, "kl": 0.125244140625, "learning_rate": 4.877795731662202e-07, "loss": -0.2233, "num_tokens": 5019556.0, "reward": 0.0563012957572937, "reward_std": 0.041363805532455444, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 5.699522444047034e-05, "rewards/logprob_reward/std": 0.00022817167337052524, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 644.6875, "completions/mean_terminated_length": 619.4000244140625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.6419753086419753, "grad_norm": 2.4248377093804447, "kl": 0.110107421875, "learning_rate": 4.876247044516724e-07, "loss": -0.0783, "num_tokens": 5046702.0, "reward": 0.028475604951381683, "reward_std": 0.031558647751808167, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0003895602421835065, "rewards/logprob_reward/std": 0.0011856276541948318, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 627.78125, "completions/mean_terminated_length": 571.1785888671875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6450617283950617, "grad_norm": 2.7390330955608304, "kl": 0.1085205078125, "learning_rate": 4.874688854850635e-07, "loss": -0.1363, "num_tokens": 5073107.0, "reward": 0.03207886219024658, "reward_std": 0.039595384150743484, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0009209603886120021, "rewards/logprob_reward/std": 0.002882851753383875, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 660.65625, "completions/mean_terminated_length": 539.5416870117188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6481481481481481, "grad_norm": 2.8889382030754427, "kl": 0.12890625, "learning_rate": 4.873121168895075e-07, "loss": -0.1702, "num_tokens": 5101032.0, "reward": 0.031383778899908066, "reward_std": 0.04649616777896881, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00014864497643429786, "rewards/logprob_reward/std": 0.0005854673217982054, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 616.3125, "completions/mean_terminated_length": 616.3125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6512345679012346, "grad_norm": 2.8242270768474826, "kl": 0.10198974609375, "learning_rate": 4.87154399291916e-07, "loss": -0.1788, "num_tokens": 5126994.0, "reward": 0.04099217802286148, "reward_std": 0.047876451164484024, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0004079728969372809, "rewards/logprob_reward/std": 0.0014560659183189273, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 572.5, "completions/mean_terminated_length": 572.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.654320987654321, "grad_norm": 2.6266475279642494, "kl": 0.12945556640625, "learning_rate": 4.869957333229955e-07, "loss": -0.2273, "num_tokens": 5151918.0, "reward": 0.04423429071903229, "reward_std": 0.04626988619565964, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0005380974616855383, "rewards/logprob_reward/std": 0.002099623205140233, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 633.0625, "completions/mean_terminated_length": 620.4515991210938, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6574074074074074, "grad_norm": 2.484121195726362, "kl": 0.11895751953125, "learning_rate": 4.868361196172453e-07, "loss": -0.1246, "num_tokens": 5178844.0, "reward": 0.05328449606895447, "reward_std": 0.04550441354513168, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.00017721450421959162, "rewards/logprob_reward/std": 0.0010024766670539975, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 671.71875, "completions/mean_terminated_length": 635.27587890625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.6604938271604939, "grad_norm": 2.1254880703665893, "kl": 0.10760498046875, "learning_rate": 4.866755588129542e-07, "loss": -0.0416, "num_tokens": 5206727.0, "reward": 0.05987684428691864, "reward_std": 0.04718624800443649, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0005576012772507966, "rewards/logprob_reward/std": 0.0027992515824735165, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 577.40625, "completions/mean_terminated_length": 563.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6635802469135802, "grad_norm": 2.6988438433636692, "kl": 0.163818359375, "learning_rate": 4.86514051552199e-07, "loss": -0.1574, "num_tokens": 5232276.0, "reward": 0.060177482664585114, "reward_std": 0.03958515450358391, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0008916450897231698, "rewards/logprob_reward/std": 0.002548168646171689, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 630.15625, "completions/mean_terminated_length": 539.2692260742188, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6666666666666666, "grad_norm": 2.314573383950194, "kl": 0.118896484375, "learning_rate": 4.863515984808408e-07, "loss": -0.1538, "num_tokens": 5258621.0, "reward": 0.031334538012742996, "reward_std": 0.03406674042344093, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 9.393022628501058e-05, "rewards/logprob_reward/std": 0.0003963824128732085, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 631.0, "completions/mean_terminated_length": 604.800048828125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6697530864197531, "grad_norm": 2.435706130506513, "kl": 0.139892578125, "learning_rate": 4.861882002485234e-07, "loss": -0.1688, "num_tokens": 5285157.0, "reward": 0.03769915550947189, "reward_std": 0.04663718491792679, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0002212801482528448, "rewards/logprob_reward/std": 0.0008804204408079386, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 643.03125, "completions/mean_terminated_length": 617.6333618164062, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.6728395061728395, "grad_norm": 2.410842172320329, "kl": 0.12322998046875, "learning_rate": 4.860238575086699e-07, "loss": -0.1206, "num_tokens": 5312018.0, "reward": 0.05332405865192413, "reward_std": 0.046081870794296265, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.00022117517073638737, "rewards/logprob_reward/std": 0.0012511557433754206, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 641.78125, "completions/mean_terminated_length": 629.4515991210938, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6759259259259259, "grad_norm": 2.4154129091967533, "kl": 0.1409912109375, "learning_rate": 4.858585709184806e-07, "loss": -0.0788, "num_tokens": 5338995.0, "reward": 0.034696124494075775, "reward_std": 0.04090491682291031, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0003568080719560385, "rewards/logprob_reward/std": 0.0017188823549076915, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 542.375, "completions/mean_terminated_length": 542.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6790123456790124, "grad_norm": 2.0939959907633647, "kl": 0.141845703125, "learning_rate": 4.856923411389302e-07, "loss": -0.1423, "num_tokens": 5362559.0, "reward": 0.0443311408162117, "reward_std": 0.04073891416192055, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0006457103881984949, "rewards/logprob_reward/std": 0.001709131756797433, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 703.78125, "completions/mean_terminated_length": 629.8846435546875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6820987654320988, "grad_norm": 2.094011752914783, "kl": 0.1314697265625, "learning_rate": 4.855251688347653e-07, "loss": -0.0918, "num_tokens": 5391912.0, "reward": 0.04383155703544617, "reward_std": 0.0465577095746994, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 9.061659511644393e-05, "rewards/logprob_reward/std": 0.000512604892719537, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 637.125, "completions/mean_terminated_length": 624.6451416015625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6851851851851852, "grad_norm": 2.128294726656678, "kl": 0.1490478515625, "learning_rate": 4.853570546745014e-07, "loss": 0.005, "num_tokens": 5418584.0, "reward": 0.06256760656833649, "reward_std": 0.04150272160768509, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 7.511388685088605e-05, "rewards/logprob_reward/std": 0.0004249082994647324, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 690.84375, "completions/mean_terminated_length": 629.1481323242188, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6882716049382716, "grad_norm": 2.3503052167681093, "kl": 0.1336669921875, "learning_rate": 4.851879993304208e-07, "loss": -0.0943, "num_tokens": 5446951.0, "reward": 0.03453109413385391, "reward_std": 0.045533955097198486, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.000173439402715303, "rewards/logprob_reward/std": 0.0007092207088135183, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 574.375, "completions/mean_terminated_length": 559.8709716796875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.691358024691358, "grad_norm": 2.6499183684428376, "kl": 0.1510009765625, "learning_rate": 4.850180034785691e-07, "loss": -0.0877, "num_tokens": 5472051.0, "reward": 0.04117923229932785, "reward_std": 0.041509099304676056, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0006158142350614071, "rewards/logprob_reward/std": 0.0034835711121559143, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 609.3125, "completions/mean_terminated_length": 595.9354858398438, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6944444444444444, "grad_norm": 2.155445771776111, "kl": 0.14617919921875, "learning_rate": 4.848470677987532e-07, "loss": -0.2251, "num_tokens": 5497749.0, "reward": 0.056270308792591095, "reward_std": 0.046627260744571686, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 2.2565931431017816e-05, "rewards/logprob_reward/std": 0.00012765217979904264, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 671.8125, "completions/mean_terminated_length": 606.5925903320312, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6975308641975309, "grad_norm": 2.274814183916674, "kl": 0.140380859375, "learning_rate": 4.846751929745383e-07, "loss": -0.0308, "num_tokens": 5525995.0, "reward": 0.04077719897031784, "reward_std": 0.04752274602651596, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00016910559497773647, "rewards/logprob_reward/std": 0.0009566056542098522, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 622.125, "completions/mean_terminated_length": 580.5516967773438, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7006172839506173, "grad_norm": 2.9197701183067477, "kl": 0.166259765625, "learning_rate": 4.845023796932454e-07, "loss": -0.1223, "num_tokens": 5552743.0, "reward": 0.05314267426729202, "reward_std": 0.040380291640758514, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 1.963327122211922e-05, "rewards/logprob_reward/std": 0.00011106255260528997, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 589.125, "completions/mean_terminated_length": 560.1333618164062, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.7037037037037037, "grad_norm": 2.424408425246981, "kl": 0.1558837890625, "learning_rate": 4.84328628645948e-07, "loss": -0.0422, "num_tokens": 5577511.0, "reward": 0.037620242685079575, "reward_std": 0.04844558984041214, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00013360078446567059, "rewards/logprob_reward/std": 0.00046856305561959743, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 596.21875, "completions/mean_terminated_length": 582.4193115234375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7067901234567902, "grad_norm": 2.062889286939384, "kl": 0.16064453125, "learning_rate": 4.841539405274698e-07, "loss": -0.069, "num_tokens": 5603286.0, "reward": 0.05941256880760193, "reward_std": 0.04757439345121384, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 4.17455485148821e-05, "rewards/logprob_reward/std": 0.00023614846577402204, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 665.03125, "completions/mean_terminated_length": 613.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7098765432098766, "grad_norm": 2.3825616687668183, "kl": 0.1590576171875, "learning_rate": 4.839783160363821e-07, "loss": -0.1477, "num_tokens": 5631183.0, "reward": 0.031745754182338715, "reward_std": 0.0247428547590971, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0005508353933691978, "rewards/logprob_reward/std": 0.0031159953214228153, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 616.21875, "completions/mean_terminated_length": 540.7037353515625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7129629629629629, "grad_norm": 2.285236303731177, "kl": 0.1536865234375, "learning_rate": 4.838017558750004e-07, "loss": -0.0934, "num_tokens": 5657486.0, "reward": 0.03437500074505806, "reward_std": 0.03125, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 683.65625, "completions/mean_terminated_length": 635.0357666015625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7160493827160493, "grad_norm": 2.0886870733529688, "kl": 0.1519775390625, "learning_rate": 4.836242607493819e-07, "loss": -0.1256, "num_tokens": 5686087.0, "reward": 0.04076904058456421, "reward_std": 0.04559372365474701, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00016004232747945935, "rewards/logprob_reward/std": 0.0009053361136466265, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 652.1875, "completions/mean_terminated_length": 627.4000244140625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7191358024691358, "grad_norm": 1.9978822850201081, "kl": 0.15631103515625, "learning_rate": 4.834458313693228e-07, "loss": -0.1412, "num_tokens": 5713269.0, "reward": 0.04694700241088867, "reward_std": 0.047571003437042236, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 8.000048546819016e-05, "rewards/logprob_reward/std": 0.0004525511176325381, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 625.28125, "completions/mean_terminated_length": 584.0344848632812, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7222222222222222, "grad_norm": 2.1953735198767794, "kl": 0.163818359375, "learning_rate": 4.832664684483555e-07, "loss": -0.1073, "num_tokens": 5739410.0, "reward": 0.03125, "reward_std": 0.039433758705854416, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 605.15625, "completions/mean_terminated_length": 605.15625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.7253086419753086, "grad_norm": 2.350257186190819, "kl": 0.16217041015625, "learning_rate": 4.830861727037453e-07, "loss": -0.1122, "num_tokens": 5765107.0, "reward": 0.04065612331032753, "reward_std": 0.047597043216228485, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 3.458164428593591e-05, "rewards/logprob_reward/std": 0.00019562333181966096, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 599.53125, "completions/mean_terminated_length": 599.53125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7283950617283951, "grad_norm": 3.016768158405979, "kl": 0.16583251953125, "learning_rate": 4.82904944856488e-07, "loss": -0.2691, "num_tokens": 5790592.0, "reward": 0.05000000447034836, "reward_std": 0.046650636941194534, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 665.09375, "completions/mean_terminated_length": 641.1666870117188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7314814814814815, "grad_norm": 1.7930722744681806, "kl": 0.1746826171875, "learning_rate": 4.827227856313066e-07, "loss": -0.0918, "num_tokens": 5818535.0, "reward": 0.046875, "reward_std": 0.033183757215738297, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 652.34375, "completions/mean_terminated_length": 613.8965454101562, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7345679012345679, "grad_norm": 2.53665479529064, "kl": 0.16156005859375, "learning_rate": 4.825396957566491e-07, "loss": -0.1198, "num_tokens": 5846002.0, "reward": 0.056460775434970856, "reward_std": 0.05556637793779373, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00023419583158101887, "rewards/logprob_reward/std": 0.0013248117174953222, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 650.625, "completions/mean_terminated_length": 597.2857666015625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7376543209876543, "grad_norm": 2.328099770784432, "kl": 0.16650390625, "learning_rate": 4.823556759646847e-07, "loss": -0.1603, "num_tokens": 5872854.0, "reward": 0.04088599234819412, "reward_std": 0.03160203993320465, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00028999114874750376, "rewards/logprob_reward/std": 0.0012471069348976016, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 680.03125, "completions/mean_terminated_length": 616.3333129882812, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7407407407407407, "grad_norm": 2.1795366927271536, "kl": 0.1593017578125, "learning_rate": 4.821707269913016e-07, "loss": -0.1534, "num_tokens": 5901135.0, "reward": 0.040657684206962585, "reward_std": 0.038401514291763306, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 3.631383879110217e-05, "rewards/logprob_reward/std": 0.0002054220822174102, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 624.6875, "completions/mean_terminated_length": 624.6875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7438271604938271, "grad_norm": 3.011508723204471, "kl": 0.18048095703125, "learning_rate": 4.819848495761037e-07, "loss": -0.2283, "num_tokens": 5927709.0, "reward": 0.03750000149011612, "reward_std": 0.05386751517653465, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 603.53125, "completions/mean_terminated_length": 560.0344848632812, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7469135802469136, "grad_norm": 2.6892546568064466, "kl": 0.1878662109375, "learning_rate": 4.817980444624076e-07, "loss": -0.0577, "num_tokens": 5953886.0, "reward": 0.03154817223548889, "reward_std": 0.04138335958123207, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0003313017077744007, "rewards/logprob_reward/std": 0.0012565014185383916, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 611.59375, "completions/mean_terminated_length": 552.6785888671875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.75, "grad_norm": 2.3164282719071725, "kl": 0.1785888671875, "learning_rate": 4.816103123972395e-07, "loss": -0.0893, "num_tokens": 5979949.0, "reward": 0.02188117988407612, "reward_std": 0.03317964822053909, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 6.866796866233926e-06, "rewards/logprob_reward/std": 3.8844467781018466e-05, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 640.03125, "completions/mean_terminated_length": 627.6451416015625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7530864197530864, "grad_norm": 2.5360110689160513, "kl": 0.189208984375, "learning_rate": 4.814216541313329e-07, "loss": -0.119, "num_tokens": 6007210.0, "reward": 0.05000000074505806, "reward_std": 0.05386751517653465, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 574.9375, "completions/mean_terminated_length": 545.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7561728395061729, "grad_norm": 2.862147931219411, "kl": 0.1947021484375, "learning_rate": 4.812320704191252e-07, "loss": -0.1728, "num_tokens": 6032380.0, "reward": 0.04378309100866318, "reward_std": 0.04661262780427933, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 3.676979031297378e-05, "rewards/logprob_reward/std": 0.00020800135098397732, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 657.0, "completions/mean_terminated_length": 632.5333862304688, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7592592592592593, "grad_norm": 1.905636384943984, "kl": 0.1708984375, "learning_rate": 4.81041562018754e-07, "loss": -0.0456, "num_tokens": 6059756.0, "reward": 0.031550176441669464, "reward_std": 0.03263639286160469, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00033352768514305353, "rewards/logprob_reward/std": 0.0015560640022158623, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 638.84375, "completions/mean_terminated_length": 626.4193115234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7623456790123457, "grad_norm": 2.316538484469351, "kl": 0.16217041015625, "learning_rate": 4.808501296920552e-07, "loss": -0.0881, "num_tokens": 6086767.0, "reward": 0.03137172758579254, "reward_std": 0.04463999718427658, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00013525362010113895, "rewards/logprob_reward/std": 0.0007651100168004632, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 625.03125, "completions/mean_terminated_length": 598.433349609375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7654320987654321, "grad_norm": 2.787586722028665, "kl": 0.199462890625, "learning_rate": 4.806577742045593e-07, "loss": -0.117, "num_tokens": 6113352.0, "reward": 0.04375000298023224, "reward_std": 0.039433758705854416, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 665.1875, "completions/mean_terminated_length": 628.0689697265625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7685185185185185, "grad_norm": 2.0220631710175896, "kl": 0.179443359375, "learning_rate": 4.804644963254887e-07, "loss": -0.0636, "num_tokens": 6140810.0, "reward": 0.04062499850988388, "reward_std": 0.04761751741170883, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 559.96875, "completions/mean_terminated_length": 545.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7716049382716049, "grad_norm": 2.0315025235024637, "kl": 0.17327880859375, "learning_rate": 4.80270296827754e-07, "loss": -0.1317, "num_tokens": 6164657.0, "reward": 0.046875, "reward_std": 0.04568375647068024, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 655.9375, "completions/mean_terminated_length": 617.862060546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7746913580246914, "grad_norm": 1.5902732578748726, "kl": 0.1885986328125, "learning_rate": 4.800751764879516e-07, "loss": 0.0153, "num_tokens": 6191683.0, "reward": 0.05624999850988388, "reward_std": 0.034150637686252594, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 648.65625, "completions/mean_terminated_length": 609.8275756835938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7777777777777778, "grad_norm": 2.1517295561708334, "kl": 0.177978515625, "learning_rate": 4.798791360863602e-07, "loss": -0.0841, "num_tokens": 6219104.0, "reward": 0.028237810358405113, "reward_std": 0.04027276858687401, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.00012534351844806224, "rewards/logprob_reward/std": 0.00070905004395172, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 709.625, "completions/mean_terminated_length": 651.4074096679688, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.7808641975308642, "grad_norm": 2.1099299720320883, "kl": 0.17547607421875, "learning_rate": 4.796821764069378e-07, "loss": -0.0995, "num_tokens": 6248428.0, "reward": 0.031371042132377625, "reward_std": 0.0465136282145977, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00013449120160657912, "rewards/logprob_reward/std": 0.0007607970619574189, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 633.96875, "completions/mean_terminated_length": 578.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7839506172839507, "grad_norm": 1.9882778612144554, "kl": 0.1922607421875, "learning_rate": 4.794842982373188e-07, "loss": -0.0389, "num_tokens": 6275199.0, "reward": 0.046908486634492874, "reward_std": 0.040423277765512466, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 3.720493259606883e-05, "rewards/logprob_reward/std": 0.0002104628802044317, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 644.5625, "completions/mean_terminated_length": 619.2667236328125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7870370370370371, "grad_norm": 2.14624151505934, "kl": 0.1773681640625, "learning_rate": 4.7928550236881e-07, "loss": -0.115, "num_tokens": 6302429.0, "reward": 0.03125, "reward_std": 0.039433758705854416, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 629.1875, "completions/mean_terminated_length": 572.7857666015625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7901234567901234, "grad_norm": 1.6876928844785541, "kl": 0.1802978515625, "learning_rate": 4.790857895963888e-07, "loss": -0.1124, "num_tokens": 6329195.0, "reward": 0.03437500074505806, "reward_std": 0.033183757215738297, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 717.5625, "completions/mean_terminated_length": 631.760009765625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7932098765432098, "grad_norm": 2.0101325941957104, "kl": 0.1756591796875, "learning_rate": 4.788851607186988e-07, "loss": -0.0888, "num_tokens": 6359709.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 693.03125, "completions/mean_terminated_length": 645.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7962962962962963, "grad_norm": 1.9579710425311287, "kl": 0.180419921875, "learning_rate": 4.786836165380472e-07, "loss": -0.08, "num_tokens": 6388346.0, "reward": 0.03446173667907715, "reward_std": 0.040301889181137085, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 9.637646871851757e-05, "rewards/logprob_reward/std": 0.0005451876204460859, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 709.34375, "completions/mean_terminated_length": 636.7307739257812, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7993827160493827, "grad_norm": 1.7946421677104327, "kl": 0.15673828125, "learning_rate": 4.784811578604013e-07, "loss": -0.0107, "num_tokens": 6417713.0, "reward": 0.03437500074505806, "reward_std": 0.03846687823534012, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 690.03125, "completions/mean_terminated_length": 667.7667236328125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.8024691358024691, "grad_norm": 2.0942031317541856, "kl": 0.16387939453125, "learning_rate": 4.782777854953857e-07, "loss": -0.0174, "num_tokens": 6446446.0, "reward": 0.022133365273475647, "reward_std": 0.033654019236564636, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0002870730822905898, "rewards/logprob_reward/std": 0.0015146363293752074, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 735.9375, "completions/mean_terminated_length": 682.5925903320312, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.8055555555555556, "grad_norm": 2.042868542930924, "kl": 0.175537109375, "learning_rate": 4.780735002562785e-07, "loss": -0.0075, "num_tokens": 6476684.0, "reward": 0.03125, "reward_std": 0.046650636941194534, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 655.09375, "completions/mean_terminated_length": 616.9310302734375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.808641975308642, "grad_norm": 2.447517932403874, "kl": 0.1767578125, "learning_rate": 4.778683029600089e-07, "loss": -0.1325, "num_tokens": 6504075.0, "reward": 0.0375194288790226, "reward_std": 0.05192091315984726, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 2.1584961359621957e-05, "rewards/logprob_reward/std": 0.00012210298154968768, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 715.09375, "completions/mean_terminated_length": 657.888916015625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.8117283950617284, "grad_norm": 1.875741545500234, "kl": 0.18011474609375, "learning_rate": 4.776621944271526e-07, "loss": -0.0333, "num_tokens": 6533122.0, "reward": 0.03437500074505806, "reward_std": 0.03846687823534012, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 682.5625, "completions/mean_terminated_length": 603.7692260742188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8148148148148148, "grad_norm": 1.8667237803366017, "kl": 0.1600341796875, "learning_rate": 4.774551754819299e-07, "loss": -0.0557, "num_tokens": 6561848.0, "reward": 0.025137821212410927, "reward_std": 0.03209925442934036, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0001531346351839602, "rewards/logprob_reward/std": 0.0006108815432526171, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 651.21875, "completions/mean_terminated_length": 626.36669921875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8179012345679012, "grad_norm": 2.2028515956195074, "kl": 0.180908203125, "learning_rate": 4.772472469522015e-07, "loss": -0.0599, "num_tokens": 6589463.0, "reward": 0.04062499850988388, "reward_std": 0.04761751741170883, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 614.375, "completions/mean_terminated_length": 601.1612548828125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8209876543209876, "grad_norm": 1.7546127139870733, "kl": 0.1773681640625, "learning_rate": 4.770384096694658e-07, "loss": -0.087, "num_tokens": 6615663.0, "reward": 0.0375639870762825, "reward_std": 0.039305780082941055, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 7.109808211680502e-05, "rewards/logprob_reward/std": 0.00040219147922471166, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 612.71875, "completions/mean_terminated_length": 612.71875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8240740740740741, "grad_norm": 2.473014319654774, "kl": 0.195068359375, "learning_rate": 4.7682866446885475e-07, "loss": -0.2364, "num_tokens": 6641934.0, "reward": 0.05369546636939049, "reward_std": 0.051967013627290726, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0006338492967188358, "rewards/logprob_reward/std": 0.0025384912732988596, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 631.09375, "completions/mean_terminated_length": 604.9000244140625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8271604938271605, "grad_norm": 1.9919107132515261, "kl": 0.1748046875, "learning_rate": 4.766180121891316e-07, "loss": -0.1175, "num_tokens": 6668405.0, "reward": 0.04062500223517418, "reward_std": 0.04040063917636871, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 689.53125, "completions/mean_terminated_length": 667.2333984375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8302469135802469, "grad_norm": 1.9712023706550943, "kl": 0.160888671875, "learning_rate": 4.7640645367268663e-07, "loss": -0.0685, "num_tokens": 6696922.0, "reward": 0.03437500074505806, "reward_std": 0.04040063917636871, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 689.0, "completions/mean_terminated_length": 666.6666870117188, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.8333333333333334, "grad_norm": 2.2532725263612514, "kl": 0.1661376953125, "learning_rate": 4.761939897655343e-07, "loss": -0.2022, "num_tokens": 6725354.0, "reward": 0.04394396394491196, "reward_std": 0.048724181950092316, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00021551498502958566, "rewards/logprob_reward/std": 0.0012191367568448186, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 748.59375, "completions/mean_terminated_length": 730.2333984375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.8364197530864198, "grad_norm": 1.2982697679548028, "kl": 0.1802978515625, "learning_rate": 4.7598062131730943e-07, "loss": -0.0056, "num_tokens": 6755825.0, "reward": 0.01875000074505806, "reward_std": 0.019716879352927208, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 706.46875, "completions/mean_terminated_length": 673.6206665039062, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.8395061728395061, "grad_norm": 2.212537028222659, "kl": 0.1746826171875, "learning_rate": 4.757663491812644e-07, "loss": -0.0554, "num_tokens": 6784996.0, "reward": 0.03750000149011612, "reward_std": 0.046650636941194534, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 665.03125, "completions/mean_terminated_length": 613.75, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.8425925925925926, "grad_norm": 1.464750024079652, "kl": 0.1788330078125, "learning_rate": 4.755511742142652e-07, "loss": 0.0147, "num_tokens": 6812541.0, "reward": 0.01875000074505806, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 713.21875, "completions/mean_terminated_length": 668.8214721679688, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.845679012345679, "grad_norm": 1.6818283675716525, "kl": 0.1953125, "learning_rate": 4.753350972767883e-07, "loss": 0.0092, "num_tokens": 6841740.0, "reward": 0.02530525252223015, "reward_std": 0.03476113826036453, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0003391671634744853, "rewards/logprob_reward/std": 0.001918619149364531, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 691.1875, "completions/mean_terminated_length": 629.5555419921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8487654320987654, "grad_norm": 1.902781376698886, "kl": 0.1510009765625, "learning_rate": 4.75118119232917e-07, "loss": -0.0528, "num_tokens": 6870134.0, "reward": 0.02187499962747097, "reward_std": 0.03125, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 648.0, "completions/mean_terminated_length": 609.1034545898438, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8518518518518519, "grad_norm": 1.8544806027546288, "kl": 0.1793212890625, "learning_rate": 4.749002409503382e-07, "loss": -0.0605, "num_tokens": 6897170.0, "reward": 0.02812499925494194, "reward_std": 0.04040063917636871, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 731.15625, "completions/mean_terminated_length": 649.1599731445312, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8549382716049383, "grad_norm": 2.106745038986706, "kl": 0.1778564453125, "learning_rate": 4.7468146330033874e-07, "loss": -0.0921, "num_tokens": 6927391.0, "reward": 0.02812499925494194, "reward_std": 0.04568375647068024, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 734.375, "completions/mean_terminated_length": 653.2799682617188, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8580246913580247, "grad_norm": 1.9572245973206377, "kl": 0.2008056640625, "learning_rate": 4.7446178715780213e-07, "loss": -0.0408, "num_tokens": 6957539.0, "reward": 0.03125, "reward_std": 0.04136751592159271, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 760.53125, "completions/mean_terminated_length": 699.7307739257812, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.8611111111111112, "grad_norm": 1.6558992288536742, "kl": 0.17498779296875, "learning_rate": 4.742412134012047e-07, "loss": -0.0159, "num_tokens": 6988232.0, "reward": 0.01875000074505806, "reward_std": 0.03750000149011612, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 693.875, "completions/mean_terminated_length": 671.86669921875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.8641975308641975, "grad_norm": 1.6715855208905281, "kl": 0.1986083984375, "learning_rate": 4.740197429126125e-07, "loss": -0.0083, "num_tokens": 7016524.0, "reward": 0.03750000149011612, "reward_std": 0.039433758705854416, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 769.09375, "completions/mean_terminated_length": 697.719970703125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8672839506172839, "grad_norm": 1.015516224027939, "kl": 0.183349609375, "learning_rate": 4.7379737657767745e-07, "loss": -0.0242, "num_tokens": 7048051.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 743.5625, "completions/mean_terminated_length": 703.5000610351562, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8703703703703703, "grad_norm": 1.6338389583240374, "kl": 0.18310546875, "learning_rate": 4.7357411528563393e-07, "loss": -0.0296, "num_tokens": 7078457.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 817.75, "completions/mean_terminated_length": 737.0435180664062, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.8734567901234568, "grad_norm": 1.3637746511034814, "kl": 0.172607421875, "learning_rate": 4.733499599292955e-07, "loss": -0.0083, "num_tokens": 7111677.0, "reward": 0.01875000074505806, "reward_std": 0.026933757588267326, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 738.09375, "completions/mean_terminated_length": 685.1481323242188, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.8765432098765432, "grad_norm": 1.4644497167448591, "kl": 0.190185546875, "learning_rate": 4.7312491140505064e-07, "loss": 0.001, "num_tokens": 7142244.0, "reward": 0.012586712837219238, "reward_std": 0.019661229103803635, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 9.634702291805297e-05, "rewards/logprob_reward/std": 0.0005450210883282125, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 746.3125, "completions/mean_terminated_length": 706.6428833007812, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.8796296296296297, "grad_norm": 0.9598329914024589, "kl": 0.1812744140625, "learning_rate": 4.7289897061285965e-07, "loss": -0.0125, "num_tokens": 7172806.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 751.53125, "completions/mean_terminated_length": 712.607177734375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.8827160493827161, "grad_norm": 1.6261381639015626, "kl": 0.1966552734375, "learning_rate": 4.726721384562513e-07, "loss": 0.0051, "num_tokens": 7203255.0, "reward": 0.01579144224524498, "reward_std": 0.03158288449048996, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0001849345862865448, "rewards/logprob_reward/std": 0.0008219811716116965, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 722.5, "completions/mean_terminated_length": 702.4000244140625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8858024691358025, "grad_norm": 3.3427490662652533, "kl": 2.1798095703125, "learning_rate": 4.724444158423185e-07, "loss": -0.1087, "num_tokens": 7233075.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 727.46875, "completions/mean_terminated_length": 644.4400024414062, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8888888888888888, "grad_norm": 1.5374078937953224, "kl": 0.19140625, "learning_rate": 4.722158036817154e-07, "loss": 0.017, "num_tokens": 7262850.0, "reward": 0.01875000074505806, "reward_std": 0.0322168804705143, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 751.375, "completions/mean_terminated_length": 712.4285888671875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.8919753086419753, "grad_norm": 1.1899686771082865, "kl": 0.202392578125, "learning_rate": 4.7198630288865304e-07, "loss": -0.0259, "num_tokens": 7293198.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 716.03125, "completions/mean_terminated_length": 684.1724243164062, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.8950617283950617, "grad_norm": 1.6683712729905364, "kl": 0.2373046875, "learning_rate": 4.7175591438089646e-07, "loss": -0.0445, "num_tokens": 7322647.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 737.78125, "completions/mean_terminated_length": 657.6400146484375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.8981481481481481, "grad_norm": 1.3376678952734042, "kl": 0.1954345703125, "learning_rate": 4.7152463907976024e-07, "loss": -0.0169, "num_tokens": 7352640.0, "reward": 0.01875000074505806, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 726.03125, "completions/mean_terminated_length": 657.2692260742188, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9012345679012346, "grad_norm": 1.5295454118951755, "kl": 0.1878662109375, "learning_rate": 4.7129247791010563e-07, "loss": 0.0067, "num_tokens": 7382261.0, "reward": 0.02812500111758709, "reward_std": 0.040400635451078415, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 786.46875, "completions/mean_terminated_length": 719.9599609375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.904320987654321, "grad_norm": 1.1736265111480144, "kl": 0.2108154296875, "learning_rate": 4.710594318003361e-07, "loss": 0.0235, "num_tokens": 7414016.0, "reward": 0.012500000186264515, "reward_std": 0.019716879352927208, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 749.21875, "completions/mean_terminated_length": 672.2799682617188, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9074074074074074, "grad_norm": 0.6293748884007017, "kl": 0.2100830078125, "learning_rate": 4.7082550168239423e-07, "loss": 0.0103, "num_tokens": 7444459.0, "reward": 0.00937500037252903, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 767.4375, "completions/mean_terminated_length": 708.2307739257812, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.9104938271604939, "grad_norm": 1.3906735964347727, "kl": 0.2081298828125, "learning_rate": 4.705906884917573e-07, "loss": -0.0229, "num_tokens": 7475525.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 860.03125, "completions/mean_terminated_length": 774.1428833007812, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.9135802469135802, "grad_norm": 1.1260611752928005, "kl": 0.18505859375, "learning_rate": 4.703549931674345e-07, "loss": 0.0285, "num_tokens": 7509594.0, "reward": 0.012500000186264515, "reward_std": 0.019716879352927208, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 741.84375, "completions/mean_terminated_length": 689.5925903320312, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.9166666666666666, "grad_norm": 1.0722793437822726, "kl": 0.2017822265625, "learning_rate": 4.7011841665196227e-07, "loss": 0.0239, "num_tokens": 7539833.0, "reward": 0.015625, "reward_std": 0.020683757960796356, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 696.9375, "completions/mean_terminated_length": 663.1034545898438, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9197530864197531, "grad_norm": 1.7028779933369635, "kl": 0.215087890625, "learning_rate": 4.6988095989140096e-07, "loss": -0.0488, "num_tokens": 7568163.0, "reward": 0.009437083266675472, "reward_std": 0.018874166533350945, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 6.898110586917028e-05, "rewards/logprob_reward/std": 0.0003902160678990185, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 840.46875, "completions/mean_terminated_length": 757.0454711914062, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.9228395061728395, "grad_norm": 0.7766438983512843, "kl": 0.1976318359375, "learning_rate": 4.6964262383533114e-07, "loss": -0.0189, "num_tokens": 7601986.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 706.34375, "completions/mean_terminated_length": 673.4827270507812, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9259259259259259, "grad_norm": 0.9381400098964588, "kl": 0.198974609375, "learning_rate": 4.694034094368495e-07, "loss": 0.0088, "num_tokens": 7630801.0, "reward": 0.012500000186264515, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 830.59375, "completions/mean_terminated_length": 766.125, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.9290123456790124, "grad_norm": 1.001710780823997, "kl": 0.2493896484375, "learning_rate": 4.691633176525651e-07, "loss": 0.0101, "num_tokens": 7663768.0, "reward": 0.00937500037252903, "reward_std": 0.013466878794133663, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 727.0625, "completions/mean_terminated_length": 643.9199829101562, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.9320987654320988, "grad_norm": 1.1882821653317572, "kl": 0.217529296875, "learning_rate": 4.689223494425959e-07, "loss": 0.0133, "num_tokens": 7693190.0, "reward": 0.006325289607048035, "reward_std": 0.01265057921409607, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 8.365471148863435e-05, "rewards/logprob_reward/std": 0.0004732225206680596, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 698.78125, "completions/mean_terminated_length": 623.7307739257812, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9351851851851852, "grad_norm": 1.0455868863314848, "kl": 0.2236328125, "learning_rate": 4.686805057705645e-07, "loss": 0.036, "num_tokens": 7722127.0, "reward": 0.009409812279045582, "reward_std": 0.013536503538489342, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 3.868029307341203e-05, "rewards/logprob_reward/std": 0.00021880878193769604, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 799.09375, "completions/mean_terminated_length": 711.0869750976562, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9382716049382716, "grad_norm": 0.8643090576442451, "kl": 0.2166748046875, "learning_rate": 4.684377876035944e-07, "loss": -0.0242, "num_tokens": 7753850.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 720.21875, "completions/mean_terminated_length": 676.8214721679688, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.941358024691358, "grad_norm": 1.4831676362735404, "kl": 0.189453125, "learning_rate": 4.681941959123063e-07, "loss": -0.0018, "num_tokens": 7782765.0, "reward": 0.00959782488644123, "reward_std": 0.01919564977288246, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0002475832006894052, "rewards/logprob_reward/std": 0.0010625235736370087, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 719.40625, "completions/mean_terminated_length": 675.8928833007812, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9444444444444444, "grad_norm": 1.716708272903363, "kl": 0.2091064453125, "learning_rate": 4.6794973167081397e-07, "loss": -0.0398, "num_tokens": 7812450.0, "reward": 0.012772508896887302, "reward_std": 0.025545017793774605, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.00030278731719590724, "rewards/logprob_reward/std": 0.0013578330399468541, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 702.59375, "completions/mean_terminated_length": 595.4583740234375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9475308641975309, "grad_norm": 0.009440908633934682, "kl": 0.2236328125, "learning_rate": 4.6770439585672046e-07, "loss": 0.0002, "num_tokens": 7841141.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 731.28125, "completions/mean_terminated_length": 701.0, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.9506172839506173, "grad_norm": 0.8410116745335985, "kl": 0.20556640625, "learning_rate": 4.6745818945111426e-07, "loss": -0.0108, "num_tokens": 7870938.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 716.75, "completions/mean_terminated_length": 645.84619140625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.9537037037037037, "grad_norm": 0.8943559824894225, "kl": 0.2095947265625, "learning_rate": 4.6721111343856547e-07, "loss": 0.0214, "num_tokens": 7900022.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 732.53125, "completions/mean_terminated_length": 665.2692260742188, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9567901234567902, "grad_norm": 1.1298479953842961, "kl": 0.212646484375, "learning_rate": 4.669631688071214e-07, "loss": -0.017, "num_tokens": 7930307.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 742.59375, "completions/mean_terminated_length": 723.8333740234375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.9598765432098766, "grad_norm": 1.3189588503979779, "kl": 0.20068359375, "learning_rate": 4.667143565483032e-07, "loss": 0.0114, "num_tokens": 7960646.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 645.40625, "completions/mean_terminated_length": 633.1935424804688, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9629629629629629, "grad_norm": 1.6241557360798045, "kl": 0.251953125, "learning_rate": 4.664646776571015e-07, "loss": -0.0593, "num_tokens": 7987431.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 680.8125, "completions/mean_terminated_length": 617.25927734375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9660493827160493, "grad_norm": 1.6394861513909418, "kl": 0.2274169921875, "learning_rate": 4.662141331319726e-07, "loss": -0.0584, "num_tokens": 8015461.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 770.0, "completions/mean_terminated_length": 670.6087036132812, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9691358024691358, "grad_norm": 6.867913802332101, "kl": 2.4453125, "learning_rate": 4.6596272397483445e-07, "loss": -0.0283, "num_tokens": 8047569.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 722.71875, "completions/mean_terminated_length": 666.9259033203125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.9722222222222222, "grad_norm": 0.8842193813197436, "kl": 0.222412109375, "learning_rate": 4.657104511910626e-07, "loss": 0.0198, "num_tokens": 8076892.0, "reward": 0.00937500037252903, "reward_std": 0.013466878794133663, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 792.96875, "completions/mean_terminated_length": 750.1851806640625, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.9753086419753086, "grad_norm": 0.8702597020094852, "kl": 0.2034912109375, "learning_rate": 4.654573157894861e-07, "loss": 0.071, "num_tokens": 8108903.0, "reward": 0.0034241636749356985, "reward_std": 0.006848327349871397, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00033240404445677996, "rewards/logprob_reward/std": 0.0013712375657632947, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 730.3125, "completions/mean_terminated_length": 615.3912963867188, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9783950617283951, "grad_norm": 1.4219115095707027, "kl": 0.2218017578125, "learning_rate": 4.652033187823838e-07, "loss": -0.006, "num_tokens": 8138441.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 802.75, "completions/mean_terminated_length": 740.7999877929688, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.9814814814814815, "grad_norm": 0.8242556230145498, "kl": 0.237060546875, "learning_rate": 4.6494846118548e-07, "loss": 0.0014, "num_tokens": 8171069.0, "reward": 0.012500000186264515, "reward_std": 0.014433757402002811, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 798.9375, "completions/mean_terminated_length": 696.6364135742188, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9845679012345679, "grad_norm": 0.8312399161974559, "kl": 0.2086181640625, "learning_rate": 4.6469274401794044e-07, "loss": -0.0002, "num_tokens": 8203523.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 758.0625, "completions/mean_terminated_length": 696.6923217773438, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.9876543209876543, "grad_norm": 1.2649856030150175, "kl": 0.210205078125, "learning_rate": 4.6443616830236823e-07, "loss": -0.0077, "num_tokens": 8234337.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 743.875, "completions/mean_terminated_length": 665.4400024414062, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.9907407407407407, "grad_norm": 0.9759895177682247, "kl": 0.20849609375, "learning_rate": 4.641787350647997e-07, "loss": -0.0152, "num_tokens": 8264817.0, "reward": 0.006368691101670265, "reward_std": 0.01242492999881506, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.000131878987303935, "rewards/logprob_reward/std": 0.0007460201741196215, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 747.9375, "completions/mean_terminated_length": 670.6400146484375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.9938271604938271, "grad_norm": 0.4973960755968238, "kl": 0.2061767578125, "learning_rate": 4.6392044533470053e-07, "loss": 0.0084, "num_tokens": 8294963.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 750.0625, "completions/mean_terminated_length": 710.9285888671875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9969135802469136, "grad_norm": 1.3680398351724956, "kl": 0.2352294921875, "learning_rate": 4.636613001449615e-07, "loss": -0.1376, "num_tokens": 8325681.0, "reward": 0.021934330463409424, "reward_std": 0.025899026542901993, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 6.592279532924294e-05, "rewards/logprob_reward/std": 0.00037291564512997866, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 763.1875, "completions/mean_terminated_length": 676.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 1.0, "grad_norm": 0.9804013605757141, "kl": 0.209716796875, "learning_rate": 4.6340130053189417e-07, "loss": -0.0133, "num_tokens": 8356563.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 751.875, "completions/mean_terminated_length": 713.0000610351562, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 1.0030864197530864, "grad_norm": 0.7121054896570292, "kl": 0.2164306640625, "learning_rate": 4.6314044753522703e-07, "loss": -0.0285, "num_tokens": 8387331.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 807.09375, "completions/mean_terminated_length": 722.2174072265625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 1.0061728395061729, "grad_norm": 0.7481577546494053, "kl": 0.2098388671875, "learning_rate": 4.6287874219810117e-07, "loss": -0.0078, "num_tokens": 8419610.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 678.46875, "completions/mean_terminated_length": 642.72412109375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 1.0092592592592593, "grad_norm": 0.6604872270971704, "kl": 0.24267578125, "learning_rate": 4.626161855670663e-07, "loss": -0.0227, "num_tokens": 8447693.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 718.8125, "completions/mean_terminated_length": 633.3599853515625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 1.0123456790123457, "grad_norm": 1.6790013679876383, "kl": 0.223876953125, "learning_rate": 4.623527786920761e-07, "loss": -0.0138, "num_tokens": 8477111.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 715.75, "completions/mean_terminated_length": 671.7142944335938, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.0154320987654322, "grad_norm": 1.6554652829660685, "kl": 0.244140625, "learning_rate": 4.620885226264847e-07, "loss": -0.092, "num_tokens": 8506231.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 699.78125, "completions/mean_terminated_length": 666.2413940429688, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.0185185185185186, "grad_norm": 1.361173488498756, "kl": 0.2412109375, "learning_rate": 4.6182341842704177e-07, "loss": -0.057, "num_tokens": 8534736.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 777.8125, "completions/mean_terminated_length": 681.478271484375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 1.021604938271605, "grad_norm": 0.009587588033230739, "kl": 0.22265625, "learning_rate": 4.6155746715388903e-07, "loss": 0.0002, "num_tokens": 8565954.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 690.65625, "completions/mean_terminated_length": 628.9259033203125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 1.0246913580246915, "grad_norm": 0.6248176904592802, "kl": 0.2423095703125, "learning_rate": 4.6129066987055533e-07, "loss": 0.0088, "num_tokens": 8594007.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 763.875, "completions/mean_terminated_length": 677.1666870117188, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 1.0277777777777777, "grad_norm": 1.2574969659485429, "kl": 0.2169189453125, "learning_rate": 4.610230276439526e-07, "loss": -0.0467, "num_tokens": 8625231.0, "reward": 0.006498053669929504, "reward_std": 0.012996107339859009, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0002756151370704174, "rewards/logprob_reward/std": 0.0015591145493090153, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 694.71875, "completions/mean_terminated_length": 660.6551513671875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 1.0308641975308641, "grad_norm": 1.4501169847195978, "kl": 0.2662353515625, "learning_rate": 4.607545415443721e-07, "loss": -0.0524, "num_tokens": 8653974.0, "reward": 0.01875000074505806, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 815.125, "completions/mean_terminated_length": 756.6399536132812, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 1.0339506172839505, "grad_norm": 0.8971923851075063, "kl": 0.248779296875, "learning_rate": 4.604852126454792e-07, "loss": 0.0044, "num_tokens": 8686338.0, "reward": 0.00937500037252903, "reward_std": 0.013466878794133663, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 829.375, "completions/mean_terminated_length": 712.6000366210938, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 1.037037037037037, "grad_norm": 0.009864169482841535, "kl": 0.251708984375, "learning_rate": 4.6021504202430983e-07, "loss": 0.0003, "num_tokens": 8719574.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 788.90625, "completions/mean_terminated_length": 665.7619018554688, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 1.0401234567901234, "grad_norm": 0.8852924568708636, "kl": 0.24755859375, "learning_rate": 4.599440307612661e-07, "loss": -0.0153, "num_tokens": 8751939.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 695.9375, "completions/mean_terminated_length": 649.0714721679688, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 1.0432098765432098, "grad_norm": 0.6502115458136378, "kl": 0.249755859375, "learning_rate": 4.5967217994011144e-07, "loss": 0.0045, "num_tokens": 8780457.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 776.375, "completions/mean_terminated_length": 719.2307739257812, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.0462962962962963, "grad_norm": 0.8923014443797044, "kl": 0.2325439453125, "learning_rate": 4.593994906479669e-07, "loss": 0.005, "num_tokens": 8811485.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 819.78125, "completions/mean_terminated_length": 739.8695678710938, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.0493827160493827, "grad_norm": 0.8974915320488661, "kl": 0.2322998046875, "learning_rate": 4.591259639753066e-07, "loss": 0.0091, "num_tokens": 8844558.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 710.8965454101562, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.0524691358024691, "grad_norm": 0.02576346005578449, "kl": 0.273681640625, "learning_rate": 4.588516010159529e-07, "loss": 0.0003, "num_tokens": 8874546.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 818.25, "completions/mean_terminated_length": 677.4736938476562, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 1.0555555555555556, "grad_norm": 0.9750758651930964, "kl": 0.23486328125, "learning_rate": 4.58576402867073e-07, "loss": 0.0302, "num_tokens": 8907694.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 800.0625, "completions/mean_terminated_length": 725.4166870117188, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.058641975308642, "grad_norm": 1.0796010864414354, "kl": 0.2349853515625, "learning_rate": 4.5830037062917373e-07, "loss": -0.0512, "num_tokens": 8939304.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 896.59375, "completions/mean_terminated_length": 769.1875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.0617283950617284, "grad_norm": 0.007417990712236422, "kl": 0.2103271484375, "learning_rate": 4.580235054060971e-07, "loss": 0.0002, "num_tokens": 8974923.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 729.84375, "completions/mean_terminated_length": 661.9615478515625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 1.0648148148148149, "grad_norm": 0.8842294127676017, "kl": 0.2591552734375, "learning_rate": 4.5774580830501685e-07, "loss": 0.0112, "num_tokens": 9004490.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 842.40625, "completions/mean_terminated_length": 718.1578979492188, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 1.0679012345679013, "grad_norm": 1.2648262971659887, "kl": 0.2259521484375, "learning_rate": 4.574672804364329e-07, "loss": -0.0847, "num_tokens": 9038055.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 881.84375, "completions/mean_terminated_length": 771.2777709960938, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.0709876543209877, "grad_norm": 0.5589430325783781, "kl": 0.2591552734375, "learning_rate": 4.571879229141674e-07, "loss": 0.0211, "num_tokens": 9073238.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 790.4375, "completions/mean_terminated_length": 725.0399780273438, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.074074074074074, "grad_norm": 0.9491041643011326, "kl": 0.220458984375, "learning_rate": 4.5690773685536037e-07, "loss": 0.0157, "num_tokens": 9104660.0, "reward": 0.0032900888472795486, "reward_std": 0.006580177694559097, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00018343192641623318, "rewards/logprob_reward/std": 0.0010376477148383856, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 811.25, "completions/mean_terminated_length": 683.6000366210938, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.0771604938271604, "grad_norm": 0.007563171515114929, "kl": 0.228515625, "learning_rate": 4.5662672338046513e-07, "loss": 0.0002, "num_tokens": 9137156.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 861.15625, "completions/mean_terminated_length": 763.4500122070312, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 1.0802469135802468, "grad_norm": 0.006928700848866776, "kl": 0.235107421875, "learning_rate": 4.5634488361324386e-07, "loss": 0.0002, "num_tokens": 9171669.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 879.5625, "completions/mean_terminated_length": 767.2222290039062, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 1.0833333333333333, "grad_norm": 0.8258267922504359, "kl": 0.2430419921875, "learning_rate": 4.560622186807628e-07, "loss": -0.0026, "num_tokens": 9206567.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 863.59375, "completions/mean_terminated_length": 790.6818237304688, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 1.0864197530864197, "grad_norm": 0.6010216095763495, "kl": 0.2197265625, "learning_rate": 4.5577872971338826e-07, "loss": 0.0034, "num_tokens": 9240642.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 784.625, "completions/mean_terminated_length": 729.3846435546875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 1.0895061728395061, "grad_norm": 0.735612934087123, "kl": 0.2567138671875, "learning_rate": 4.554944178447816e-07, "loss": -0.0314, "num_tokens": 9271802.0, "reward": 0.0031897961162030697, "reward_std": 0.006208005361258984, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.199552055681124e-05, "rewards/logprob_reward/std": 0.00040726817678660154, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 866.3125, "completions/mean_terminated_length": 708.625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 1.0925925925925926, "grad_norm": 0.527418541642273, "kl": 0.20654296875, "learning_rate": 4.552092842118952e-07, "loss": 0.0045, "num_tokens": 9306448.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 713.875, "completions/mean_terminated_length": 656.4444580078125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 1.095679012345679, "grad_norm": 0.00802996047463695, "kl": 0.24072265625, "learning_rate": 4.549233299549674e-07, "loss": 0.0002, "num_tokens": 9335544.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 785.625, "completions/mean_terminated_length": 718.8800048828125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.0987654320987654, "grad_norm": 0.5839103722605828, "kl": 0.2532958984375, "learning_rate": 4.546365562175184e-07, "loss": -0.0046, "num_tokens": 9367112.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 800.8125, "completions/mean_terminated_length": 648.1052856445312, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 1.1018518518518519, "grad_norm": 0.007467171241563814, "kl": 0.2427978515625, "learning_rate": 4.543489641463452e-07, "loss": 0.0002, "num_tokens": 9399534.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 889.25, "completions/mean_terminated_length": 770.3529663085938, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 1.1049382716049383, "grad_norm": 0.588361555509795, "kl": 0.2142333984375, "learning_rate": 4.540605548915175e-07, "loss": 0.0023, "num_tokens": 9434402.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 780.40625, "completions/mean_terminated_length": 735.2963256835938, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 1.1080246913580247, "grad_norm": 1.3784756902348092, "kl": 0.251708984375, "learning_rate": 4.537713296063729e-07, "loss": -0.091, "num_tokens": 9465811.0, "reward": 0.0033699856139719486, "reward_std": 0.006739971227943897, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00027220614720135927, "rewards/logprob_reward/std": 0.0015398304676637053, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 783.03125, "completions/mean_terminated_length": 727.423095703125, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.1111111111111112, "grad_norm": 0.9365005434702497, "kl": 0.25146484375, "learning_rate": 4.534812894475122e-07, "loss": -0.0112, "num_tokens": 9496884.0, "reward": 0.015625, "reward_std": 0.013466878794133663, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 841.53125, "completions/mean_terminated_length": 745.952392578125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.1141975308641976, "grad_norm": 0.7835103765441798, "kl": 0.2340087890625, "learning_rate": 4.5319043557479474e-07, "loss": -0.0229, "num_tokens": 9530005.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 739.21875, "completions/mean_terminated_length": 659.47998046875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 1.117283950617284, "grad_norm": 1.149886821199454, "kl": 0.26953125, "learning_rate": 4.5289876915133394e-07, "loss": -0.0284, "num_tokens": 9559552.0, "reward": 0.015625, "reward_std": 0.020683757960796356, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 824.5625, "completions/mean_terminated_length": 669.4444580078125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 1.1203703703703705, "grad_norm": 0.019166753684404333, "kl": 0.2608642578125, "learning_rate": 4.5260629134349284e-07, "loss": 0.0003, "num_tokens": 9592594.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 815.40625, "completions/mean_terminated_length": 767.269287109375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 1.123456790123457, "grad_norm": 0.8302941800160129, "kl": 0.2374267578125, "learning_rate": 4.523130033208788e-07, "loss": -0.0012, "num_tokens": 9624795.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 914.15625, "completions/mean_terminated_length": 804.3125, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 1.126543209876543, "grad_norm": 0.007455741944843248, "kl": 0.2080078125, "learning_rate": 4.520189062563393e-07, "loss": 0.0002, "num_tokens": 9661116.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 842.84375, "completions/mean_terminated_length": 771.95654296875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 1.1296296296296295, "grad_norm": 0.007953891342227685, "kl": 0.2535400390625, "learning_rate": 4.5172400132595737e-07, "loss": 0.0003, "num_tokens": 9694599.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 877.875, "completions/mean_terminated_length": 790.2000122070312, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 1.132716049382716, "grad_norm": 1.1755357599564493, "kl": 0.2427978515625, "learning_rate": 4.514282897090464e-07, "loss": -0.0241, "num_tokens": 9729527.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 831.53125, "completions/mean_terminated_length": 756.2174072265625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 1.1358024691358024, "grad_norm": 0.4794331986896714, "kl": 0.2257080078125, "learning_rate": 4.511317725881457e-07, "loss": 0.0135, "num_tokens": 9763080.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 892.8125, "completions/mean_terminated_length": 724.1428833007812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 1.1388888888888888, "grad_norm": 0.9429283589487057, "kl": 0.222412109375, "learning_rate": 4.50834451149016e-07, "loss": 0.0209, "num_tokens": 9798514.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 883.53125, "completions/mean_terminated_length": 774.2777709960938, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 1.1419753086419753, "grad_norm": 0.015974949220601272, "kl": 0.2415771484375, "learning_rate": 4.505363265806342e-07, "loss": 0.0002, "num_tokens": 9833203.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 819.5, "completions/mean_terminated_length": 726.5454711914062, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 1.1450617283950617, "grad_norm": 0.00832902699087954, "kl": 0.237548828125, "learning_rate": 4.502374000751891e-07, "loss": 0.0002, "num_tokens": 9866067.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 797.34375, "completions/mean_terminated_length": 621.0555419921875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 1.1481481481481481, "grad_norm": 0.007743588945211472, "kl": 0.2412109375, "learning_rate": 4.49937672828076e-07, "loss": 0.0002, "num_tokens": 9898042.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 785.53125, "completions/mean_terminated_length": 718.760009765625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.1512345679012346, "grad_norm": 0.8878095648927022, "kl": 0.239013671875, "learning_rate": 4.4963714603789315e-07, "loss": -0.0012, "num_tokens": 9929267.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 847.78125, "completions/mean_terminated_length": 789.0416870117188, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 1.154320987654321, "grad_norm": 0.5494838061699857, "kl": 0.2425537109375, "learning_rate": 4.4933582090643516e-07, "loss": 0.0056, "num_tokens": 9963272.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 866.5, "completions/mean_terminated_length": 727.5294189453125, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 1.1574074074074074, "grad_norm": 0.8639857177081212, "kl": 0.2451171875, "learning_rate": 4.4903369863869e-07, "loss": 0.0069, "num_tokens": 9997876.0, "reward": 0.003503691405057907, "reward_std": 0.007007382810115814, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0004207683086860925, "rewards/logprob_reward/std": 0.0023802248761057854, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 837.5625, "completions/mean_terminated_length": 739.90478515625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 1.1604938271604939, "grad_norm": 0.5189699901753596, "kl": 0.2509765625, "learning_rate": 4.4873078044283273e-07, "loss": 0.0215, "num_tokens": 10030938.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 769.4375, "completions/mean_terminated_length": 669.8261108398438, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 1.1635802469135803, "grad_norm": 0.6206949715555347, "kl": 0.249267578125, "learning_rate": 4.484270675302218e-07, "loss": -0.0048, "num_tokens": 10061652.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 903.375, "completions/mean_terminated_length": 796.941162109375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.1666666666666667, "grad_norm": 0.5131021280005653, "kl": 0.20849609375, "learning_rate": 4.481225611153933e-07, "loss": 0.0262, "num_tokens": 10097412.0, "reward": 0.00010621514957165346, "reward_std": 0.00021243031369522214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00011801683285739273, "rewards/logprob_reward/std": 0.0006676040356978774, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 814.75, "completions/mean_terminated_length": 745.0, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.1697530864197532, "grad_norm": 0.6250156569708267, "kl": 0.24755859375, "learning_rate": 4.4781726241605683e-07, "loss": -0.0184, "num_tokens": 10129588.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 798.03125, "completions/mean_terminated_length": 662.4500122070312, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.1728395061728394, "grad_norm": 1.4315230781531085, "kl": 0.2479248046875, "learning_rate": 4.4751117265309e-07, "loss": -0.0385, "num_tokens": 10162005.0, "reward": 0.006622622720897198, "reward_std": 0.013245245441794395, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00041402498027309775, "rewards/logprob_reward/std": 0.001968799391761422, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 760.03125, "completions/mean_terminated_length": 711.1481323242188, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 1.175925925925926, "grad_norm": 0.00883932204938891, "kl": 0.2576904296875, "learning_rate": 4.472042930505342e-07, "loss": 0.0003, "num_tokens": 10192274.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 839.6875, "completions/mean_terminated_length": 755.9091186523438, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.1790123456790123, "grad_norm": 0.03892469368776792, "kl": 0.2503662109375, "learning_rate": 4.46896624835589e-07, "loss": 0.0003, "num_tokens": 10225556.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 774.84375, "completions/mean_terminated_length": 728.7037353515625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.1820987654320987, "grad_norm": 0.010711275418245406, "kl": 0.266357421875, "learning_rate": 4.465881692386078e-07, "loss": 0.0003, "num_tokens": 10256947.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 793.09375, "completions/mean_terminated_length": 728.4400024414062, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 1.1851851851851851, "grad_norm": 0.761983580989383, "kl": 0.2490234375, "learning_rate": 4.4627892749309273e-07, "loss": 0.0089, "num_tokens": 10288978.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 733.65625, "completions/mean_terminated_length": 679.888916015625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 1.1882716049382716, "grad_norm": 1.1495846680178599, "kl": 0.2626953125, "learning_rate": 4.459689008356896e-07, "loss": -0.0152, "num_tokens": 10318591.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 783.28125, "completions/mean_terminated_length": 703.0416870117188, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.191358024691358, "grad_norm": 0.8158747930461432, "kl": 0.2667236328125, "learning_rate": 4.4565809050618317e-07, "loss": -0.0165, "num_tokens": 10349916.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 832.65625, "completions/mean_terminated_length": 788.5, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 1.1944444444444444, "grad_norm": 0.552951708589812, "kl": 0.237060546875, "learning_rate": 4.45346497747492e-07, "loss": 0.0138, "num_tokens": 10382705.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 725.34375, "completions/mean_terminated_length": 705.433349609375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.1975308641975309, "grad_norm": 0.9796287105685971, "kl": 0.25048828125, "learning_rate": 4.450341238056634e-07, "loss": -0.0002, "num_tokens": 10412128.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 821.1875, "completions/mean_terminated_length": 682.4210815429688, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 1.2006172839506173, "grad_norm": 0.006439249094962117, "kl": 0.23486328125, "learning_rate": 4.4472096992986895e-07, "loss": 0.0002, "num_tokens": 10444734.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 849.09375, "completions/mean_terminated_length": 769.5909423828125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 1.2037037037037037, "grad_norm": 0.008083405405338903, "kl": 0.245361328125, "learning_rate": 4.444070373723989e-07, "loss": 0.0002, "num_tokens": 10478673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 817.65625, "completions/mean_terminated_length": 736.9130859375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.2067901234567902, "grad_norm": 0.3493451746942101, "kl": 0.23046875, "learning_rate": 4.4409232738865744e-07, "loss": 0.0107, "num_tokens": 10511194.0, "reward": 1.1146000360895414e-05, "reward_std": 2.2292000721790828e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 1.2384445653879084e-05, "rewards/logprob_reward/std": 7.005700172157958e-05, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 843.40625, "completions/mean_terminated_length": 702.9444580078125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 1.2098765432098766, "grad_norm": 0.008235397741820484, "kl": 0.2247314453125, "learning_rate": 4.4377684123715763e-07, "loss": 0.0002, "num_tokens": 10544707.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 842.4375, "completions/mean_terminated_length": 747.3333740234375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 1.212962962962963, "grad_norm": 0.009912324408120282, "kl": 0.212890625, "learning_rate": 4.434605801795167e-07, "loss": 0.0002, "num_tokens": 10578045.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 866.03125, "completions/mean_terminated_length": 771.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 1.2160493827160495, "grad_norm": 0.5795413703576104, "kl": 0.20947265625, "learning_rate": 4.431435454804503e-07, "loss": 0.0015, "num_tokens": 10612394.0, "reward": 9.790882904781029e-05, "reward_std": 0.00019581765809562057, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00010878759348997846, "rewards/logprob_reward/std": 0.0006153955473564565, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 874.375, "completions/mean_terminated_length": 815.8261108398438, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 1.2191358024691359, "grad_norm": 0.009494320279332596, "kl": 0.241455078125, "learning_rate": 4.42825738407768e-07, "loss": 0.0002, "num_tokens": 10646382.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 815.15625, "completions/mean_terminated_length": 672.26318359375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 1.2222222222222223, "grad_norm": 0.007764736406968568, "kl": 0.2137451171875, "learning_rate": 4.425071602323681e-07, "loss": 0.0002, "num_tokens": 10678903.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 784.84375, "completions/mean_terminated_length": 691.2608642578125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 1.2253086419753085, "grad_norm": 0.006847226776488038, "kl": 0.235595703125, "learning_rate": 4.421878122282325e-07, "loss": 0.0002, "num_tokens": 10710122.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 802.25, "completions/mean_terminated_length": 715.478271484375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 1.228395061728395, "grad_norm": 0.007805958167803609, "kl": 0.228759765625, "learning_rate": 4.4186769567242163e-07, "loss": 0.0002, "num_tokens": 10742558.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 812.375, "completions/mean_terminated_length": 685.4000244140625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.2314814814814814, "grad_norm": 0.6232169916712146, "kl": 0.21337890625, "learning_rate": 4.4154681184506927e-07, "loss": 0.0202, "num_tokens": 10775026.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 876.40625, "completions/mean_terminated_length": 799.0952758789062, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.2345679012345678, "grad_norm": 0.8426294509652541, "kl": 0.2391357421875, "learning_rate": 4.4122516202937745e-07, "loss": 0.0246, "num_tokens": 10810183.0, "reward": 0.003223419887945056, "reward_std": 0.006446839775890112, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00010935530008282512, "rewards/logprob_reward/std": 0.0006186070386320353, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 834.28125, "completions/mean_terminated_length": 771.0416870117188, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.2376543209876543, "grad_norm": 0.007790026200100888, "kl": 0.25146484375, "learning_rate": 4.4090274751161144e-07, "loss": 0.0003, "num_tokens": 10843284.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 866.625, "completions/mean_terminated_length": 744.2222290039062, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 1.2407407407407407, "grad_norm": 0.5148751677044905, "kl": 0.230712890625, "learning_rate": 4.4057956958109453e-07, "loss": 0.0217, "num_tokens": 10877660.0, "reward": 4.3019586883019656e-05, "reward_std": 8.603917376603931e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.7799541789572686e-05, "rewards/logprob_reward/std": 0.0002703950449358672, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 866.15625, "completions/mean_terminated_length": 726.8823852539062, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 1.2438271604938271, "grad_norm": 0.5816434254295519, "kl": 0.220458984375, "learning_rate": 4.402556295302029e-07, "loss": 0.0167, "num_tokens": 10912053.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 776.53125, "completions/mean_terminated_length": 694.0416870117188, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 1.2469135802469136, "grad_norm": 0.6479712483848744, "kl": 0.24169921875, "learning_rate": 4.3993092865436035e-07, "loss": 0.0093, "num_tokens": 10943290.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 783.0, "completions/mean_terminated_length": 702.6666870117188, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 1.25, "grad_norm": 0.7618585239127673, "kl": 0.2161865234375, "learning_rate": 4.3960546825203304e-07, "loss": -0.0298, "num_tokens": 10974822.0, "reward": 0.003186756744980812, "reward_std": 0.006209921091794968, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 6.861852307338268e-05, "rewards/logprob_reward/std": 0.0003881649754475802, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 818.3125, "completions/mean_terminated_length": 724.8181762695312, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.2530864197530864, "grad_norm": 0.007887791782644819, "kl": 0.2255859375, "learning_rate": 4.392792496247248e-07, "loss": 0.0002, "num_tokens": 11008236.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 809.65625, "completions/mean_terminated_length": 738.2083740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 1.2561728395061729, "grad_norm": 0.6310960871369747, "kl": 0.225341796875, "learning_rate": 4.3895227407697135e-07, "loss": 0.0222, "num_tokens": 11040653.0, "reward": 0.0001372906262986362, "reward_std": 0.0002745812525972724, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001525451516499743, "rewards/logprob_reward/std": 0.0008629257208667696, "step": 407 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 792.625, "completions/mean_terminated_length": 739.2307739257812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.2592592592592593, "grad_norm": 0.005567178329840646, "kl": NaN, "learning_rate": 4.3862454291633523e-07, "loss": 0.0002, "num_tokens": 11072561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 831.1875, "completions/mean_terminated_length": 699.26318359375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.2623456790123457, "grad_norm": 137.7828505628255, "kl": 60.7132568359375, "learning_rate": 4.382960574534009e-07, "loss": 0.0994, "num_tokens": 11106103.0, "reward": 0.0034079812467098236, "reward_std": 0.006815962493419647, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00031442352337762713, "rewards/logprob_reward/std": 0.0017786480020731688, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 832.875, "completions/mean_terminated_length": 758.0869750976562, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 1.2654320987654322, "grad_norm": 0.7507446008491451, "kl": 0.230224609375, "learning_rate": 4.3796681900176903e-07, "loss": 0.0227, "num_tokens": 11139267.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 817.65625, "completions/mean_terminated_length": 770.0385131835938, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 1.2685185185185186, "grad_norm": 0.8242287130162602, "kl": 0.2332763671875, "learning_rate": 4.3763682887805153e-07, "loss": 0.001, "num_tokens": 11172004.0, "reward": 0.00010974896576954052, "reward_std": 0.00021949793153908104, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001219432961079292, "rewards/logprob_reward/std": 0.0006898154970258474, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 792.25, "completions/mean_terminated_length": 701.5652465820312, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 1.2716049382716048, "grad_norm": 0.6068112356341268, "kl": 0.233154296875, "learning_rate": 4.3730608840186625e-07, "loss": 0.0106, "num_tokens": 11204188.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 797.125, "completions/mean_terminated_length": 694.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 1.2746913580246915, "grad_norm": 0.8086582079593566, "kl": 0.2498779296875, "learning_rate": 4.3697459889583166e-07, "loss": 0.0192, "num_tokens": 11236384.0, "reward": 0.0001524329709354788, "reward_std": 0.0003048659418709576, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00016936997417360544, "rewards/logprob_reward/std": 0.0008339976775459945, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 819.03125, "completions/mean_terminated_length": 771.7307739257812, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.2777777777777777, "grad_norm": 0.7641636281958442, "kl": 0.2178955078125, "learning_rate": 4.366423616855615e-07, "loss": -0.0009, "num_tokens": 11268933.0, "reward": 0.0031583672389388084, "reward_std": 0.006228073500096798, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 3.707461655722e-05, "rewards/logprob_reward/std": 0.00020972569473087788, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 757.8125, "completions/mean_terminated_length": 719.7857666015625, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.2808641975308643, "grad_norm": 0.9999706539197956, "kl": 0.244384765625, "learning_rate": 4.363093780996596e-07, "loss": 0.0114, "num_tokens": 11299483.0, "reward": 0.006281028036028147, "reward_std": 0.007278934586793184, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 3.447556446189992e-05, "rewards/logprob_reward/std": 0.00019502323993947357, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 798.21875, "completions/mean_terminated_length": 756.4074096679688, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 1.2839506172839505, "grad_norm": 1.0122308107373106, "kl": 0.243896484375, "learning_rate": 4.359756494697146e-07, "loss": 0.01, "num_tokens": 11331722.0, "reward": 0.0033252749126404524, "reward_std": 0.006650549825280905, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002225275820819661, "rewards/logprob_reward/std": 0.0012588060926645994, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 693.25, "completions/mean_terminated_length": 659.0344848632812, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 1.287037037037037, "grad_norm": 0.9867934630640751, "kl": 0.2191162109375, "learning_rate": 4.356411771302944e-07, "loss": -0.0119, "num_tokens": 11360202.0, "reward": 0.009411254897713661, "reward_std": 0.013443084433674812, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 4.028295734315179e-05, "rewards/logprob_reward/std": 0.00022787481429986656, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 820.25, "completions/mean_terminated_length": 782.5185546875, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 1.2901234567901234, "grad_norm": 0.8168515073126328, "kl": 0.2159423828125, "learning_rate": 4.353059624189411e-07, "loss": -0.001, "num_tokens": 11393438.0, "reward": 0.00036042043939232826, "reward_std": 0.0007208408787846565, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00040046716458164155, "rewards/logprob_reward/std": 0.0022653844207525253, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 845.0, "completions/mean_terminated_length": 785.3333740234375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 1.2932098765432098, "grad_norm": 0.9737864152659005, "kl": 0.203369140625, "learning_rate": 4.3497000667616534e-07, "loss": -0.0195, "num_tokens": 11427646.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 828.5, "completions/mean_terminated_length": 763.3333740234375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 1.2962962962962963, "grad_norm": 0.7791225374329636, "kl": 0.19439697265625, "learning_rate": 4.346333112454413e-07, "loss": 0.0309, "num_tokens": 11460646.0, "reward": 0.0001546065614093095, "reward_std": 0.000309213122818619, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00017178506823256612, "rewards/logprob_reward/std": 0.0008412457536906004, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 710.15625, "completions/mean_terminated_length": 689.2333984375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.2993827160493827, "grad_norm": 0.9427759344956138, "kl": 0.2515869140625, "learning_rate": 4.342958774732011e-07, "loss": -0.0213, "num_tokens": 11489603.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 741.4375, "completions/mean_terminated_length": 662.3200073242188, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.3024691358024691, "grad_norm": 0.7831719097110312, "kl": 0.2220458984375, "learning_rate": 4.3395770670882935e-07, "loss": -0.0053, "num_tokens": 11520029.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 844.46875, "completions/mean_terminated_length": 803.0385131835938, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 1.3055555555555556, "grad_norm": 0.006609870276807969, "kl": 0.2305908203125, "learning_rate": 4.3361880030465803e-07, "loss": 0.0002, "num_tokens": 11553660.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 799.6875, "completions/mean_terminated_length": 697.727294921875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.308641975308642, "grad_norm": 1.2247530200330305, "kl": 0.227783203125, "learning_rate": 4.3327915961596066e-07, "loss": -0.1106, "num_tokens": 11585826.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 762.46875, "completions/mean_terminated_length": 714.0370483398438, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 1.3117283950617284, "grad_norm": 0.98173031770165, "kl": NaN, "learning_rate": 4.3293878600094746e-07, "loss": -0.0167, "num_tokens": 11616589.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 782.59375, "completions/mean_terminated_length": 726.8846435546875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 1.3148148148148149, "grad_norm": 1.3587790999170124, "kl": 0.22607421875, "learning_rate": 4.325976808207594e-07, "loss": -0.0105, "num_tokens": 11648068.0, "reward": 0.006669376045465469, "reward_std": 0.01258049812167883, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0004659731639549136, "rewards/logprob_reward/std": 0.001984819769859314, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 750.15625, "completions/mean_terminated_length": 673.47998046875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 1.3179012345679013, "grad_norm": 0.7013206768494553, "kl": 0.218994140625, "learning_rate": 4.3225584543946303e-07, "loss": -0.0138, "num_tokens": 11678481.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 739.1875, "completions/mean_terminated_length": 709.72412109375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 1.3209876543209877, "grad_norm": 0.736204130156878, "kl": 0.2166748046875, "learning_rate": 4.319132812240448e-07, "loss": -0.011, "num_tokens": 11708567.0, "reward": 0.00011441440437920392, "reward_std": 0.00022882880875840783, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001271271175937727, "rewards/logprob_reward/std": 0.0007191395852714777, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 695.0625, "completions/mean_terminated_length": 602.9599609375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 1.324074074074074, "grad_norm": 1.641092297738994, "kl": 0.22650146484375, "learning_rate": 4.3156998954440587e-07, "loss": -0.0958, "num_tokens": 11737389.0, "reward": 0.003597402013838291, "reward_std": 0.0068130940198898315, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0005248911329545081, "rewards/logprob_reward/std": 0.0022221512626856565, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 681.625, "completions/mean_terminated_length": 632.7142944335938, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 1.3271604938271606, "grad_norm": 1.575642809217235, "kl": 0.261474609375, "learning_rate": 4.312259717733565e-07, "loss": -0.1046, "num_tokens": 11765653.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 730.4375, "completions/mean_terminated_length": 632.5833740234375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 1.3302469135802468, "grad_norm": 0.04890130949547042, "kl": 0.226806640625, "learning_rate": 4.308812292866105e-07, "loss": 0.0002, "num_tokens": 11795639.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 691.78125, "completions/mean_terminated_length": 669.6333618164062, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.3333333333333333, "grad_norm": 0.9286398876317612, "kl": 0.2548828125, "learning_rate": 4.3053576346277997e-07, "loss": 0.0227, "num_tokens": 11823924.0, "reward": 0.006325773429125547, "reward_std": 0.007368425372987986, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 8.419268124271184e-05, "rewards/logprob_reward/std": 0.00047626570449210703, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 786.875, "completions/mean_terminated_length": 694.0869750976562, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 1.3364197530864197, "grad_norm": 1.2072542458909037, "kl": 0.2181396484375, "learning_rate": 4.301895756833692e-07, "loss": -0.0002, "num_tokens": 11855800.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 718.5625, "completions/mean_terminated_length": 648.0769653320312, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.3395061728395061, "grad_norm": 1.3281527836782274, "kl": 0.22998046875, "learning_rate": 4.298426673327701e-07, "loss": -0.0395, "num_tokens": 11885422.0, "reward": 0.006339985877275467, "reward_std": 0.012679971754550934, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 9.998405585065484e-05, "rewards/logprob_reward/std": 0.0004191905609332025, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 782.25, "completions/mean_terminated_length": 726.4615478515625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 1.3425925925925926, "grad_norm": 1.4341930099899078, "kl": 0.24560546875, "learning_rate": 4.2949503979825563e-07, "loss": -0.0136, "num_tokens": 11916806.0, "reward": 0.00984628964215517, "reward_std": 0.018601160496473312, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0005236548604443669, "rewards/logprob_reward/std": 0.002705881604924798, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 803.125, "completions/mean_terminated_length": 741.2799682617188, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 1.345679012345679, "grad_norm": 0.8642238947769881, "kl": 0.232421875, "learning_rate": 4.2914669446997504e-07, "loss": -0.0011, "num_tokens": 11948830.0, "reward": 0.006460936740040779, "reward_std": 0.01237230934202671, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0002343738597119227, "rewards/logprob_reward/std": 0.001325818826444447, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 710.875, "completions/mean_terminated_length": 652.888916015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 1.3487654320987654, "grad_norm": 1.0281930220539874, "kl": 0.22412109375, "learning_rate": 4.287976327409478e-07, "loss": 0.0191, "num_tokens": 11977982.0, "reward": 0.006372864358127117, "reward_std": 0.012745728716254234, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00013651592598762363, "rewards/logprob_reward/std": 0.0007722506416030228, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 770.34375, "completions/mean_terminated_length": 711.8077392578125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 1.3518518518518519, "grad_norm": 1.4499981004239422, "kl": 0.2191162109375, "learning_rate": 4.284478560070585e-07, "loss": -0.0696, "num_tokens": 12009161.0, "reward": 0.00633084774017334, "reward_std": 0.01266169548034668, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 8.983034058474004e-05, "rewards/logprob_reward/std": 0.0005081571289338171, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 807.0625, "completions/mean_terminated_length": 693.4285888671875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 1.3549382716049383, "grad_norm": 0.7341836753699728, "kl": 0.2386474609375, "learning_rate": 4.280973656670508e-07, "loss": 0.0018, "num_tokens": 12041515.0, "reward": 0.0033574083354324102, "reward_std": 0.006110795307904482, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002582314482424408, "rewards/logprob_reward/std": 0.0014607777120545506, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 805.5, "completions/mean_terminated_length": 706.1818237304688, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 1.3580246913580247, "grad_norm": 1.2039689263705495, "kl": 0.2305908203125, "learning_rate": 4.277461631225221e-07, "loss": -0.0319, "num_tokens": 12074411.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 709.1875, "completions/mean_terminated_length": 664.2142944335938, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 1.3611111111111112, "grad_norm": 0.9632888697626055, "kl": 0.258544921875, "learning_rate": 4.2739424977791784e-07, "loss": 0.0093, "num_tokens": 12103149.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 702.625, "completions/mean_terminated_length": 669.3793334960938, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 1.3641975308641976, "grad_norm": 1.4351619860940739, "kl": 0.2476806640625, "learning_rate": 4.2704162704052594e-07, "loss": -0.0805, "num_tokens": 12132189.0, "reward": 0.006343253888189793, "reward_std": 0.012686507776379585, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00010361517342971638, "rewards/logprob_reward/std": 0.0005861359531991184, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 756.71875, "completions/mean_terminated_length": 729.0689697265625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.367283950617284, "grad_norm": 1.0343246403227044, "kl": 0.2568359375, "learning_rate": 4.2668829632047124e-07, "loss": -0.0203, "num_tokens": 12162592.0, "reward": 0.003239110577851534, "reward_std": 0.006478221155703068, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0001267895131604746, "rewards/logprob_reward/std": 0.0007172297919169068, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 714.3125, "completions/mean_terminated_length": 656.9629516601562, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 1.3703703703703702, "grad_norm": 0.03445418685545314, "kl": 0.25927734375, "learning_rate": 4.2633425903070973e-07, "loss": 0.0003, "num_tokens": 12191910.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 763.875, "completions/mean_terminated_length": 726.7142944335938, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.373456790123457, "grad_norm": 0.8898686519862932, "kl": 0.2269287109375, "learning_rate": 4.259795165870229e-07, "loss": 0.003, "num_tokens": 12222826.0, "reward": 0.0063004931434988976, "reward_std": 0.007317864801734686, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 5.610326479654759e-05, "rewards/logprob_reward/std": 0.00031736798700876534, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 731.65625, "completions/mean_terminated_length": 677.5184936523438, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 1.376543209876543, "grad_norm": 0.8783443173591432, "kl": 0.218505859375, "learning_rate": 4.256240704080121e-07, "loss": -0.0138, "num_tokens": 12252615.0, "reward": 0.0001754740223987028, "reward_std": 0.0003509480447974056, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00019497114408295602, "rewards/logprob_reward/std": 0.0011029232991859317, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 805.65625, "completions/mean_terminated_length": 706.4091186523438, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.3796296296296298, "grad_norm": 0.8940524822228907, "kl": 0.253173828125, "learning_rate": 4.2526792191509297e-07, "loss": 0.0231, "num_tokens": 12285108.0, "reward": 0.0034149284474551678, "reward_std": 0.0068298568949103355, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00032214270322583616, "rewards/logprob_reward/std": 0.0018223143415525556, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 836.96875, "completions/mean_terminated_length": 784.5999755859375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.382716049382716, "grad_norm": 0.007217653048510719, "kl": 0.229736328125, "learning_rate": 4.249110725324897e-07, "loss": 0.0002, "num_tokens": 12318715.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 808.625, "completions/mean_terminated_length": 758.923095703125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 1.3858024691358024, "grad_norm": 1.0950945919731718, "kl": 0.2406005859375, "learning_rate": 4.2455352368722916e-07, "loss": 0.0005, "num_tokens": 12351235.0, "reward": 0.009825151413679123, "reward_std": 0.018510140478610992, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0005001676036044955, "rewards/logprob_reward/std": 0.002829375211149454, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 786.25, "completions/mean_terminated_length": 731.3846435546875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.3888888888888888, "grad_norm": 3.214026215003321, "kl": 0.47955322265625, "learning_rate": 4.2419527680913554e-07, "loss": -0.1991, "num_tokens": 12383055.0, "reward": 0.006404605228453875, "reward_std": 0.01280921045690775, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00017178348207380623, "rewards/logprob_reward/std": 0.0009717540815472603, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 771.6875, "completions/mean_terminated_length": 713.4615478515625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.3919753086419753, "grad_norm": 0.7936696463586366, "kl": 0.2222900390625, "learning_rate": 4.2383633333082423e-07, "loss": 0.0021, "num_tokens": 12414541.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 715.75, "completions/mean_terminated_length": 671.7142944335938, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 1.3950617283950617, "grad_norm": 3.459592763290137, "kl": 0.55084228515625, "learning_rate": 4.234766946876965e-07, "loss": 0.0037, "num_tokens": 12444073.0, "reward": 0.0062960912473499775, "reward_std": 0.012469880282878876, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 5.121239155414514e-05, "rewards/logprob_reward/std": 0.00028970104176551104, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 706.0625, "completions/mean_terminated_length": 632.6923217773438, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 1.3981481481481481, "grad_norm": 1.3456803869912601, "kl": 0.2529296875, "learning_rate": 4.231163623179335e-07, "loss": -0.0036, "num_tokens": 12472799.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 727.3125, "completions/mean_terminated_length": 672.370361328125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 1.4012345679012346, "grad_norm": 1.6723204574364663, "kl": 0.218505859375, "learning_rate": 4.227553376624904e-07, "loss": 0.0056, "num_tokens": 12502501.0, "reward": 0.003401299240067601, "reward_std": 0.006802598480135202, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00030699922353960574, "rewards/logprob_reward/std": 0.0010323910973966122, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 656.65625, "completions/mean_terminated_length": 604.1785888671875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 1.404320987654321, "grad_norm": 1.6559128796505724, "kl": 0.240478515625, "learning_rate": 4.22393622165091e-07, "loss": 0.0077, "num_tokens": 12529674.0, "reward": 0.009458878077566624, "reward_std": 0.018917756155133247, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 9.31981485337019e-05, "rewards/logprob_reward/std": 0.00037146464455872774, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 761.34375, "completions/mean_terminated_length": 712.7037353515625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 1.4074074074074074, "grad_norm": 0.5264237404210453, "kl": 0.2056884765625, "learning_rate": 4.220312172722216e-07, "loss": -0.0058, "num_tokens": 12560489.0, "reward": 0.0062500000931322575, "reward_std": 0.007216878701001406, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 867.0625, "completions/mean_terminated_length": 759.6842041015625, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 1.4104938271604939, "grad_norm": 1.075088647657135, "kl": 0.224609375, "learning_rate": 4.216681244331256e-07, "loss": -0.1083, "num_tokens": 12595327.0, "reward": 0.006331074051558971, "reward_std": 0.007311693392693996, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 9.008179040392861e-05, "rewards/logprob_reward/std": 0.0005095795495435596, "step": 457 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 687.375, "completions/mean_terminated_length": 609.6923217773438, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 1.4135802469135803, "grad_norm": 0.41564115682942565, "kl": NaN, "learning_rate": 4.2130434509979714e-07, "loss": 0.0154, "num_tokens": 12623371.0, "reward": 8.736336894799024e-06, "reward_std": 1.7472673789598048e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.707041499495972e-06, "rewards/logprob_reward/std": 5.491132105817087e-05, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 718.8125, "completions/mean_terminated_length": 687.2413940429688, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 1.4166666666666667, "grad_norm": 1.2057762501743143, "kl": 0.2008056640625, "learning_rate": 4.209398807269758e-07, "loss": 0.0058, "num_tokens": 12653257.0, "reward": 0.0063441055826842785, "reward_std": 0.012688211165368557, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00010456141899339855, "rewards/logprob_reward/std": 0.0005914886714890599, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 761.40625, "completions/mean_terminated_length": 712.7777709960938, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.4197530864197532, "grad_norm": 0.869286542617104, "kl": 0.2095947265625, "learning_rate": 4.205747327721407e-07, "loss": -0.0244, "num_tokens": 12684330.0, "reward": 0.00018575230205897242, "reward_std": 0.00037150460411794484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00020639145805034786, "rewards/logprob_reward/std": 0.0011675263522192836, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 702.9285888671875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 1.4228395061728394, "grad_norm": 1.0774651339855361, "kl": 0.2396240234375, "learning_rate": 4.2020890269550454e-07, "loss": 0.0084, "num_tokens": 12714532.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 713.40625, "completions/mean_terminated_length": 692.7000122070312, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.425925925925926, "grad_norm": 1.3919502874150627, "kl": 0.248291015625, "learning_rate": 4.198423919600076e-07, "loss": -0.043, "num_tokens": 12743541.0, "reward": 0.0095895417034626, "reward_std": 0.018620356917381287, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00023837909975554794, "rewards/logprob_reward/std": 0.0013484758092090487, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 717.1875, "completions/mean_terminated_length": 673.357177734375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.4290123456790123, "grad_norm": 0.9788629478908946, "kl": 0.2410888671875, "learning_rate": 4.1947520203131217e-07, "loss": -0.0199, "num_tokens": 12773231.0, "reward": 0.0031524146907031536, "reward_std": 0.006304829381406307, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 3.0460822017630562e-05, "rewards/logprob_reward/std": 0.00017231242964044213, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 783.5, "completions/mean_terminated_length": 689.3912963867188, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.4320987654320987, "grad_norm": 0.008039119546979199, "kl": 0.2203369140625, "learning_rate": 4.191073343777968e-07, "loss": 0.0002, "num_tokens": 12805019.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 692.375, "completions/mean_terminated_length": 645.0, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 1.4351851851851851, "grad_norm": 1.0947998540882207, "kl": 0.2392578125, "learning_rate": 4.1873879047055005e-07, "loss": -0.0068, "num_tokens": 12833295.0, "reward": 0.003245285712182522, "reward_std": 0.006490571424365044, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00013365063932724297, "rewards/logprob_reward/std": 0.0007560421363450587, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 705.3125, "completions/mean_terminated_length": 684.0667114257812, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.4382716049382716, "grad_norm": 2.753381604776745, "kl": 0.222900390625, "learning_rate": 4.183695717833649e-07, "loss": -0.2018, "num_tokens": 12862265.0, "reward": 0.006487657316029072, "reward_std": 0.012975314632058144, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00026406353572383523, "rewards/logprob_reward/std": 0.0014937689993530512, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 649.28125, "completions/mean_terminated_length": 624.300048828125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.441358024691358, "grad_norm": 1.1900591830657552, "kl": 0.22802734375, "learning_rate": 4.179996797927326e-07, "loss": 0.0072, "num_tokens": 12889074.0, "reward": 0.0064575402066111565, "reward_std": 0.012915080413222313, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00023059980594553053, "rewards/logprob_reward/std": 0.0013044695369899273, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 743.9375, "completions/mean_terminated_length": 703.9285888671875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 1.4444444444444444, "grad_norm": 1.6501266285285199, "kl": 0.2314453125, "learning_rate": 4.17629115977837e-07, "loss": -0.1072, "num_tokens": 12919324.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 816.15625, "completions/mean_terminated_length": 721.6818237304688, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.4475308641975309, "grad_norm": 0.008942735752603462, "kl": 0.22998046875, "learning_rate": 4.1725788182054867e-07, "loss": 0.0002, "num_tokens": 12952597.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 728.3125, "completions/mean_terminated_length": 660.0769653320312, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 1.4506172839506173, "grad_norm": 1.7481401793763334, "kl": 0.2347412109375, "learning_rate": 4.1688597880541863e-07, "loss": 0.0162, "num_tokens": 12982287.0, "reward": 0.009573226794600487, "reward_std": 0.019079623743891716, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0002202522591687739, "rewards/logprob_reward/std": 0.000702223158441484, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 706.1875, "completions/mean_terminated_length": 685.0000610351562, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 1.4537037037037037, "grad_norm": 1.326458316628465, "kl": 0.220947265625, "learning_rate": 4.1651340841967284e-07, "loss": -0.0554, "num_tokens": 13010945.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 755.0625, "completions/mean_terminated_length": 679.760009765625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 1.4567901234567902, "grad_norm": 0.6618278332040402, "kl": 0.210693359375, "learning_rate": 4.161401721532059e-07, "loss": 0.0077, "num_tokens": 13042015.0, "reward": 4.2685020162025467e-05, "reward_std": 8.537004032405093e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 4.7427802201127633e-05, "rewards/logprob_reward/std": 0.00026829217677004635, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 626.3125, "completions/mean_terminated_length": 613.4838256835938, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 1.4598765432098766, "grad_norm": 1.1416619745506837, "kl": 0.266845703125, "learning_rate": 4.1576627149857513e-07, "loss": -0.0454, "num_tokens": 13068481.0, "reward": 0.0032101499382406473, "reward_std": 0.006420299876481295, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 9.461108857067302e-05, "rewards/logprob_reward/std": 0.0005352011066861451, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 729.4375, "completions/mean_terminated_length": 674.888916015625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 1.462962962962963, "grad_norm": 0.9798319736211897, "kl": 0.2449951171875, "learning_rate": 4.153917079509952e-07, "loss": -0.0141, "num_tokens": 13098375.0, "reward": 9.32246693992056e-05, "reward_std": 0.0001864493387984112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.000103582970041316, "rewards/logprob_reward/std": 0.0005859537632204592, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 745.53125, "completions/mean_terminated_length": 705.7500610351562, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 1.4660493827160495, "grad_norm": 0.6640956393226755, "kl": 0.219482421875, "learning_rate": 4.150164830083311e-07, "loss": 0.0052, "num_tokens": 13128980.0, "reward": 0.0032039100769907236, "reward_std": 0.006199179217219353, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 8.767777762841433e-05, "rewards/logprob_reward/std": 0.0004959804355166852, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 678.1333618164062, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.4691358024691357, "grad_norm": 0.00856726162889787, "kl": 0.2303466796875, "learning_rate": 4.146405981710931e-07, "loss": 0.0002, "num_tokens": 13157432.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 740.03125, "completions/mean_terminated_length": 660.5199584960938, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 1.4722222222222223, "grad_norm": 0.7604375052940118, "kl": 0.215576171875, "learning_rate": 4.142640549424302e-07, "loss": -0.0034, "num_tokens": 13187909.0, "reward": 2.138299350917805e-05, "reward_std": 4.27659870183561e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 2.3758882889524102e-05, "rewards/logprob_reward/std": 0.0001344005431747064, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 786.4375, "completions/mean_terminated_length": 693.478271484375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 1.4753086419753085, "grad_norm": 0.7540158733026404, "kl": 0.19873046875, "learning_rate": 4.1388685482812413e-07, "loss": -0.0112, "num_tokens": 13219503.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 732.46875, "completions/mean_terminated_length": 650.8399658203125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 1.4783950617283952, "grad_norm": 1.2765891413572787, "kl": 0.21826171875, "learning_rate": 4.135089993365839e-07, "loss": 0.0176, "num_tokens": 13249258.0, "reward": 0.0002850447781383991, "reward_std": 0.0005700895562767982, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00031671643955633044, "rewards/logprob_reward/std": 0.0011264249915257096, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 734.40625, "completions/mean_terminated_length": 715.1000366210938, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.4814814814814814, "grad_norm": 1.4233734996648617, "kl": 0.19580078125, "learning_rate": 4.131304899788389e-07, "loss": -0.0326, "num_tokens": 13279483.0, "reward": 0.0003334644134156406, "reward_std": 0.0006669288268312812, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00037051603430882096, "rewards/logprob_reward/std": 0.0012172131100669503, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 681.125, "completions/mean_terminated_length": 602.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.4845679012345678, "grad_norm": 0.007854875065843677, "kl": 0.235107421875, "learning_rate": 4.127513282685336e-07, "loss": 0.0002, "num_tokens": 13307583.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 763.65625, "completions/mean_terminated_length": 703.5769653320312, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 1.4876543209876543, "grad_norm": 1.3271718238623136, "kl": 0.206787109375, "learning_rate": 4.123715157219211e-07, "loss": -0.0084, "num_tokens": 13338964.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 667.3125, "completions/mean_terminated_length": 643.5333862304688, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 1.4907407407407407, "grad_norm": 0.5993122360613606, "kl": 0.217041015625, "learning_rate": 4.1199105385785727e-07, "loss": -0.0016, "num_tokens": 13366546.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 768.59375, "completions/mean_terminated_length": 732.107177734375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 1.4938271604938271, "grad_norm": 0.006839411443825314, "kl": 0.2059326171875, "learning_rate": 4.116099441977943e-07, "loss": 0.0002, "num_tokens": 13397565.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 755.28125, "completions/mean_terminated_length": 716.8928833007812, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 1.4969135802469136, "grad_norm": 1.1233779045963534, "kl": 0.2186279296875, "learning_rate": 4.112281882657751e-07, "loss": 0.0319, "num_tokens": 13428318.0, "reward": 0.006400207057595253, "reward_std": 0.0075172921642661095, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0001668964105192572, "rewards/logprob_reward/std": 0.0006705340929329395, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 734.03125, "completions/mean_terminated_length": 692.607177734375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 1.5, "grad_norm": 0.008426954253993934, "kl": 0.205322265625, "learning_rate": 4.1084578758842714e-07, "loss": 0.0002, "num_tokens": 13457907.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 658.59375, "completions/mean_terminated_length": 590.9259033203125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 1.5030864197530864, "grad_norm": 1.6402412231893142, "kl": 0.25927734375, "learning_rate": 4.104627436949559e-07, "loss": 0.0074, "num_tokens": 13485362.0, "reward": 0.012500000186264515, "reward_std": 0.02500000037252903, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 712.125, "completions/mean_terminated_length": 679.862060546875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 1.5061728395061729, "grad_norm": 0.6426115974868853, "kl": 0.2218017578125, "learning_rate": 4.1007905811713915e-07, "loss": 0.0205, "num_tokens": 13514342.0, "reward": 8.19168271846138e-05, "reward_std": 0.0001638336543692276, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 9.101870091399178e-05, "rewards/logprob_reward/std": 0.0005148795316927135, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 643.0625, "completions/mean_terminated_length": 603.6551513671875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 1.5092592592592593, "grad_norm": 0.8659957852460237, "kl": 0.22216796875, "learning_rate": 4.096947323893209e-07, "loss": -0.0122, "num_tokens": 13540668.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 701.40625, "completions/mean_terminated_length": 641.6666870117188, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 1.5123456790123457, "grad_norm": 1.2243377483078597, "kl": 0.2393798828125, "learning_rate": 4.0930976804840487e-07, "loss": 0.0068, "num_tokens": 13569205.0, "reward": 0.0032299254089593887, "reward_std": 0.0064598508179187775, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0001165838330052793, "rewards/logprob_reward/std": 0.0004594208439812064, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 744.53125, "completions/mean_terminated_length": 692.7777709960938, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 1.515432098765432, "grad_norm": 0.9748368726692964, "kl": 0.239501953125, "learning_rate": 4.0892416663384874e-07, "loss": 0.0009, "num_tokens": 13599022.0, "reward": 0.0032928246073424816, "reward_std": 0.006585649214684963, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00018647167598828673, "rewards/logprob_reward/std": 0.0010548430727794766, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 653.09375, "completions/mean_terminated_length": 628.36669921875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 1.5185185185185186, "grad_norm": 0.910886561519072, "kl": 0.23583984375, "learning_rate": 4.0853792968765765e-07, "loss": 0.0509, "num_tokens": 13626137.0, "reward": 0.00042957477853633463, "reward_std": 0.0008591495570726693, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.000477305322419852, "rewards/logprob_reward/std": 0.0019594368059188128, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 780.8125, "completions/mean_terminated_length": 712.719970703125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 1.5216049382716048, "grad_norm": 1.274269025857144, "kl": 0.234130859375, "learning_rate": 4.081510587543784e-07, "loss": 0.0175, "num_tokens": 13657803.0, "reward": 0.007064519450068474, "reward_std": 0.012881873175501823, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0009050215012393892, "rewards/logprob_reward/std": 0.0036485553719103336, "step": 493 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 737.375, "completions/mean_terminated_length": 671.2307739257812, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 1.5246913580246915, "grad_norm": 0.7721523436134393, "kl": NaN, "learning_rate": 4.0776355538109285e-07, "loss": -0.0027, "num_tokens": 13687715.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 679.46875, "completions/mean_terminated_length": 615.6666870117188, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 1.5277777777777777, "grad_norm": 1.1190525113595386, "kl": 0.24267578125, "learning_rate": 4.073754211174123e-07, "loss": -0.0302, "num_tokens": 13716146.0, "reward": 0.0032679312862455845, "reward_std": 0.006535862572491169, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0001588123559486121, "rewards/logprob_reward/std": 0.0008983783191069961, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 718.5625, "completions/mean_terminated_length": 648.0769653320312, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 1.5308641975308643, "grad_norm": 0.011772250359833447, "kl": 0.235595703125, "learning_rate": 4.069866575154706e-07, "loss": 0.0002, "num_tokens": 13746456.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 748.75, "completions/mean_terminated_length": 685.2307739257812, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.5339506172839505, "grad_norm": 1.036641945037653, "kl": 0.2569580078125, "learning_rate": 4.0659726612991853e-07, "loss": 0.0029, "num_tokens": 13777220.0, "reward": 0.00014904368435963988, "reward_std": 0.00029808736871927977, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001656040985835716, "rewards/logprob_reward/std": 0.0009367982274852693, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 775.65625, "completions/mean_terminated_length": 692.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 1.5370370370370372, "grad_norm": 2.040791241615359, "kl": 0.21435546875, "learning_rate": 4.062072485179172e-07, "loss": -0.0446, "num_tokens": 13808961.0, "reward": 0.0074306377209723, "reward_std": 0.013677163049578667, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00131182000041008, "rewards/logprob_reward/std": 0.004722681827843189, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 690.0, "completions/mean_terminated_length": 628.1481323242188, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 1.5401234567901234, "grad_norm": 0.8678643078533776, "kl": 0.20458984375, "learning_rate": 4.0581660623913216e-07, "loss": 0.004, "num_tokens": 13837493.0, "reward": 0.006410412490367889, "reward_std": 0.007036527618765831, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0001782359031494707, "rewards/logprob_reward/std": 0.0010082544758915901, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 714.0625, "completions/mean_terminated_length": 693.4000244140625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 1.5432098765432098, "grad_norm": 0.007614484510444482, "kl": 0.2413330078125, "learning_rate": 4.0542534085572677e-07, "loss": 0.0002, "num_tokens": 13866455.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 751.84375, "completions/mean_terminated_length": 675.6400146484375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 1.5462962962962963, "grad_norm": 1.5316793683826315, "kl": 0.2237548828125, "learning_rate": 4.050334539323563e-07, "loss": -0.0311, "num_tokens": 13897134.0, "reward": 0.003464690176770091, "reward_std": 0.006929380353540182, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00037743343273177743, "rewards/logprob_reward/std": 0.0015973311383277178, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 726.75, "completions/mean_terminated_length": 684.2857666015625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 1.5493827160493827, "grad_norm": 0.8030631221407978, "kl": 0.212890625, "learning_rate": 4.046409470361615e-07, "loss": -0.0132, "num_tokens": 13926998.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 748.375, "completions/mean_terminated_length": 697.3333129882812, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 1.5524691358024691, "grad_norm": 1.1411802118497676, "kl": 0.1939697265625, "learning_rate": 4.0424782173676235e-07, "loss": -0.0043, "num_tokens": 13957978.0, "reward": 0.003966109361499548, "reward_std": 0.007221629843115807, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0009345660218968987, "rewards/logprob_reward/std": 0.0036782834213227034, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 754.375, "completions/mean_terminated_length": 692.1538696289062, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 1.5555555555555556, "grad_norm": 1.1392876503462597, "kl": 0.2095947265625, "learning_rate": 4.0385407960625185e-07, "loss": 0.0484, "num_tokens": 13988954.0, "reward": 0.0007476196624338627, "reward_std": 0.0014952393248677254, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0008306884556077421, "rewards/logprob_reward/std": 0.003281813580542803, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 757.21875, "completions/mean_terminated_length": 719.107177734375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 1.558641975308642, "grad_norm": 0.007288560957668381, "kl": 0.2373046875, "learning_rate": 4.034597222191896e-07, "loss": 0.0002, "num_tokens": 14020117.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 667.28125, "completions/mean_terminated_length": 616.3214721679688, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 1.5617283950617284, "grad_norm": 0.5935219450571073, "kl": 0.2216796875, "learning_rate": 4.030647511525956e-07, "loss": 0.0217, "num_tokens": 14047862.0, "reward": 0.00018285616533830762, "reward_std": 0.00036571233067661524, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00020317352027632296, "rewards/logprob_reward/std": 0.001149323070421815, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 682.125, "completions/mean_terminated_length": 659.3333740234375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 1.5648148148148149, "grad_norm": 1.258804726631916, "kl": 0.2191162109375, "learning_rate": 4.0266916798594417e-07, "loss": -0.0273, "num_tokens": 14076670.0, "reward": 0.00024014676455408335, "reward_std": 0.0004802935291081667, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00026682973839342594, "rewards/logprob_reward/std": 0.0012838775292038918, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 726.0, "completions/mean_terminated_length": 695.1724243164062, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.567901234567901, "grad_norm": 0.6454728419319262, "kl": 0.21533203125, "learning_rate": 4.02272974301157e-07, "loss": 0.0095, "num_tokens": 14105914.0, "reward": 0.0001895047607831657, "reward_std": 0.0003790095215663314, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00021056084369774908, "rewards/logprob_reward/std": 0.0011911119800060987, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 791.03125, "completions/mean_terminated_length": 737.269287109375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.5709876543209877, "grad_norm": 1.2859936176358455, "kl": 0.197265625, "learning_rate": 4.018761716825974e-07, "loss": -0.0258, "num_tokens": 14138467.0, "reward": 0.00028381761512719095, "reward_std": 0.0005676352302543819, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00031535292509943247, "rewards/logprob_reward/std": 0.001241048565134406, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 646.5, "completions/mean_terminated_length": 607.4483032226562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 1.574074074074074, "grad_norm": 0.008592360131948421, "kl": 0.2469482421875, "learning_rate": 4.014787617170639e-07, "loss": 0.0002, "num_tokens": 14165595.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 675.03125, "completions/mean_terminated_length": 638.9310302734375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 1.5771604938271606, "grad_norm": 1.171419955274731, "kl": 0.1822509765625, "learning_rate": 4.010807459937836e-07, "loss": -0.0501, "num_tokens": 14194016.0, "reward": 0.003713682759553194, "reward_std": 0.006750314496457577, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0006540918839164078, "rewards/logprob_reward/std": 0.002199083101004362, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 697.21875, "completions/mean_terminated_length": 621.8077392578125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.5802469135802468, "grad_norm": 0.9975735423759305, "kl": 0.2100830078125, "learning_rate": 4.006821261044061e-07, "loss": 0.0032, "num_tokens": 14222627.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 739.6875, "completions/mean_terminated_length": 720.7333984375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 1.5833333333333335, "grad_norm": 1.0210658224486546, "kl": 0.2042236328125, "learning_rate": 4.002829036429971e-07, "loss": -0.0316, "num_tokens": 14253177.0, "reward": 0.0034115025773644447, "reward_std": 0.0068230051547288895, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0003183361841365695, "rewards/logprob_reward/std": 0.001800781348720193, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 618.25, "completions/mean_terminated_length": 605.1612548828125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 1.5864197530864197, "grad_norm": 1.5387658671321223, "kl": 0.2314453125, "learning_rate": 3.998830802060317e-07, "loss": -0.0561, "num_tokens": 14279533.0, "reward": 9.884743485599756e-05, "reward_std": 0.00019769486971199512, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00010983048559864983, "rewards/logprob_reward/std": 0.0004931351286359131, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 653.0625, "completions/mean_terminated_length": 614.6896362304688, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 1.5895061728395061, "grad_norm": 0.6673077990520957, "kl": 0.228271484375, "learning_rate": 3.994826573923886e-07, "loss": 0.0146, "num_tokens": 14306735.0, "reward": 2.7812768166768365e-05, "reward_std": 5.562553633353673e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 3.090307654929347e-05, "rewards/logprob_reward/std": 0.00017481419490650296, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 638.40625, "completions/mean_terminated_length": 612.7000122070312, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 1.5925925925925926, "grad_norm": 0.7409619007185388, "kl": 0.21533203125, "learning_rate": 3.9908163680334326e-07, "loss": -0.0085, "num_tokens": 14333544.0, "reward": 0.006347795017063618, "reward_std": 0.007331542205065489, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00010866135562537238, "rewards/logprob_reward/std": 0.0006146814557723701, "step": 516 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 703.125, "completions/mean_terminated_length": 703.125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 1.595679012345679, "grad_norm": 1.803806100339916, "kl": NaN, "learning_rate": 3.9868002004256165e-07, "loss": -0.0982, "num_tokens": 14362676.0, "reward": 0.0035705072805285454, "reward_std": 0.007141014561057091, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0004950080765411258, "rewards/logprob_reward/std": 0.0022015306167304516, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 700.46875, "completions/mean_terminated_length": 667.0, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.5987654320987654, "grad_norm": 1.6329114031815135, "kl": 0.19091796875, "learning_rate": 3.982778087160935e-07, "loss": -0.0519, "num_tokens": 14391563.0, "reward": 0.0034556398168206215, "reward_std": 0.006911279633641243, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0003673774190247059, "rewards/logprob_reward/std": 0.0014456151984632015, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 665.78125, "completions/mean_terminated_length": 628.72412109375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.6018518518518519, "grad_norm": 0.628706653315792, "kl": 0.1845703125, "learning_rate": 3.9787500443236664e-07, "loss": 0.004, "num_tokens": 14419624.0, "reward": 1.9111619621980935e-05, "reward_std": 3.8223235605983064e-05, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 2.1235133317532018e-05, "rewards/logprob_reward/std": 0.00012012405932182446, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 671.21875, "completions/mean_terminated_length": 647.7000122070312, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 1.6049382716049383, "grad_norm": 1.3226224371308348, "kl": 0.2242431640625, "learning_rate": 3.9747160880217994e-07, "loss": -0.0485, "num_tokens": 14447795.0, "reward": 0.003268610220402479, "reward_std": 0.006537220440804958, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00015956697461660951, "rewards/logprob_reward/std": 0.0009026471525430679, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 711.84375, "completions/mean_terminated_length": 667.25, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.6080246913580247, "grad_norm": 1.403521045440835, "kl": 0.1754150390625, "learning_rate": 3.9706762343869705e-07, "loss": -0.0531, "num_tokens": 14477182.0, "reward": 0.0002255470462841913, "reward_std": 0.0004165461577940732, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0002506078453734517, "rewards/logprob_reward/std": 0.0010859397007152438, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 693.28125, "completions/mean_terminated_length": 659.0689697265625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 1.6111111111111112, "grad_norm": 1.297088064026116, "kl": 0.198486328125, "learning_rate": 3.966630499574397e-07, "loss": -0.0447, "num_tokens": 14505659.0, "reward": 0.003463702742010355, "reward_std": 0.006641572806984186, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0003763362765312195, "rewards/logprob_reward/std": 0.0014818207127973437, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 674.4375, "completions/mean_terminated_length": 663.1612548828125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.6141975308641974, "grad_norm": 1.2878515019824124, "kl": 0.197021484375, "learning_rate": 3.9625788997628196e-07, "loss": 0.0182, "num_tokens": 14533549.0, "reward": 0.0033389755990356207, "reward_std": 0.006677951198071241, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002377507626079023, "rewards/logprob_reward/std": 0.00101320946123451, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 619.375, "completions/mean_terminated_length": 619.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 1.617283950617284, "grad_norm": 1.5402558614689486, "kl": 0.2086181640625, "learning_rate": 3.958521451154428e-07, "loss": -0.0177, "num_tokens": 14559761.0, "reward": 0.006601343862712383, "reward_std": 0.012623758986592293, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00039038198883645236, "rewards/logprob_reward/std": 0.0012395030353218317, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 719.625, "completions/mean_terminated_length": 676.1428833007812, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 1.6203703703703702, "grad_norm": 0.8735394195416519, "kl": 0.1729736328125, "learning_rate": 3.954458169974805e-07, "loss": -0.0163, "num_tokens": 14589565.0, "reward": 0.0002931773487944156, "reward_std": 0.0005863546975888312, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003257526259403676, "rewards/logprob_reward/std": 0.0018427351024001837, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 649.78125, "completions/mean_terminated_length": 624.8333740234375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.623456790123457, "grad_norm": 2.12400021297932, "kl": 0.2386474609375, "learning_rate": 3.950389072472855e-07, "loss": -0.1252, "num_tokens": 14616710.0, "reward": 0.009645880199968815, "reward_std": 0.014008638449013233, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00030097743729129434, "rewards/logprob_reward/std": 0.001185261644423008, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 669.53125, "completions/mean_terminated_length": 632.862060546875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.626543209876543, "grad_norm": 1.2793150513425062, "kl": 0.183837890625, "learning_rate": 3.9463141749207425e-07, "loss": -0.0152, "num_tokens": 14644815.0, "reward": 0.0002448577433824539, "reward_std": 0.0004897154867649078, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0002720641787163913, "rewards/logprob_reward/std": 0.001189305679872632, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 629.28125, "completions/mean_terminated_length": 616.54833984375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 1.6296296296296298, "grad_norm": 2.7478992262055466, "kl": 0.254150390625, "learning_rate": 3.9422334936138255e-07, "loss": -0.1333, "num_tokens": 14671220.0, "reward": 0.0062837013974785805, "reward_std": 0.012540229596197605, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 3.744592686416581e-05, "rewards/logprob_reward/std": 0.0001500230428064242, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 686.3125, "completions/mean_terminated_length": 651.3793334960938, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.632716049382716, "grad_norm": 0.7965683182840033, "kl": 0.251708984375, "learning_rate": 3.938147044870594e-07, "loss": 0.0158, "num_tokens": 14699818.0, "reward": 9.006389882415533e-05, "reward_std": 0.00018012779764831066, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00010007100354414433, "rewards/logprob_reward/std": 0.0005295322043821216, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 765.0625, "completions/mean_terminated_length": 692.5599975585938, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 1.6358024691358026, "grad_norm": 2.232671746705429, "kl": 0.2332763671875, "learning_rate": 3.934054845032598e-07, "loss": -0.0223, "num_tokens": 14731156.0, "reward": 0.0006742465193383396, "reward_std": 0.0013484930386766791, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0007491627475246787, "rewards/logprob_reward/std": 0.002688066568225622, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 676.75, "completions/mean_terminated_length": 665.54833984375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.6388888888888888, "grad_norm": 1.2362784051265552, "kl": 0.1810302734375, "learning_rate": 3.9299569104643876e-07, "loss": -0.0151, "num_tokens": 14759360.0, "reward": 0.003136072074994445, "reward_std": 0.00627214414998889, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 1.2302157301746774e-05, "rewards/logprob_reward/std": 6.959151505725458e-05, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 714.46875, "completions/mean_terminated_length": 657.1481323242188, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 1.6419753086419753, "grad_norm": 0.01923557030109291, "kl": 0.2412109375, "learning_rate": 3.925853257553445e-07, "loss": 0.0002, "num_tokens": 14788351.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 663.0, "completions/mean_terminated_length": 638.933349609375, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 1.6450617283950617, "grad_norm": 0.7118540255991272, "kl": 0.24749755859375, "learning_rate": 3.921743902710122e-07, "loss": 0.0148, "num_tokens": 14816111.0, "reward": 7.841860497137532e-05, "reward_std": 0.00015683720994275063, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 8.71317824930884e-05, "rewards/logprob_reward/std": 0.0004928918206132948, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 676.6875, "completions/mean_terminated_length": 653.5333862304688, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.6481481481481481, "grad_norm": 1.9074909191744749, "kl": 0.23583984375, "learning_rate": 3.917628862367569e-07, "loss": -0.0625, "num_tokens": 14844345.0, "reward": 0.0003346746671013534, "reward_std": 0.0006693493342027068, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003718607476912439, "rewards/logprob_reward/std": 0.0017217351123690605, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 746.03125, "completions/mean_terminated_length": 668.2000122070312, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.6512345679012346, "grad_norm": 1.345902352064227, "kl": 0.364990234375, "learning_rate": 3.913508152981674e-07, "loss": 0.0156, "num_tokens": 14875014.0, "reward": 0.00032999191898852587, "reward_std": 0.0006599838379770517, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00036665768129751086, "rewards/logprob_reward/std": 0.0013613435439765453, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 632.40625, "completions/mean_terminated_length": 591.8965454101562, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 1.654320987654321, "grad_norm": 1.6643347435632014, "kl": 0.248046875, "learning_rate": 3.909381791030998e-07, "loss": -0.0584, "num_tokens": 14901283.0, "reward": 0.0005056440131738782, "reward_std": 0.0010112880263477564, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0005618267459794879, "rewards/logprob_reward/std": 0.002438273513689637, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 737.8125, "completions/mean_terminated_length": 684.8148193359375, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 1.6574074074074074, "grad_norm": 1.1208025492877656, "kl": 0.2255859375, "learning_rate": 3.905249793016702e-07, "loss": -0.0176, "num_tokens": 14931865.0, "reward": 0.0033837691880762577, "reward_std": 0.006767538376152515, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002875212230719626, "rewards/logprob_reward/std": 0.001626465586014092, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 689.3125, "completions/mean_terminated_length": 641.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 1.6604938271604939, "grad_norm": 0.012149488981183494, "kl": 0.2154541015625, "learning_rate": 3.9011121754624865e-07, "loss": 0.0002, "num_tokens": 14960259.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 717.8125, "completions/mean_terminated_length": 697.4000244140625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 1.6635802469135803, "grad_norm": 1.6654338514291054, "kl": 0.201171875, "learning_rate": 3.8969689549145266e-07, "loss": -0.0166, "num_tokens": 14989973.0, "reward": 0.0038445612881332636, "reward_std": 0.00677096052095294, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0007995126652531326, "rewards/logprob_reward/std": 0.0022769954521209, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 727.25, "completions/mean_terminated_length": 644.1599731445312, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.6666666666666665, "grad_norm": 0.025271756498403648, "kl": 0.2305908203125, "learning_rate": 3.8928201479414024e-07, "loss": 0.0002, "num_tokens": 15019449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 682.75, "completions/mean_terminated_length": 647.4483032226562, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.6697530864197532, "grad_norm": 0.0343700147662808, "kl": 0.259033203125, "learning_rate": 3.888665771134032e-07, "loss": 0.0003, "num_tokens": 15047285.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 632.8125, "completions/mean_terminated_length": 620.1935424804688, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.6728395061728394, "grad_norm": 1.0155951920968442, "kl": 0.2408447265625, "learning_rate": 3.8845058411056095e-07, "loss": -0.0351, "num_tokens": 15073459.0, "reward": 0.003251735121011734, "reward_std": 0.006403619423508644, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.000140816715429537, "rewards/logprob_reward/std": 0.0005689717945642769, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 716.4375, "completions/mean_terminated_length": 659.4815063476562, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 1.675925925925926, "grad_norm": 1.0392347068243504, "kl": 0.205322265625, "learning_rate": 3.880340374491535e-07, "loss": -0.0343, "num_tokens": 15103001.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 636.03125, "completions/mean_terminated_length": 610.1666870117188, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 1.6790123456790123, "grad_norm": 1.3288486748243753, "kl": 0.240234375, "learning_rate": 3.8761693879493495e-07, "loss": -0.0364, "num_tokens": 15129690.0, "reward": 0.003559510223567486, "reward_std": 0.006802430842071772, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00048278900794684887, "rewards/logprob_reward/std": 0.0016537087503820658, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 671.0625, "completions/mean_terminated_length": 647.5333862304688, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 1.682098765432099, "grad_norm": 1.3301409124638641, "kl": 0.201416015625, "learning_rate": 3.871992898158667e-07, "loss": -0.0243, "num_tokens": 15157504.0, "reward": 0.009390555322170258, "reward_std": 0.013497989624738693, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 1.7283886336372234e-05, "rewards/logprob_reward/std": 9.777242667041719e-05, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 779.375, "completions/mean_terminated_length": 722.923095703125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 1.6851851851851851, "grad_norm": 1.8399131374109736, "kl": 0.236083984375, "learning_rate": 3.867810921821112e-07, "loss": -0.0683, "num_tokens": 15189700.0, "reward": 0.0002412402827758342, "reward_std": 0.0004824805655516684, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00026804476510733366, "rewards/logprob_reward/std": 0.0015162901254370809, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 663.625, "completions/mean_terminated_length": 639.6000366210938, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.6882716049382716, "grad_norm": 1.6899053313155517, "kl": 0.2037353515625, "learning_rate": 3.863623475660245e-07, "loss": -0.1045, "num_tokens": 15217380.0, "reward": 0.006546499207615852, "reward_std": 0.013092998415231705, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0003294435446150601, "rewards/logprob_reward/std": 0.0016374721890315413, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 728.0, "completions/mean_terminated_length": 697.3793334960938, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.691358024691358, "grad_norm": 1.1925377947779334, "kl": 0.230224609375, "learning_rate": 3.859430576421503e-07, "loss": 0.0307, "num_tokens": 15247136.0, "reward": 0.0033502508886158466, "reward_std": 0.006618572399020195, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002502789138816297, "rewards/logprob_reward/std": 0.0008935660007409751, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 686.46875, "completions/mean_terminated_length": 623.9629516601562, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.6944444444444444, "grad_norm": 1.8250365220303777, "kl": 0.200927734375, "learning_rate": 3.855232240872128e-07, "loss": -0.0448, "num_tokens": 15275503.0, "reward": 0.0033973990939557552, "reward_std": 0.0067947981879115105, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00030266543035395443, "rewards/logprob_reward/std": 0.001362743554636836, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 659.1875, "completions/mean_terminated_length": 621.4483032226562, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 1.6975308641975309, "grad_norm": 1.2164846800422262, "kl": 0.24462890625, "learning_rate": 3.851028485801105e-07, "loss": -0.0359, "num_tokens": 15303321.0, "reward": 0.0005772442673332989, "reward_std": 0.0007730101933702826, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0006413825321942568, "rewards/logprob_reward/std": 0.002183767966926098, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 659.90625, "completions/mean_terminated_length": 622.2413940429688, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 1.7006172839506173, "grad_norm": 1.64498439566592, "kl": 0.2210693359375, "learning_rate": 3.8468193280190864e-07, "loss": -0.0603, "num_tokens": 15330974.0, "reward": 0.00019263531430624425, "reward_std": 0.0003852706286124885, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00021403926075436175, "rewards/logprob_reward/std": 0.0008833868196234107, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 651.25, "completions/mean_terminated_length": 639.2257690429688, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 1.7037037037037037, "grad_norm": 0.8204484969903891, "kl": 0.2283935546875, "learning_rate": 3.842604784358333e-07, "loss": -0.0067, "num_tokens": 15357954.0, "reward": 0.00011239574087085202, "reward_std": 0.00022479148174170405, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00012488415814004838, "rewards/logprob_reward/std": 0.0007064514793455601, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 730.4375, "completions/mean_terminated_length": 710.86669921875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.7067901234567902, "grad_norm": 0.6324076683656258, "kl": 0.1995849609375, "learning_rate": 3.8383848716726444e-07, "loss": 0.0143, "num_tokens": 15388404.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 598.65625, "completions/mean_terminated_length": 570.300048828125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 1.7098765432098766, "grad_norm": 1.3854096979699027, "kl": 0.243408203125, "learning_rate": 3.8341596068372874e-07, "loss": -0.0223, "num_tokens": 15413525.0, "reward": 0.0031891183461993933, "reward_std": 0.0063782366923987865, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.124279363779351e-05, "rewards/logprob_reward/std": 0.0003295870847068727, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 641.375, "completions/mean_terminated_length": 586.7142944335938, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 1.7129629629629628, "grad_norm": 1.5483551432739786, "kl": 0.274658203125, "learning_rate": 3.829929006748934e-07, "loss": -0.0428, "num_tokens": 15440121.0, "reward": 0.0006713579641655087, "reward_std": 0.0013427160447463393, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0007459533517248929, "rewards/logprob_reward/std": 0.0038864153902977705, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 741.15625, "completions/mean_terminated_length": 711.8965454101562, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 1.7160493827160495, "grad_norm": 1.8155255721411654, "kl": 0.1968994140625, "learning_rate": 3.8256930883255927e-07, "loss": -0.0472, "num_tokens": 15470438.0, "reward": 0.0034768604673445225, "reward_std": 0.006953720934689045, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00039095626561902463, "rewards/logprob_reward/std": 0.001544651691801846, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 685.96875, "completions/mean_terminated_length": 651.0, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 1.7191358024691357, "grad_norm": 0.6701429724195487, "kl": 0.204833984375, "learning_rate": 3.8214518685065377e-07, "loss": 0.001, "num_tokens": 15499145.0, "reward": 0.00042355037294328213, "reward_std": 0.0004890738055109978, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00047061152872629464, "rewards/logprob_reward/std": 0.0018518351716920733, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 646.34375, "completions/mean_terminated_length": 621.1666870117188, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 1.7222222222222223, "grad_norm": 1.698009864594641, "kl": 0.225830078125, "learning_rate": 3.817205364252244e-07, "loss": 0.0202, "num_tokens": 15525912.0, "reward": 0.001057287328876555, "reward_std": 0.0018129103118553758, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0011747637763619423, "rewards/logprob_reward/std": 0.003242099191993475, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 712.84375, "completions/mean_terminated_length": 692.1000366210938, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 1.7253086419753085, "grad_norm": 0.8978241670939364, "kl": 0.1929931640625, "learning_rate": 3.8129535925443187e-07, "loss": -0.0231, "num_tokens": 15555443.0, "reward": 0.0001963062968570739, "reward_std": 0.00023446467821486294, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00021811810438521206, "rewards/logprob_reward/std": 0.000874139426741749, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 683.28125, "completions/mean_terminated_length": 660.5667114257812, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 1.7283950617283952, "grad_norm": 0.9053528909515298, "kl": 0.215576171875, "learning_rate": 3.8086965703854336e-07, "loss": -0.0109, "num_tokens": 15583916.0, "reward": 0.00016514095477759838, "reward_std": 0.00033028190955519676, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001834899594541639, "rewards/logprob_reward/std": 0.0010379758896306157, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 648.59375, "completions/mean_terminated_length": 609.7586059570312, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.7314814814814814, "grad_norm": 1.3833182735515157, "kl": 0.2352294921875, "learning_rate": 3.8044343147992563e-07, "loss": 0.0218, "num_tokens": 15611067.0, "reward": 0.0003771684132516384, "reward_std": 0.0005551050999201834, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00041907603736035526, "rewards/logprob_reward/std": 0.0012183680664747953, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 676.40625, "completions/mean_terminated_length": 653.2333374023438, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 1.734567901234568, "grad_norm": 2.663795700852014, "kl": 0.221923828125, "learning_rate": 3.8001668428303847e-07, "loss": -0.0667, "num_tokens": 15639632.0, "reward": 0.0038018166087567806, "reward_std": 0.007603633217513561, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0007520184153690934, "rewards/logprob_reward/std": 0.0032845232635736465, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 673.59375, "completions/mean_terminated_length": 637.3448486328125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.7376543209876543, "grad_norm": 1.1606607471316734, "kl": 0.21630859375, "learning_rate": 3.7958941715442726e-07, "loss": -0.0115, "num_tokens": 15667271.0, "reward": 0.0031921612098813057, "reward_std": 0.006344469729810953, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 7.462350185960531e-05, "rewards/logprob_reward/std": 0.0003383457660675049, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 681.6875, "completions/mean_terminated_length": 632.7857666015625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 1.7407407407407407, "grad_norm": 1.2183370306869523, "kl": 0.2139892578125, "learning_rate": 3.791616318027171e-07, "loss": -0.0217, "num_tokens": 15695405.0, "reward": 0.003422896610572934, "reward_std": 0.006645845249295235, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.000330996117554605, "rewards/logprob_reward/std": 0.0014105957234278321, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 669.34375, "completions/mean_terminated_length": 603.6666870117188, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.7438271604938271, "grad_norm": 0.6382976844465137, "kl": 0.2100830078125, "learning_rate": 3.78733329938605e-07, "loss": 0.0197, "num_tokens": 15723200.0, "reward": 6.218590715434402e-05, "reward_std": 0.00012437181430868804, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 6.909545481903479e-05, "rewards/logprob_reward/std": 0.00039086290053091943, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 710.375, "completions/mean_terminated_length": 677.9310302734375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.7469135802469136, "grad_norm": 1.8427554754446938, "kl": 0.21417236328125, "learning_rate": 3.7830451327485367e-07, "loss": -0.1368, "num_tokens": 15752992.0, "reward": 0.00027952424716204405, "reward_std": 0.0005590484943240881, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003105824871454388, "rewards/logprob_reward/std": 0.001312681590206921, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 648.25, "completions/mean_terminated_length": 648.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.75, "grad_norm": 0.03047510652804787, "kl": 0.240234375, "learning_rate": 3.778751835262847e-07, "loss": 0.0002, "num_tokens": 15779968.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 638.6875, "completions/mean_terminated_length": 613.0000610351562, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 1.7530864197530864, "grad_norm": 0.9325899409839425, "kl": 0.206787109375, "learning_rate": 3.7744534240977085e-07, "loss": -0.0031, "num_tokens": 15806774.0, "reward": 0.00016520038479939103, "reward_std": 0.00033040076959878206, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00018355599604547024, "rewards/logprob_reward/std": 0.0010383494663983583, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 637.25, "completions/mean_terminated_length": 597.2413940429688, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.7561728395061729, "grad_norm": 2.1549187244707677, "kl": 0.234619140625, "learning_rate": 3.7701499164423045e-07, "loss": -0.0691, "num_tokens": 15833482.0, "reward": 0.0003800169506575912, "reward_std": 0.0007600339013151824, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0004222410498186946, "rewards/logprob_reward/std": 0.0018106489442288876, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 742.8125, "completions/mean_terminated_length": 702.6428833007812, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 1.7592592592592593, "grad_norm": 1.121914628219875, "kl": 0.22802734375, "learning_rate": 3.7658413295061974e-07, "loss": 0.0065, "num_tokens": 15864092.0, "reward": 0.00044991367030888796, "reward_std": 0.0008998273406177759, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0004999040393158793, "rewards/logprob_reward/std": 0.0017304662615060806, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 734.53125, "completions/mean_terminated_length": 653.47998046875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.7623456790123457, "grad_norm": 1.339499848247988, "kl": 0.229248046875, "learning_rate": 3.7615276805192595e-07, "loss": -0.0622, "num_tokens": 15894121.0, "reward": 0.006375543307512999, "reward_std": 0.012420847080647945, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0001394923747284338, "rewards/logprob_reward/std": 0.0007890879642218351, "step": 571 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 691.8125, "completions/mean_terminated_length": 630.2963256835938, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 1.765432098765432, "grad_norm": 1.3147532536862638, "kl": NaN, "learning_rate": 3.7572089867316075e-07, "loss": -0.0712, "num_tokens": 15923103.0, "reward": 0.003216506913304329, "reward_std": 0.006433013826608658, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00010167431173613295, "rewards/logprob_reward/std": 0.0005751567659899592, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 624.78125, "completions/mean_terminated_length": 567.75, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.7685185185185186, "grad_norm": 2.9182706994037972, "kl": 0.2005615234375, "learning_rate": 3.7528852654135323e-07, "loss": -0.3139, "num_tokens": 15949304.0, "reward": 0.0016523003578186035, "reward_std": 0.00254919589497149, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0018358894158154726, "rewards/logprob_reward/std": 0.004228347912430763, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 692.96875, "completions/mean_terminated_length": 658.72412109375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.7716049382716048, "grad_norm": 3.1566661593032346, "kl": 0.2393798828125, "learning_rate": 3.7485565338554294e-07, "loss": -0.1953, "num_tokens": 15977843.0, "reward": 0.000254699494689703, "reward_std": 0.000509398989379406, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0002829994191415608, "rewards/logprob_reward/std": 0.000908093003090471, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 713.90625, "completions/mean_terminated_length": 703.9031982421875, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 1.7746913580246915, "grad_norm": 1.4487019782934505, "kl": 0.21484375, "learning_rate": 3.7442228093677296e-07, "loss": -0.0497, "num_tokens": 16006856.0, "reward": 0.0034210658632218838, "reward_std": 0.006678018253296614, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00032896202174015343, "rewards/logprob_reward/std": 0.001040315837599337, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 683.0625, "completions/mean_terminated_length": 634.357177734375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 1.7777777777777777, "grad_norm": 1.7400070642658096, "kl": 0.245361328125, "learning_rate": 3.7398841092808307e-07, "loss": -0.1239, "num_tokens": 16035294.0, "reward": 0.001132089295424521, "reward_std": 0.002264178590849042, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0012578770983964205, "rewards/logprob_reward/std": 0.005214401055127382, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 620.71875, "completions/mean_terminated_length": 579.0, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.7808641975308643, "grad_norm": 1.882047078975824, "kl": 0.2557373046875, "learning_rate": 3.735540450945028e-07, "loss": 0.0118, "num_tokens": 16061417.0, "reward": 0.003746317932382226, "reward_std": 0.007492635864764452, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.000690353219397366, "rewards/logprob_reward/std": 0.002425077138468623, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 675.40625, "completions/mean_terminated_length": 625.607177734375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.7839506172839505, "grad_norm": 1.5233515915669114, "kl": 0.21844482421875, "learning_rate": 3.731191851730443e-07, "loss": -0.1325, "num_tokens": 16089662.0, "reward": 0.007472705096006393, "reward_std": 0.012153670191764832, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0013585611013695598, "rewards/logprob_reward/std": 0.007685181684792042, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 666.53125, "completions/mean_terminated_length": 655.0, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 1.7870370370370372, "grad_norm": 1.9879078602215337, "kl": 0.202392578125, "learning_rate": 3.7268383290269583e-07, "loss": -0.0217, "num_tokens": 16117055.0, "reward": 0.0004778489819727838, "reward_std": 0.0009556979639455676, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0005309433327056468, "rewards/logprob_reward/std": 0.001705501927062869, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 752.625, "completions/mean_terminated_length": 676.6400146484375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.7901234567901234, "grad_norm": 1.6217444321309262, "kl": 0.2469482421875, "learning_rate": 3.7224799002441427e-07, "loss": -0.0398, "num_tokens": 16148175.0, "reward": 0.00041415169835090637, "reward_std": 0.0008283034549094737, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00046016855048947036, "rewards/logprob_reward/std": 0.0026031064335256815, "step": 580 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 687.84375, "completions/mean_terminated_length": 639.8214721679688, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 1.7932098765432098, "grad_norm": 1.3112878299707869, "kl": NaN, "learning_rate": 3.718116582811186e-07, "loss": -0.0825, "num_tokens": 16176942.0, "reward": 0.0003379134286660701, "reward_std": 0.0006758268573321402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003754593781195581, "rewards/logprob_reward/std": 0.0014890246093273163, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 697.9375, "completions/mean_terminated_length": 651.357177734375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.7962962962962963, "grad_norm": 0.749183739087493, "kl": 0.2171630859375, "learning_rate": 3.713748394176827e-07, "loss": -0.013, "num_tokens": 16205860.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 639.4375, "completions/mean_terminated_length": 627.0322265625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.7993827160493827, "grad_norm": 2.1244408546901385, "kl": 0.21630859375, "learning_rate": 3.7093753518092853e-07, "loss": -0.112, "num_tokens": 16232626.0, "reward": 0.0002797901979647577, "reward_std": 0.0005595803959295154, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003108780365437269, "rewards/logprob_reward/std": 0.0009934562258422375, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 652.375, "completions/mean_terminated_length": 599.2857666015625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.8024691358024691, "grad_norm": 2.0480921821475335, "kl": 0.20709228515625, "learning_rate": 3.704997473196187e-07, "loss": -0.0799, "num_tokens": 16259786.0, "reward": 0.0033423788845539093, "reward_std": 0.006684757769107819, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00024153194681275636, "rewards/logprob_reward/std": 0.0008057018858380616, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 659.5625, "completions/mean_terminated_length": 607.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 1.8055555555555556, "grad_norm": 2.7833744522159534, "kl": 0.2177734375, "learning_rate": 3.7006147758445017e-07, "loss": -0.27, "num_tokens": 16287652.0, "reward": 0.0006784686120226979, "reward_std": 0.0012488170759752393, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0007538540521636605, "rewards/logprob_reward/std": 0.002179505070671439, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 636.6875, "completions/mean_terminated_length": 624.1935424804688, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 1.808641975308642, "grad_norm": 1.0955419928063221, "kl": 0.207275390625, "learning_rate": 3.696227277280467e-07, "loss": -0.0446, "num_tokens": 16314314.0, "reward": 0.0032903477549552917, "reward_std": 0.0065806955099105835, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00018371966143604368, "rewards/logprob_reward/std": 0.001039275317452848, "step": 586 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 755.03125, "completions/mean_terminated_length": 716.607177734375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 1.8117283950617284, "grad_norm": 1.1770169484689004, "kl": NaN, "learning_rate": 3.691834995049522e-07, "loss": -0.0274, "num_tokens": 16345195.0, "reward": 0.00325536890886724, "reward_std": 0.00651073781773448, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0001448543625883758, "rewards/logprob_reward/std": 0.0008194200927391648, "step": 587 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 737.71875, "completions/mean_terminated_length": 684.7037353515625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 1.8148148148148149, "grad_norm": 0.5399996364713595, "kl": NaN, "learning_rate": 3.687437946716234e-07, "loss": 0.0159, "num_tokens": 16375446.0, "reward": 5.639005757984705e-05, "reward_std": 0.0001127801151596941, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 6.265562115004286e-05, "rewards/logprob_reward/std": 0.00035443369415588677, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 811.53125, "completions/mean_terminated_length": 728.3912963867188, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 1.817901234567901, "grad_norm": 0.5676428595234785, "kl": 0.2337646484375, "learning_rate": 3.68303614986423e-07, "loss": 0.0251, "num_tokens": 16408123.0, "reward": 0.00019499726477079093, "reward_std": 0.00038999452954158187, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0002166636404581368, "rewards/logprob_reward/std": 0.0012256345944479108, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 719.65625, "completions/mean_terminated_length": 688.1724243164062, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 1.8209876543209877, "grad_norm": 1.594060644099542, "kl": 0.215576171875, "learning_rate": 3.6786296220961277e-07, "loss": -0.0736, "num_tokens": 16437728.0, "reward": 0.003276342060416937, "reward_std": 0.006552684120833874, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00016815779963508248, "rewards/logprob_reward/std": 0.000704619218595326, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 737.15625, "completions/mean_terminated_length": 696.1785888671875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 1.824074074074074, "grad_norm": 0.6973092693761138, "kl": 0.2528076171875, "learning_rate": 3.6742183810334605e-07, "loss": 0.0096, "num_tokens": 16468301.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 701.125, "completions/mean_terminated_length": 667.72412109375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 1.8271604938271606, "grad_norm": 0.6128440332453214, "kl": 0.214599609375, "learning_rate": 3.6698024443166134e-07, "loss": 0.0046, "num_tokens": 16497181.0, "reward": 0.00035004859091714025, "reward_std": 0.00040432787500321865, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0003889428626280278, "rewards/logprob_reward/std": 0.0015307284193113446, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 663.84375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.8302469135802468, "grad_norm": 1.7003587122824284, "kl": 0.2296142578125, "learning_rate": 3.6653818296047466e-07, "loss": -0.0474, "num_tokens": 16525244.0, "reward": 0.003290090709924698, "reward_std": 0.006580181419849396, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00018343421106692404, "rewards/logprob_reward/std": 0.0008672911208122969, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 632.09375, "completions/mean_terminated_length": 605.9666748046875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.8333333333333335, "grad_norm": 0.8095433239735778, "kl": 0.2178955078125, "learning_rate": 3.660956554575729e-07, "loss": -0.0387, "num_tokens": 16552051.0, "reward": 0.00011567265028133988, "reward_std": 0.00013375480193644762, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001285251637455076, "rewards/logprob_reward/std": 0.0005061195697635412, "step": 594 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 698.96875, "completions/mean_terminated_length": 638.7777709960938, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 1.8364197530864197, "grad_norm": 1.189685264859625, "kl": NaN, "learning_rate": 3.656526636926065e-07, "loss": -0.0132, "num_tokens": 16581354.0, "reward": 0.0004485278914216906, "reward_std": 0.0008970557828433812, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0004983643302693963, "rewards/logprob_reward/std": 0.0022451134864240885, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 643.625, "completions/mean_terminated_length": 631.3547973632812, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 1.8395061728395061, "grad_norm": 2.291384447871791, "kl": 0.2108154296875, "learning_rate": 3.652092094370826e-07, "loss": -0.188, "num_tokens": 16608210.0, "reward": 0.007215001620352268, "reward_std": 0.013606452383100986, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0010722236474975944, "rewards/logprob_reward/std": 0.0025399161968380213, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 620.03125, "completions/mean_terminated_length": 607.0, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 1.8425925925925926, "grad_norm": 1.7033004523047413, "kl": 0.247802734375, "learning_rate": 3.647652944643577e-07, "loss": -0.1362, "num_tokens": 16634491.0, "reward": 0.008209185674786568, "reward_std": 0.016418371349573135, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00564909540116787, "rewards/logprob_reward/std": 0.031682878732681274, "step": 597 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 663.8125, "completions/mean_terminated_length": 639.800048828125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 1.845679012345679, "grad_norm": 1.7798577344049977, "kl": NaN, "learning_rate": 3.6432092054963055e-07, "loss": -0.0714, "num_tokens": 16662137.0, "reward": 0.00012272670574020594, "reward_std": 0.0002454534114804119, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00013636301446240395, "rewards/logprob_reward/std": 0.0005504813161678612, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 669.3125, "completions/mean_terminated_length": 645.6666870117188, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.8487654320987654, "grad_norm": 3.8329879003761573, "kl": 0.2119140625, "learning_rate": 3.638760894699355e-07, "loss": -0.2267, "num_tokens": 16690151.0, "reward": 0.003814605064690113, "reward_std": 0.007213447708636522, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0007662278367206454, "rewards/logprob_reward/std": 0.0023001504596322775, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 666.6875, "completions/mean_terminated_length": 615.6428833007812, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.8518518518518519, "grad_norm": 1.9690954257948825, "kl": 0.189453125, "learning_rate": 3.6343080300413497e-07, "loss": -0.1472, "num_tokens": 16718129.0, "reward": 0.0006136884912848473, "reward_std": 0.0012273769825696945, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0006818760884925723, "rewards/logprob_reward/std": 0.0021795539651066065, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 616.03125, "completions/mean_terminated_length": 616.03125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 1.8549382716049383, "grad_norm": 2.267634044761325, "kl": 0.20660400390625, "learning_rate": 3.629850629329124e-07, "loss": -0.1886, "num_tokens": 16743918.0, "reward": 0.0002721587661653757, "reward_std": 0.0005443175323307514, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00030239863554015756, "rewards/logprob_reward/std": 0.0009632536093704402, "step": 601 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 637.1875, "completions/mean_terminated_length": 611.4000244140625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 1.8580246913580247, "grad_norm": 1.9216710596841131, "kl": NaN, "learning_rate": 3.625388710387651e-07, "loss": -0.117, "num_tokens": 16770708.0, "reward": 0.00043982313945889473, "reward_std": 0.0008796462789177895, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0004886924289166927, "rewards/logprob_reward/std": 0.0018993121339008212, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 636.625, "completions/mean_terminated_length": 610.800048828125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 1.8611111111111112, "grad_norm": 0.7693513544724933, "kl": 0.2528076171875, "learning_rate": 3.6209222910599746e-07, "loss": 0.0026, "num_tokens": 16797624.0, "reward": 0.00011575274402275681, "reward_std": 0.00023150548804551363, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001286141632590443, "rewards/logprob_reward/std": 0.0007275515235960484, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 706.5, "completions/mean_terminated_length": 685.3333740234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 1.8641975308641974, "grad_norm": 2.756857318491456, "kl": 0.1982421875, "learning_rate": 3.616451389207133e-07, "loss": -0.3005, "num_tokens": 16826608.0, "reward": 0.009740946814417839, "reward_std": 0.019144851714372635, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00040660687955096364, "rewards/logprob_reward/std": 0.0009685659315437078, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 669.65625, "completions/mean_terminated_length": 646.0333862304688, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.867283950617284, "grad_norm": 2.4007858570678065, "kl": 0.19091796875, "learning_rate": 3.611976022708091e-07, "loss": -0.1266, "num_tokens": 16854421.0, "reward": 0.0005778549239039421, "reward_std": 0.0011557098478078842, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0006420610588975251, "rewards/logprob_reward/std": 0.0019600477535277605, "step": 605 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 714.4375, "completions/mean_terminated_length": 682.413818359375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.8703703703703702, "grad_norm": 1.0836992968485804, "kl": NaN, "learning_rate": 3.6074962094596676e-07, "loss": -0.0144, "num_tokens": 16884259.0, "reward": 0.0002586792397778481, "reward_std": 0.0005173584795556962, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.000287421396933496, "rewards/logprob_reward/std": 0.0013689196202903986, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 638.9375, "completions/mean_terminated_length": 626.51611328125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 1.873456790123457, "grad_norm": 1.9117851121082363, "kl": 0.271728515625, "learning_rate": 3.603011967376464e-07, "loss": -0.0971, "num_tokens": 16910953.0, "reward": 0.0063897306099534035, "reward_std": 0.012779461219906807, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00015525644994340837, "rewards/logprob_reward/std": 0.0006112787523306906, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 717.1875, "completions/mean_terminated_length": 660.370361328125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 1.876543209876543, "grad_norm": 2.934403376079828, "kl": 0.221923828125, "learning_rate": 3.598523314390792e-07, "loss": -0.2261, "num_tokens": 16940703.0, "reward": 0.003488546935841441, "reward_std": 0.006977093871682882, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00040394088136963546, "rewards/logprob_reward/std": 0.001497879740782082, "step": 608 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 656.46875, "completions/mean_terminated_length": 618.4483032226562, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 1.8796296296296298, "grad_norm": 1.5286841883570508, "kl": NaN, "learning_rate": 3.594030268452601e-07, "loss": -0.1172, "num_tokens": 16968390.0, "reward": 0.007147197145968676, "reward_std": 0.012378408573567867, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0009968855883926153, "rewards/logprob_reward/std": 0.002718651667237282, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 733.59375, "completions/mean_terminated_length": 703.5516967773438, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.882716049382716, "grad_norm": 0.8153433208659888, "kl": 0.245361328125, "learning_rate": 3.5895328475294106e-07, "loss": 0.0039, "num_tokens": 16998853.0, "reward": 7.518004713347182e-05, "reward_std": 0.00015036009426694363, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 8.353338489541784e-05, "rewards/logprob_reward/std": 0.00047253616503439844, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 595.40625, "completions/mean_terminated_length": 595.40625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 1.8858024691358026, "grad_norm": 1.3981606907089654, "kl": 0.2476806640625, "learning_rate": 3.585031069606234e-07, "loss": -0.0461, "num_tokens": 17024022.0, "reward": 0.00025799532886594534, "reward_std": 0.0005159906577318907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0002866614959202707, "rewards/logprob_reward/std": 0.0010666087036952376, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 733.21875, "completions/mean_terminated_length": 703.137939453125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 1.8888888888888888, "grad_norm": 1.9300403485331896, "kl": 0.24560546875, "learning_rate": 3.5805249526855074e-07, "loss": -0.1364, "num_tokens": 17054241.0, "reward": 0.007239177823066711, "reward_std": 0.013713184744119644, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0010990869486704469, "rewards/logprob_reward/std": 0.003983082249760628, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 725.78125, "completions/mean_terminated_length": 694.9310302734375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.8919753086419753, "grad_norm": 1.614546467864275, "kl": 0.2275390625, "learning_rate": 3.5760145147870204e-07, "loss": -0.0186, "num_tokens": 17084394.0, "reward": 0.004577491898089647, "reward_std": 0.005401932634413242, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.005086102522909641, "rewards/logprob_reward/std": 0.01939297839999199, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 649.34375, "completions/mean_terminated_length": 637.258056640625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.8950617283950617, "grad_norm": 1.9682660385141868, "kl": 0.229736328125, "learning_rate": 3.571499773947839e-07, "loss": -0.1667, "num_tokens": 17111605.0, "reward": 0.003681524656713009, "reward_std": 0.007349275518208742, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0006183606456033885, "rewards/logprob_reward/std": 0.0019619932863861322, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 663.3125, "completions/mean_terminated_length": 651.6774291992188, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.8981481481481481, "grad_norm": 0.01012425679665426, "kl": 0.263916015625, "learning_rate": 3.5669807482222395e-07, "loss": 0.0003, "num_tokens": 17139279.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 640.65625, "completions/mean_terminated_length": 640.65625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 1.9012345679012346, "grad_norm": 0.006418792746033701, "kl": NaN, "learning_rate": 3.562457455681633e-07, "loss": 0.0002, "num_tokens": 17165992.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0, "rewards/logprob_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 718.3125, "completions/mean_terminated_length": 708.4515991210938, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 1.904320987654321, "grad_norm": 1.4639889924948153, "kl": 0.25054931640625, "learning_rate": 3.557929914414491e-07, "loss": -0.0644, "num_tokens": 17195434.0, "reward": 0.0037563112564384937, "reward_std": 0.006750315893441439, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0007014567381702363, "rewards/logprob_reward/std": 0.0023105114232748747, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 660.65625, "completions/mean_terminated_length": 660.65625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 1.9074074074074074, "grad_norm": 2.7532084091241784, "kl": 0.2330322265625, "learning_rate": 3.553398142526277e-07, "loss": -0.2638, "num_tokens": 17222695.0, "reward": 0.004605669528245926, "reward_std": 0.00885448046028614, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0016451881965622306, "rewards/logprob_reward/std": 0.005964573472738266, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 635.28125, "completions/mean_terminated_length": 622.741943359375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 1.9104938271604939, "grad_norm": 1.4866930666560245, "kl": 0.2518310546875, "learning_rate": 3.5488621581393736e-07, "loss": -0.0629, "num_tokens": 17249528.0, "reward": 0.003804399399086833, "reward_std": 0.007557778153568506, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.000754888285882771, "rewards/logprob_reward/std": 0.003386253025382757, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 677.40625, "completions/mean_terminated_length": 654.300048828125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 1.9135802469135803, "grad_norm": 1.6192875703341125, "kl": 0.219482421875, "learning_rate": 3.5443219793930073e-07, "loss": -0.0757, "num_tokens": 17277369.0, "reward": 0.006622261367738247, "reward_std": 0.012823529541492462, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00041362352203577757, "rewards/logprob_reward/std": 0.0013766667107120156, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 720.34375, "completions/mean_terminated_length": 700.1000366210938, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 1.9166666666666665, "grad_norm": 1.361747719158989, "kl": 0.2054443359375, "learning_rate": 3.5397776244431794e-07, "loss": 0.0265, "num_tokens": 17306976.0, "reward": 0.000303977431030944, "reward_std": 0.000607954862061888, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.000337752717314288, "rewards/logprob_reward/std": 0.0011352337896823883, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 582.59375, "completions/mean_terminated_length": 582.59375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.9197530864197532, "grad_norm": 1.888818286955456, "kl": 0.2421875, "learning_rate": 3.535229111462589e-07, "loss": -0.0519, "num_tokens": 17331479.0, "reward": 0.009957034140825272, "reward_std": 0.01969798468053341, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0006467049242928624, "rewards/logprob_reward/std": 0.0022434887941926718, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 742.5625, "completions/mean_terminated_length": 690.4444580078125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.9228395061728394, "grad_norm": 1.8183940594845527, "kl": 0.1805419921875, "learning_rate": 3.530676458640567e-07, "loss": -0.104, "num_tokens": 17361937.0, "reward": 0.006928172893822193, "reward_std": 0.01311812736093998, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0007535253535024822, "rewards/logprob_reward/std": 0.0017174641834571958, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 682.34375, "completions/mean_terminated_length": 659.5667114257812, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 1.925925925925926, "grad_norm": 1.622669129379335, "kl": 0.2322998046875, "learning_rate": 3.5261196841829957e-07, "loss": -0.1164, "num_tokens": 17390448.0, "reward": 0.0008956977399066091, "reward_std": 0.0017913954798132181, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0009952196851372719, "rewards/logprob_reward/std": 0.004388981498777866, "step": 624 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 653.6875, "completions/mean_terminated_length": 585.1111450195312, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.9290123456790123, "grad_norm": 1.9691449530904876, "kl": NaN, "learning_rate": 3.521558806312241e-07, "loss": -0.0801, "num_tokens": 17417398.0, "reward": 0.00996247585862875, "reward_std": 0.019323039799928665, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0006527507212013006, "rewards/logprob_reward/std": 0.0014504214050248265, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 782.9375, "completions/mean_terminated_length": 738.2963256835938, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 1.932098765432099, "grad_norm": 1.7416981798694386, "kl": 0.2022705078125, "learning_rate": 3.5169938432670775e-07, "loss": -0.1037, "num_tokens": 17449732.0, "reward": 0.009690329432487488, "reward_std": 0.018926044926047325, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0003503664047457278, "rewards/logprob_reward/std": 0.0011261178879067302, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 635.9375, "completions/mean_terminated_length": 635.9375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.9351851851851851, "grad_norm": 2.551253548549046, "kl": 0.227294921875, "learning_rate": 3.5124248133026187e-07, "loss": -0.1819, "num_tokens": 17476330.0, "reward": 0.013743954710662365, "reward_std": 0.020080571994185448, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0013821718748658895, "rewards/logprob_reward/std": 0.003602338256314397, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 702.8125, "completions/mean_terminated_length": 692.4515991210938, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 1.9382716049382716, "grad_norm": 2.049453800691809, "kl": 0.153564453125, "learning_rate": 3.5078517346902384e-07, "loss": -0.1668, "num_tokens": 17505348.0, "reward": 0.0038290387019515038, "reward_std": 0.007396456319838762, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0007822653278708458, "rewards/logprob_reward/std": 0.001585601014085114, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 742.5, "completions/mean_terminated_length": 702.2857666015625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.941358024691358, "grad_norm": 1.7754893581187987, "kl": 0.224609375, "learning_rate": 3.503274625717504e-07, "loss": -0.0476, "num_tokens": 17535780.0, "reward": 0.016202464699745178, "reward_std": 0.026726093143224716, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0006416262476705015, "rewards/logprob_reward/std": 0.001685622613877058, "step": 629 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 688.625, "completions/mean_terminated_length": 640.7142944335938, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.9444444444444444, "grad_norm": 1.0366987696053815, "kl": NaN, "learning_rate": 3.498693504688097e-07, "loss": -0.053, "num_tokens": 17564420.0, "reward": 0.00011124689626740292, "reward_std": 0.00022249379253480583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00012360766413621604, "rewards/logprob_reward/std": 0.0005967440083622932, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 711.09375, "completions/mean_terminated_length": 666.3928833007812, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.9475308641975309, "grad_norm": 1.9263338689613325, "kl": 0.2337646484375, "learning_rate": 3.494108389921744e-07, "loss": -0.1325, "num_tokens": 17593547.0, "reward": 0.006682299077510834, "reward_std": 0.01326083205640316, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0004803319461643696, "rewards/logprob_reward/std": 0.0015430138446390629, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 622.0, "completions/mean_terminated_length": 622.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.9506172839506173, "grad_norm": 1.1144940461560273, "kl": 0.2257080078125, "learning_rate": 3.4895192997541436e-07, "loss": -0.0614, "num_tokens": 17619683.0, "reward": 0.00013048779510427266, "reward_std": 0.00026097559020854533, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0001449864503229037, "rewards/logprob_reward/std": 0.0008201671880669892, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 657.375, "completions/mean_terminated_length": 657.375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 1.9537037037037037, "grad_norm": 2.7741367405573274, "kl": 0.20556640625, "learning_rate": 3.484926252536891e-07, "loss": -0.3178, "num_tokens": 17646827.0, "reward": 0.006848607212305069, "reward_std": 0.013298070058226585, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0006651186849921942, "rewards/logprob_reward/std": 0.001522500766441226, "step": 633 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 776.34375, "completions/mean_terminated_length": 707.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 1.9567901234567902, "grad_norm": 1.2660058001746237, "kl": NaN, "learning_rate": 3.4803292666374047e-07, "loss": -0.0422, "num_tokens": 17678622.0, "reward": 0.0034396664705127478, "reward_std": 0.006518971174955368, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00034962897188961506, "rewards/logprob_reward/std": 0.0009765062131918967, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 620.5, "completions/mean_terminated_length": 620.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 1.9598765432098766, "grad_norm": 2.3222121088229892, "kl": 0.20465087890625, "learning_rate": 3.4757283604388546e-07, "loss": -0.3146, "num_tokens": 17704882.0, "reward": 0.010148421861231327, "reward_std": 0.014225448481738567, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0008593578240834177, "rewards/logprob_reward/std": 0.002076053060591221, "step": 635 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 644.96875, "completions/mean_terminated_length": 632.741943359375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.9629629629629628, "grad_norm": 2.857289780048274, "kl": NaN, "learning_rate": 3.47112355234009e-07, "loss": -0.2991, "num_tokens": 17731801.0, "reward": 0.00415524048730731, "reward_std": 0.007971592247486115, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0011447113938629627, "rewards/logprob_reward/std": 0.0035985566210001707, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 689.40625, "completions/mean_terminated_length": 654.7930908203125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.9660493827160495, "grad_norm": 2.6457668450419636, "kl": 0.2266845703125, "learning_rate": 3.466514860755559e-07, "loss": -0.2339, "num_tokens": 17760614.0, "reward": 0.003990027587860823, "reward_std": 0.0076071722432971, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0009611418936401606, "rewards/logprob_reward/std": 0.0026789382100105286, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 748.0625, "completions/mean_terminated_length": 708.6428833007812, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.9691358024691357, "grad_norm": 3.398117563461742, "kl": 0.2039794921875, "learning_rate": 3.4619023041152433e-07, "loss": -0.3676, "num_tokens": 17791488.0, "reward": 0.009694833308458328, "reward_std": 0.01901283487677574, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0003553707501851022, "rewards/logprob_reward/std": 0.0011662804754450917, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 666.6875, "completions/mean_terminated_length": 655.1612548828125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 1.9722222222222223, "grad_norm": 3.980949939208548, "kl": 0.1934814453125, "learning_rate": 3.4572859008645796e-07, "loss": -0.7152, "num_tokens": 17819310.0, "reward": 0.01366504654288292, "reward_std": 0.021684300154447556, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0012944953050464392, "rewards/logprob_reward/std": 0.003360053990036249, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 642.875, "completions/mean_terminated_length": 630.5806274414062, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 1.9753086419753085, "grad_norm": 2.097082282872992, "kl": 0.2086181640625, "learning_rate": 3.452665669464386e-07, "loss": -0.1114, "num_tokens": 17846250.0, "reward": 0.012979520484805107, "reward_std": 0.020176339894533157, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0005327996914274991, "rewards/logprob_reward/std": 0.0012611752608790994, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 678.375, "completions/mean_terminated_length": 655.3333740234375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 1.9783950617283952, "grad_norm": 2.0341866997312157, "kl": 0.20025634765625, "learning_rate": 3.448041628390791e-07, "loss": -0.1804, "num_tokens": 17874178.0, "reward": 0.010370473377406597, "reward_std": 0.014955338090658188, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0011060814140364528, "rewards/logprob_reward/std": 0.002451796317473054, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 773.40625, "completions/mean_terminated_length": 703.239990234375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 1.9814814814814814, "grad_norm": 1.768869897826344, "kl": 0.195556640625, "learning_rate": 3.443413796135159e-07, "loss": -0.1654, "num_tokens": 17906123.0, "reward": 0.006963692139834166, "reward_std": 0.013696135021746159, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0007929910207167268, "rewards/logprob_reward/std": 0.002285647438839078, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 657.125, "completions/mean_terminated_length": 657.125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 1.984567901234568, "grad_norm": 1.7234238536710782, "kl": 0.1939697265625, "learning_rate": 3.4387821912040116e-07, "loss": -0.144, "num_tokens": 17933367.0, "reward": 0.01052926853299141, "reward_std": 0.014431968331336975, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0012825197773054242, "rewards/logprob_reward/std": 0.0020001446828246117, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 730.65625, "completions/mean_terminated_length": 711.1000366210938, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 1.9876543209876543, "grad_norm": 2.919212565274168, "kl": 0.19921875, "learning_rate": 3.4341468321189574e-07, "loss": -0.2925, "num_tokens": 17963196.0, "reward": 0.004549227189272642, "reward_std": 0.008575081825256348, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0015824747970327735, "rewards/logprob_reward/std": 0.00603429926559329, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 672.34375, "completions/mean_terminated_length": 672.34375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.9907407407407407, "grad_norm": 1.714941870799897, "kl": 0.1932373046875, "learning_rate": 3.4295077374166214e-07, "loss": -0.0636, "num_tokens": 17990939.0, "reward": 0.014002135023474693, "reward_std": 0.013756480067968369, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0016690394841134548, "rewards/logprob_reward/std": 0.003402333240956068, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 648.5625, "completions/mean_terminated_length": 648.5625, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 1.9938271604938271, "grad_norm": 1.9059464070252758, "kl": 0.2001953125, "learning_rate": 3.4248649256485655e-07, "loss": -0.1257, "num_tokens": 18017729.0, "reward": 0.004415285307914019, "reward_std": 0.008087873458862305, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0014336502645164728, "rewards/logprob_reward/std": 0.004082860425114632, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 720.90625, "completions/mean_terminated_length": 720.90625, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 1.9969135802469136, "grad_norm": 2.861117188565715, "kl": 0.18402099609375, "learning_rate": 3.4202184153812135e-07, "loss": -0.2984, "num_tokens": 18047642.0, "reward": 0.011068264953792095, "reward_std": 0.02071787789463997, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0018814050126820803, "rewards/logprob_reward/std": 0.0049293856136500835, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 701.65625, "completions/mean_terminated_length": 668.3103637695312, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.0, "grad_norm": 3.4446847588007503, "kl": 0.183349609375, "learning_rate": 3.415568225195783e-07, "loss": -0.3439, "num_tokens": 18076455.0, "reward": 0.013659604825079441, "reward_std": 0.020921621471643448, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0012884484604001045, "rewards/logprob_reward/std": 0.002464776625856757, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 639.34375, "completions/mean_terminated_length": 639.34375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 2.003086419753086, "grad_norm": 2.8367048053235058, "kl": 0.2159423828125, "learning_rate": 3.410914373688205e-07, "loss": -0.2831, "num_tokens": 18103130.0, "reward": 0.010716721415519714, "reward_std": 0.020568618550896645, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.00149080203846097, "rewards/logprob_reward/std": 0.0031364334281533957, "step": 649 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 737.59375, "completions/mean_terminated_length": 696.6785888671875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.006172839506173, "grad_norm": 1.7818935639622824, "kl": NaN, "learning_rate": 3.4062568794690536e-07, "loss": -0.0913, "num_tokens": 18133505.0, "reward": 0.008726210333406925, "reward_std": 0.014377452433109283, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.002751345979049802, "rewards/logprob_reward/std": 0.008913630619645119, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 787.0, "completions/mean_terminated_length": 708.0, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 2.009259259259259, "grad_norm": 2.5376291886189506, "kl": 0.2034912109375, "learning_rate": 3.401595761163468e-07, "loss": -0.2945, "num_tokens": 18165753.0, "reward": 0.0041171349585056305, "reward_std": 0.007853616960346699, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0011023720726370811, "rewards/logprob_reward/std": 0.0028409750666469336, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 687.03125, "completions/mean_terminated_length": 638.8928833007812, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.0123456790123457, "grad_norm": 2.165864505710753, "kl": 0.217041015625, "learning_rate": 3.3969310374110817e-07, "loss": -0.128, "num_tokens": 18194070.0, "reward": 0.011410648003220558, "reward_std": 0.015654362738132477, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0022618311922997236, "rewards/logprob_reward/std": 0.005047931801527739, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 770.875, "completions/mean_terminated_length": 724.0, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 2.015432098765432, "grad_norm": 3.129542329363188, "kl": 0.206787109375, "learning_rate": 3.3922627268659467e-07, "loss": -0.3592, "num_tokens": 18225798.0, "reward": 0.006686346139758825, "reward_std": 0.013143929652869701, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00048482915735803545, "rewards/logprob_reward/std": 0.0012821360724046826, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 700.375, "completions/mean_terminated_length": 678.800048828125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.0185185185185186, "grad_norm": 1.7025150053825844, "kl": 0.1939697265625, "learning_rate": 3.387590848196456e-07, "loss": -0.075, "num_tokens": 18254870.0, "reward": 0.020044365897774696, "reward_std": 0.0279097743332386, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0014381844084709883, "rewards/logprob_reward/std": 0.002929187845438719, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 745.75, "completions/mean_terminated_length": 716.9655151367188, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.021604938271605, "grad_norm": 1.3789003528592825, "kl": 0.1744384765625, "learning_rate": 3.382915420085274e-07, "loss": -0.0817, "num_tokens": 18285474.0, "reward": 0.00401701033115387, "reward_std": 0.007092323154211044, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0009911225643008947, "rewards/logprob_reward/std": 0.001909551676362753, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 646.59375, "completions/mean_terminated_length": 646.59375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 2.0246913580246915, "grad_norm": 1.4971325155816617, "kl": 0.1964111328125, "learning_rate": 3.3782364612292574e-07, "loss": -0.0537, "num_tokens": 18312153.0, "reward": 0.0006674743490293622, "reward_std": 0.0008368261624127626, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0007416382431983948, "rewards/logprob_reward/std": 0.0015969941159710288, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 714.59375, "completions/mean_terminated_length": 682.586181640625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 2.0277777777777777, "grad_norm": 2.1720589107666544, "kl": 0.2088623046875, "learning_rate": 3.3735539903393826e-07, "loss": -0.0993, "num_tokens": 18341308.0, "reward": 0.01398205291479826, "reward_std": 0.020772812888026237, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0016467259265482426, "rewards/logprob_reward/std": 0.0027846088632941246, "step": 657 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 719.25, "completions/mean_terminated_length": 709.4193115234375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 2.0308641975308643, "grad_norm": 1.333437859197381, "kl": NaN, "learning_rate": 3.368868026140672e-07, "loss": 0.0051, "num_tokens": 18370472.0, "reward": 0.00979865062981844, "reward_std": 0.014177567325532436, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0004707224143203348, "rewards/logprob_reward/std": 0.001750817522406578, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 699.03125, "completions/mean_terminated_length": 677.36669921875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.0339506172839505, "grad_norm": 2.1036568152035935, "kl": 0.1922607421875, "learning_rate": 3.364178587372115e-07, "loss": -0.188, "num_tokens": 18399049.0, "reward": 0.004447158891707659, "reward_std": 0.00822368636727333, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0014690652024000883, "rewards/logprob_reward/std": 0.0029987546149641275, "step": 659 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 731.15625, "completions/mean_terminated_length": 663.5769653320312, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.037037037037037, "grad_norm": 1.9055595221485488, "kl": NaN, "learning_rate": 3.359485692786597e-07, "loss": -0.144, "num_tokens": 18428938.0, "reward": 0.006459952797740698, "reward_std": 0.012794826179742813, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00023328073439188302, "rewards/logprob_reward/std": 0.0006398450350388885, "step": 660 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 708.5625, "completions/mean_terminated_length": 663.5, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 2.0401234567901234, "grad_norm": 1.266916597384057, "kl": NaN, "learning_rate": 3.354789361150824e-07, "loss": -0.0315, "num_tokens": 18457924.0, "reward": 0.0071504078805446625, "reward_std": 0.013596047647297382, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0010004526702687144, "rewards/logprob_reward/std": 0.002534077037125826, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 701.4375, "completions/mean_terminated_length": 679.933349609375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.04320987654321, "grad_norm": 1.9044580996992468, "kl": 0.1959228515625, "learning_rate": 3.350089611245246e-07, "loss": -0.185, "num_tokens": 18486814.0, "reward": 0.00792771764099598, "reward_std": 0.009091407991945744, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0018641313072293997, "rewards/logprob_reward/std": 0.0034033788833767176, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 776.4375, "completions/mean_terminated_length": 750.8275756835938, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 2.0462962962962963, "grad_norm": 1.8380414945608772, "kl": 0.17919921875, "learning_rate": 3.345386461863981e-07, "loss": -0.0741, "num_tokens": 18518560.0, "reward": 0.018665527924895287, "reward_std": 0.00801034551113844, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0033783656544983387, "rewards/logprob_reward/std": 0.005880396813154221, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 708.0625, "completions/mean_terminated_length": 708.0625, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 2.049382716049383, "grad_norm": 1.9473130330652877, "kl": 0.1395263671875, "learning_rate": 3.340679931814743e-07, "loss": -0.2052, "num_tokens": 18547734.0, "reward": 0.016261916607618332, "reward_std": 0.02662317454814911, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0007076855399645865, "rewards/logprob_reward/std": 0.0016589416190981865, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 736.71875, "completions/mean_terminated_length": 695.6785888671875, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 2.052469135802469, "grad_norm": 1.703841123020719, "kl": 0.189453125, "learning_rate": 3.3359700399187654e-07, "loss": -0.1388, "num_tokens": 18577937.0, "reward": 0.004150180146098137, "reward_std": 0.007443716283887625, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.001139089115895331, "rewards/logprob_reward/std": 0.003203223692253232, "step": 665 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 698.125, "completions/mean_terminated_length": 698.125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.0555555555555554, "grad_norm": 2.475517824742073, "kl": NaN, "learning_rate": 3.331256805010724e-07, "loss": -0.2655, "num_tokens": 18606221.0, "reward": 0.007154985796660185, "reward_std": 0.013995552435517311, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0010055399034172297, "rewards/logprob_reward/std": 0.0031242729164659977, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 683.28125, "completions/mean_terminated_length": 672.290283203125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 2.058641975308642, "grad_norm": 1.6183520507006008, "kl": 0.1864013671875, "learning_rate": 3.326540245938666e-07, "loss": -0.074, "num_tokens": 18634210.0, "reward": 0.0033837261144071817, "reward_std": 0.006697164848446846, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0002874734054785222, "rewards/logprob_reward/std": 0.0007958361529745162, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 656.65625, "completions/mean_terminated_length": 618.6551513671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.0617283950617282, "grad_norm": 2.0183462162596433, "kl": 0.1925048828125, "learning_rate": 3.3218203815639265e-07, "loss": -0.1317, "num_tokens": 18661287.0, "reward": 0.009772385470569134, "reward_std": 0.019369445741176605, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0004415393341332674, "rewards/logprob_reward/std": 0.0011397881899029016, "step": 668 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 805.0625, "completions/mean_terminated_length": 743.760009765625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 2.064814814814815, "grad_norm": 1.7423276364718858, "kl": NaN, "learning_rate": 3.3170972307610654e-07, "loss": -0.0686, "num_tokens": 18694153.0, "reward": 0.006426104810088873, "reward_std": 0.012474000453948975, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.00019567189156077802, "rewards/logprob_reward/std": 0.0009213192970491946, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 728.625, "completions/mean_terminated_length": 719.0967407226562, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.067901234567901, "grad_norm": 1.6042362855286607, "kl": 0.182373046875, "learning_rate": 3.312370812417779e-07, "loss": -0.0821, "num_tokens": 18723577.0, "reward": 0.0014206302585080266, "reward_std": 0.0014337702887132764, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.001578478142619133, "rewards/logprob_reward/std": 0.0031967894174158573, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 716.59375, "completions/mean_terminated_length": 706.6773681640625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 2.0709876543209877, "grad_norm": 2.557187308566611, "kl": 0.1739501953125, "learning_rate": 3.3076411454348336e-07, "loss": -0.1886, "num_tokens": 18752736.0, "reward": 0.011115819215774536, "reward_std": 0.0204685777425766, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0019342433661222458, "rewards/logprob_reward/std": 0.0029235610272735357, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 759.03125, "completions/mean_terminated_length": 721.1785888671875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 2.074074074074074, "grad_norm": 3.7069889032155583, "kl": 0.1595458984375, "learning_rate": 3.3029082487259847e-07, "loss": -0.3376, "num_tokens": 18783641.0, "reward": 0.00557498587295413, "reward_std": 0.01114997174590826, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0027222069911658764, "rewards/logprob_reward/std": 0.009254386648535728, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 683.3125, "completions/mean_terminated_length": 660.6000366210938, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 2.0771604938271606, "grad_norm": 2.615035737173366, "kl": 0.21624755859375, "learning_rate": 3.298172141217905e-07, "loss": -0.3121, "num_tokens": 18811987.0, "reward": 0.007663974072784185, "reward_std": 0.01448277197778225, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0015710823936387897, "rewards/logprob_reward/std": 0.0026468480937182903, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 608.21875, "completions/mean_terminated_length": 608.21875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.080246913580247, "grad_norm": 3.429308325454037, "kl": 0.1944580078125, "learning_rate": 3.2934328418501064e-07, "loss": -0.427, "num_tokens": 18837502.0, "reward": 0.01980498433113098, "reward_std": 0.03812722861766815, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0011722053168341517, "rewards/logprob_reward/std": 0.0017390131251886487, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 743.78125, "completions/mean_terminated_length": 714.7930908203125, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.0833333333333335, "grad_norm": 2.077398647842541, "kl": 0.1700439453125, "learning_rate": 3.2886903695748647e-07, "loss": -0.1476, "num_tokens": 18867563.0, "reward": 0.010390140116214752, "reward_std": 0.019611971452832222, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.001127932919189334, "rewards/logprob_reward/std": 0.0022287664469331503, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 647.96875, "completions/mean_terminated_length": 622.9000244140625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.0864197530864197, "grad_norm": 3.081639433507794, "kl": 0.1923828125, "learning_rate": 3.2839447433571454e-07, "loss": -0.3706, "num_tokens": 18894266.0, "reward": 0.004312549717724323, "reward_std": 0.00768632534891367, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0013194999191910028, "rewards/logprob_reward/std": 0.0021036118268966675, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 693.9375, "completions/mean_terminated_length": 693.9375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.0895061728395063, "grad_norm": 1.7451066403736228, "kl": 0.197021484375, "learning_rate": 3.279195982174524e-07, "loss": -0.0397, "num_tokens": 18922744.0, "reward": 0.016954107210040092, "reward_std": 0.02626674249768257, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.001476786215789616, "rewards/logprob_reward/std": 0.002165104728192091, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 778.5625, "completions/mean_terminated_length": 721.923095703125, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 2.0925925925925926, "grad_norm": 1.7709637996139636, "kl": 0.1868896484375, "learning_rate": 3.2744441050171136e-07, "loss": -0.0327, "num_tokens": 18954366.0, "reward": 0.017053451389074326, "reward_std": 0.02214268036186695, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.00158716703299433, "rewards/logprob_reward/std": 0.0030286668334156275, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 674.21875, "completions/mean_terminated_length": 638.0344848632812, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 2.095679012345679, "grad_norm": 2.2810111545190623, "kl": 0.1983642578125, "learning_rate": 3.26968913088749e-07, "loss": -0.0857, "num_tokens": 18982253.0, "reward": 0.006142496131360531, "reward_std": 0.008889172226190567, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.003352772444486618, "rewards/logprob_reward/std": 0.006975427269935608, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 607.5, "completions/mean_terminated_length": 607.5, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.0987654320987654, "grad_norm": 2.4110704442814, "kl": 0.195556640625, "learning_rate": 3.264931078800611e-07, "loss": -0.2248, "num_tokens": 19007681.0, "reward": 0.01696503534913063, "reward_std": 0.019929613918066025, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0014889281010255218, "rewards/logprob_reward/std": 0.0025145707186311483, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 652.34375, "completions/mean_terminated_length": 627.5667114257812, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 2.1018518518518516, "grad_norm": 1.9426391620859809, "kl": 0.197265625, "learning_rate": 3.260169967783744e-07, "loss": -0.1323, "num_tokens": 19034720.0, "reward": 0.029838038608431816, "reward_std": 0.027410298585891724, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0019033756107091904, "rewards/logprob_reward/std": 0.0024241837672889233, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 742.375, "completions/mean_terminated_length": 713.2413940429688, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.1049382716049383, "grad_norm": 2.548683868468059, "kl": 0.17132568359375, "learning_rate": 3.255405816876389e-07, "loss": -0.2088, "num_tokens": 19065240.0, "reward": 0.010009588673710823, "reward_std": 0.019502948969602585, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0007050983840599656, "rewards/logprob_reward/std": 0.0015305898850783706, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 842.28125, "completions/mean_terminated_length": 771.1739501953125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 2.1080246913580245, "grad_norm": 2.351741854366014, "kl": 0.19775390625, "learning_rate": 3.250638645130204e-07, "loss": -0.1794, "num_tokens": 19099337.0, "reward": 0.013157753273844719, "reward_std": 0.02523644268512726, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0007308362983167171, "rewards/logprob_reward/std": 0.002028500894084573, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 689.21875, "completions/mean_terminated_length": 654.586181640625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.111111111111111, "grad_norm": 1.7230649399481124, "kl": 0.21240234375, "learning_rate": 3.2458684716089224e-07, "loss": -0.1002, "num_tokens": 19127832.0, "reward": 0.014887186698615551, "reward_std": 0.02193785086274147, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0026524297427386045, "rewards/logprob_reward/std": 0.005362158641219139, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 713.09375, "completions/mean_terminated_length": 713.09375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 2.1141975308641974, "grad_norm": 2.10335220372187, "kl": 0.1942138671875, "learning_rate": 3.241095315388287e-07, "loss": -0.1358, "num_tokens": 19157211.0, "reward": 0.025678927078843117, "reward_std": 0.03614753484725952, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.004226584918797016, "rewards/logprob_reward/std": 0.007038444746285677, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 667.0, "completions/mean_terminated_length": 655.4838256835938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.117283950617284, "grad_norm": 2.013781515668275, "kl": 0.185302734375, "learning_rate": 3.2363191955559656e-07, "loss": -0.0637, "num_tokens": 19184643.0, "reward": 0.013850882649421692, "reward_std": 0.013981991447508335, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.001500982092693448, "rewards/logprob_reward/std": 0.0026388971600681543, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 628.65625, "completions/mean_terminated_length": 628.65625, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 2.1203703703703702, "grad_norm": 1.7190092386605302, "kl": 0.23095703125, "learning_rate": 3.231540131211478e-07, "loss": -0.0553, "num_tokens": 19211044.0, "reward": 0.007108983118087053, "reward_std": 0.013545095920562744, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0009544256026856601, "rewards/logprob_reward/std": 0.0021833155769854784, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 640.28125, "completions/mean_terminated_length": 640.28125, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 2.123456790123457, "grad_norm": 3.071812389989443, "kl": 0.1951904296875, "learning_rate": 3.22675814146612e-07, "loss": -0.2527, "num_tokens": 19237345.0, "reward": 0.022753456607460976, "reward_std": 0.03936662897467613, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0009760628454387188, "rewards/logprob_reward/std": 0.0018598815659061074, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 640.15625, "completions/mean_terminated_length": 627.774169921875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 2.126543209876543, "grad_norm": 2.2048609829488877, "kl": 0.2049560546875, "learning_rate": 3.221973245442883e-07, "loss": -0.1128, "num_tokens": 19263990.0, "reward": 0.011205012910068035, "reward_std": 0.015312530100345612, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0020333474967628717, "rewards/logprob_reward/std": 0.0030404385179281235, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 777.78125, "completions/mean_terminated_length": 732.1851806640625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 2.1296296296296298, "grad_norm": 1.634631378336532, "kl": 0.17413330078125, "learning_rate": 3.217185462276382e-07, "loss": -0.1329, "num_tokens": 19295775.0, "reward": 0.007836061529815197, "reward_std": 0.014449788257479668, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0017622908344492316, "rewards/logprob_reward/std": 0.003121594898402691, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 708.8125, "completions/mean_terminated_length": 650.4444580078125, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 2.132716049382716, "grad_norm": 2.283304174740487, "kl": 0.2203369140625, "learning_rate": 3.2123948111127795e-07, "loss": -0.1059, "num_tokens": 19324805.0, "reward": 0.013735933229327202, "reward_std": 0.026334304362535477, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.001373259350657463, "rewards/logprob_reward/std": 0.002092954469844699, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 733.8125, "completions/mean_terminated_length": 703.7930908203125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.1358024691358026, "grad_norm": 3.544811007756336, "kl": 0.2158203125, "learning_rate": 3.2076013111097055e-07, "loss": -0.2391, "num_tokens": 19355451.0, "reward": 0.014487557113170624, "reward_std": 0.01965447887778282, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.005680619738996029, "rewards/logprob_reward/std": 0.00881699938327074, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 757.15625, "completions/mean_terminated_length": 695.5769653320312, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 2.138888888888889, "grad_norm": 1.7368220628411621, "kl": 0.182373046875, "learning_rate": 3.20280498143618e-07, "loss": -0.0938, "num_tokens": 19386480.0, "reward": 0.010229920968413353, "reward_std": 0.014454798772931099, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0009499117149971426, "rewards/logprob_reward/std": 0.0017783971270546317, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 698.40625, "completions/mean_terminated_length": 664.72412109375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 2.1419753086419755, "grad_norm": 1.6837417520038844, "kl": 0.1942138671875, "learning_rate": 3.1980058412725436e-07, "loss": 0.0112, "num_tokens": 19415325.0, "reward": 0.013666713610291481, "reward_std": 0.02048582024872303, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0012963481713086367, "rewards/logprob_reward/std": 0.002260439097881317, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 793.34375, "completions/mean_terminated_length": 740.1154174804688, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 2.1450617283950617, "grad_norm": 1.8016214015142245, "kl": 0.17626953125, "learning_rate": 3.1932039098103723e-07, "loss": -0.0566, "num_tokens": 19447440.0, "reward": 0.023225625976920128, "reward_std": 0.019267966970801353, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0015006960602477193, "rewards/logprob_reward/std": 0.002450818894430995, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 774.59375, "completions/mean_terminated_length": 728.4074096679688, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.148148148148148, "grad_norm": 2.168445493493645, "kl": 0.1729736328125, "learning_rate": 3.188399206252406e-07, "loss": -0.0589, "num_tokens": 19479239.0, "reward": 0.010763797909021378, "reward_std": 0.018367238342761993, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0050153303891420364, "rewards/logprob_reward/std": 0.009430542588233948, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 740.96875, "completions/mean_terminated_length": 688.5555419921875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.1512345679012346, "grad_norm": 1.7668619588278764, "kl": 0.1787109375, "learning_rate": 3.183591749812468e-07, "loss": -0.1101, "num_tokens": 19509878.0, "reward": 0.011277096346020699, "reward_std": 0.015580618754029274, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0021134396083652973, "rewards/logprob_reward/std": 0.004006913397461176, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 724.3125, "completions/mean_terminated_length": 640.3999633789062, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 2.154320987654321, "grad_norm": 2.014204884923914, "kl": 0.201416015625, "learning_rate": 3.1787815597153934e-07, "loss": -0.0338, "num_tokens": 19539788.0, "reward": 0.007484388537704945, "reward_std": 0.01386910118162632, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.001371542806737125, "rewards/logprob_reward/std": 0.0021594043355435133, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 653.46875, "completions/mean_terminated_length": 641.51611328125, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 2.1574074074074074, "grad_norm": 2.0127198116107867, "kl": 0.2174072265625, "learning_rate": 3.173968655196947e-07, "loss": -0.0456, "num_tokens": 19566687.0, "reward": 0.010289727710187435, "reward_std": 0.014571724459528923, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0010163638507947326, "rewards/logprob_reward/std": 0.0018873271765187383, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 804.34375, "completions/mean_terminated_length": 753.6538696289062, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 2.1604938271604937, "grad_norm": 2.6669954316042555, "kl": 0.17919921875, "learning_rate": 3.1691530555037493e-07, "loss": -0.2577, "num_tokens": 19599266.0, "reward": 0.013990317471325397, "reward_std": 0.02109478786587715, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0016559083014726639, "rewards/logprob_reward/std": 0.0022258798126131296, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 744.46875, "completions/mean_terminated_length": 735.4515991210938, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 2.1635802469135803, "grad_norm": 2.5513428031933043, "kl": 0.16241455078125, "learning_rate": 3.164334779893198e-07, "loss": -0.2988, "num_tokens": 19630097.0, "reward": 0.003614734159782529, "reward_std": 0.003220351180061698, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.004016371443867683, "rewards/logprob_reward/std": 0.0058664362877607346, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 731.71875, "completions/mean_terminated_length": 731.71875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.1666666666666665, "grad_norm": 1.973620737873119, "kl": 0.176513671875, "learning_rate": 3.159513847633393e-07, "loss": -0.0696, "num_tokens": 19659864.0, "reward": 0.014876470901072025, "reward_std": 0.026599494740366936, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.002640522550791502, "rewards/logprob_reward/std": 0.0033077364787459373, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 740.9375, "completions/mean_terminated_length": 722.0667114257812, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 2.169753086419753, "grad_norm": 1.8805013932396657, "kl": 0.1842041015625, "learning_rate": 3.1546902780030555e-07, "loss": -0.0033, "num_tokens": 19690006.0, "reward": 0.01714780181646347, "reward_std": 0.027524225413799286, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0016920019406825304, "rewards/logprob_reward/std": 0.002416391856968403, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 713.0625, "completions/mean_terminated_length": 655.4815063476562, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 2.1728395061728394, "grad_norm": 2.9018920083774518, "kl": 0.21142578125, "learning_rate": 3.1498640902914565e-07, "loss": -0.1858, "num_tokens": 19719600.0, "reward": 0.015629183501005173, "reward_std": 0.022466342896223068, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.003476869547739625, "rewards/logprob_reward/std": 0.004384683445096016, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 648.8125, "completions/mean_terminated_length": 636.7096557617188, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.175925925925926, "grad_norm": 1.8384344471623555, "kl": 0.2064208984375, "learning_rate": 3.1450353037983346e-07, "loss": -0.1312, "num_tokens": 19746418.0, "reward": 0.017334118485450745, "reward_std": 0.027243902906775475, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.001899020280689001, "rewards/logprob_reward/std": 0.003224569372832775, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 684.84375, "completions/mean_terminated_length": 649.7586059570312, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.1790123456790123, "grad_norm": 1.7258756181759107, "kl": 0.2039794921875, "learning_rate": 3.140203937833821e-07, "loss": -0.038, "num_tokens": 19774681.0, "reward": 0.007824774831533432, "reward_std": 0.013877566903829575, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0017497497610747814, "rewards/logprob_reward/std": 0.0029178455006331205, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 671.15625, "completions/mean_terminated_length": 634.6551513671875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.182098765432099, "grad_norm": 3.5582507821007114, "kl": 0.21875, "learning_rate": 3.135370011718364e-07, "loss": -0.2457, "num_tokens": 19802574.0, "reward": 0.022976461797952652, "reward_std": 0.03377217799425125, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0012238477356731892, "rewards/logprob_reward/std": 0.002058252226561308, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 675.09375, "completions/mean_terminated_length": 663.8386840820312, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 2.185185185185185, "grad_norm": 2.066657496155649, "kl": 0.218505859375, "learning_rate": 3.1305335447826477e-07, "loss": -0.0358, "num_tokens": 19830921.0, "reward": 0.027299972251057625, "reward_std": 0.0403885543346405, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0025555237662047148, "rewards/logprob_reward/std": 0.002870726864784956, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 697.84375, "completions/mean_terminated_length": 687.3225708007812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.1882716049382718, "grad_norm": 1.8649316470081814, "kl": 0.1871337890625, "learning_rate": 3.125694556367517e-07, "loss": -0.0515, "num_tokens": 19859400.0, "reward": 0.02298777922987938, "reward_std": 0.03882042318582535, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0012364235008135438, "rewards/logprob_reward/std": 0.0017622843151912093, "step": 709 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 831.40625, "completions/mean_terminated_length": 681.6111450195312, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.191358024691358, "grad_norm": 2.2503754641255838, "kl": NaN, "learning_rate": 3.1208530658239e-07, "loss": -0.0248, "num_tokens": 19893309.0, "reward": 0.00822516344487667, "reward_std": 0.01045928057283163, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.002194626722484827, "rewards/logprob_reward/std": 0.005198339931666851, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 706.6875, "completions/mean_terminated_length": 696.4515991210938, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 2.1944444444444446, "grad_norm": 1.9686203354037635, "kl": 0.2001953125, "learning_rate": 3.1160090925127325e-07, "loss": -0.1546, "num_tokens": 19922259.0, "reward": 0.024451250210404396, "reward_std": 0.030137833207845688, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.002862499561160803, "rewards/logprob_reward/std": 0.004403434693813324, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 706.3125, "completions/mean_terminated_length": 660.9285888671875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 2.197530864197531, "grad_norm": 2.0815079933441116, "kl": 0.2012939453125, "learning_rate": 3.1111626558048777e-07, "loss": -0.1331, "num_tokens": 19951337.0, "reward": 0.01722552999854088, "reward_std": 0.02297770418226719, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0017783672083169222, "rewards/logprob_reward/std": 0.003334183944389224, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 692.8125, "completions/mean_terminated_length": 692.8125, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.200617283950617, "grad_norm": 1.8152786652921813, "kl": 0.2042236328125, "learning_rate": 3.1063137750810493e-07, "loss": -0.063, "num_tokens": 19980051.0, "reward": 0.014371870085597038, "reward_std": 0.02127542346715927, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.002079855417832732, "rewards/logprob_reward/std": 0.0030030657071620226, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 740.0, "completions/mean_terminated_length": 699.4285888671875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 2.2037037037037037, "grad_norm": 3.001523321516778, "kl": 0.1907958984375, "learning_rate": 3.101462469731735e-07, "loss": -0.3082, "num_tokens": 20010515.0, "reward": 0.008616460487246513, "reward_std": 0.015965763479471207, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0026294009294360876, "rewards/logprob_reward/std": 0.004724571481347084, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 669.1875, "completions/mean_terminated_length": 645.5333862304688, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 2.20679012345679, "grad_norm": 1.5780098412505246, "kl": 0.2098388671875, "learning_rate": 3.0966087591571184e-07, "loss": -0.0214, "num_tokens": 20038189.0, "reward": 0.02048753760755062, "reward_std": 0.022141898050904274, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001930593978613615, "rewards/logprob_reward/std": 0.003870361717417836, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 685.90625, "completions/mean_terminated_length": 663.36669921875, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 2.2098765432098766, "grad_norm": 1.9906976149586653, "kl": 0.19647216796875, "learning_rate": 3.091752662767001e-07, "loss": -0.0742, "num_tokens": 20066510.0, "reward": 0.02095744013786316, "reward_std": 0.03419157490134239, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0024527120403945446, "rewards/logprob_reward/std": 0.004117546137422323, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 667.84375, "completions/mean_terminated_length": 656.3547973632812, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.212962962962963, "grad_norm": 2.1312491887377787, "kl": 0.2139892578125, "learning_rate": 3.0868941999807274e-07, "loss": -0.1352, "num_tokens": 20094169.0, "reward": 0.011528492905199528, "reward_std": 0.020114785060286522, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.002392769791185856, "rewards/logprob_reward/std": 0.00551227293908596, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 664.90625, "completions/mean_terminated_length": 664.90625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 2.2160493827160495, "grad_norm": 1.819554122367732, "kl": 0.197265625, "learning_rate": 3.082033390227102e-07, "loss": 0.0268, "num_tokens": 20121990.0, "reward": 0.01730799302458763, "reward_std": 0.027153251692652702, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.001869992003776133, "rewards/logprob_reward/std": 0.0025917806196957827, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 662.0625, "completions/mean_terminated_length": 662.0625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 2.2191358024691357, "grad_norm": 1.7887861245248962, "kl": 0.1986083984375, "learning_rate": 3.0771702529443163e-07, "loss": -0.0369, "num_tokens": 20149468.0, "reward": 0.022415729239583015, "reward_std": 0.029145292937755585, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.004073033109307289, "rewards/logprob_reward/std": 0.009793604724109173, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 707.4375, "completions/mean_terminated_length": 674.6896362304688, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 2.2222222222222223, "grad_norm": 2.473500654696141, "kl": 0.191650390625, "learning_rate": 3.0723048075798694e-07, "loss": -0.1671, "num_tokens": 20178214.0, "reward": 0.01057870127260685, "reward_std": 0.02028326690196991, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0013374458067119122, "rewards/logprob_reward/std": 0.002189180813729763, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 667.53125, "completions/mean_terminated_length": 667.53125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 2.2253086419753085, "grad_norm": 2.0823282597420847, "kl": 0.2186279296875, "learning_rate": 3.0674370735904917e-07, "loss": -0.1845, "num_tokens": 20205755.0, "reward": 0.02725653350353241, "reward_std": 0.03393562510609627, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0025072579737752676, "rewards/logprob_reward/std": 0.003850584151223302, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 723.65625, "completions/mean_terminated_length": 680.75, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.228395061728395, "grad_norm": 2.1220284831023544, "kl": 0.20318603515625, "learning_rate": 3.0625670704420634e-07, "loss": -0.1226, "num_tokens": 20235716.0, "reward": 0.008036112412810326, "reward_std": 0.014271697029471397, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0019845697097480297, "rewards/logprob_reward/std": 0.0030064627062529325, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 684.65625, "completions/mean_terminated_length": 662.0333862304688, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.2314814814814814, "grad_norm": 1.8710458481733792, "kl": 0.17779541015625, "learning_rate": 3.057694817609539e-07, "loss": -0.1808, "num_tokens": 20264109.0, "reward": 0.00783197395503521, "reward_std": 0.00933231133967638, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0017577486578375101, "rewards/logprob_reward/std": 0.0028715496882796288, "step": 723 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 738.21875, "completions/mean_terminated_length": 672.2692260742188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.234567901234568, "grad_norm": 1.9014327670978726, "kl": NaN, "learning_rate": 3.0528203345768717e-07, "loss": -0.1328, "num_tokens": 20294540.0, "reward": 0.004208831116557121, "reward_std": 0.007450885139405727, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0012042568996548653, "rewards/logprob_reward/std": 0.0021676637697964907, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 708.09375, "completions/mean_terminated_length": 697.9031982421875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 2.2376543209876543, "grad_norm": 2.0511718997832578, "kl": 0.19927978515625, "learning_rate": 3.047943640836931e-07, "loss": -0.0949, "num_tokens": 20323371.0, "reward": 0.010843828320503235, "reward_std": 0.019358793273568153, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0016320315189659595, "rewards/logprob_reward/std": 0.002632023999467492, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 670.4375, "completions/mean_terminated_length": 659.0322265625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 2.240740740740741, "grad_norm": 1.985521889256597, "kl": 0.2191162109375, "learning_rate": 3.0430647558914284e-07, "loss": -0.1478, "num_tokens": 20350897.0, "reward": 0.0018935356056317687, "reward_std": 0.0018031983636319637, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.0021039284765720367, "rewards/logprob_reward/std": 0.0025468331295996904, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 682.1875, "completions/mean_terminated_length": 682.1875, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 2.243827160493827, "grad_norm": 3.0085913078454514, "kl": 0.158935546875, "learning_rate": 3.038183699250837e-07, "loss": -0.4066, "num_tokens": 20379207.0, "reward": 0.008154462091624737, "reward_std": 0.011152166873216629, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.005588292144238949, "rewards/logprob_reward/std": 0.01106759998947382, "step": 727 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 739.375, "completions/mean_terminated_length": 659.6799926757812, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.246913580246914, "grad_norm": 1.6009103603948496, "kl": NaN, "learning_rate": 3.0333004904343153e-07, "loss": -0.1468, "num_tokens": 20409499.0, "reward": 0.016564439982175827, "reward_std": 0.02129506506025791, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0010438221506774426, "rewards/logprob_reward/std": 0.001586288446560502, "step": 728 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 744.71875, "completions/mean_terminated_length": 635.434814453125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 2.25, "grad_norm": 1.783686655560749, "kl": NaN, "learning_rate": 3.0284151489696264e-07, "loss": -0.0725, "num_tokens": 20440014.0, "reward": 0.011835969984531403, "reward_std": 0.015340530313551426, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0027344110421836376, "rewards/logprob_reward/std": 0.0038544186390936375, "step": 729 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 715.625, "completions/mean_terminated_length": 683.72412109375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 2.253086419753086, "grad_norm": 1.7669001918411822, "kl": NaN, "learning_rate": 3.023527694393064e-07, "loss": -0.0642, "num_tokens": 20469546.0, "reward": 0.01330122072249651, "reward_std": 0.020615288987755775, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0008902450208552182, "rewards/logprob_reward/std": 0.0015324733685702085, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 703.84375, "completions/mean_terminated_length": 703.84375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 2.256172839506173, "grad_norm": 2.1691027905437767, "kl": 0.1790771484375, "learning_rate": 3.0186381462493704e-07, "loss": -0.171, "num_tokens": 20499109.0, "reward": 0.011748358607292175, "reward_std": 0.020492851734161377, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0026370638515800238, "rewards/logprob_reward/std": 0.004131729248911142, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 640.1875, "completions/mean_terminated_length": 627.8064575195312, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.259259259259259, "grad_norm": 1.919132832371422, "kl": 0.20391845703125, "learning_rate": 3.0137465240916614e-07, "loss": -0.0194, "num_tokens": 20525571.0, "reward": 0.014449520036578178, "reward_std": 0.026038456708192825, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.002166133839637041, "rewards/logprob_reward/std": 0.0022073055151849985, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 690.2000122070312, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.2623456790123457, "grad_norm": 2.1348215835442, "kl": 0.2127685546875, "learning_rate": 3.008852847481346e-07, "loss": -0.0467, "num_tokens": 20555449.0, "reward": 0.0079883998259902, "reward_std": 0.014689110219478607, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0019315548706799746, "rewards/logprob_reward/std": 0.0034405828919261694, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 725.03125, "completions/mean_terminated_length": 682.3214721679688, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 2.265432098765432, "grad_norm": 2.2587945330038504, "kl": 0.1964111328125, "learning_rate": 3.003957135988049e-07, "loss": -0.0865, "num_tokens": 20585418.0, "reward": 0.007533889263868332, "reward_std": 0.01407144870609045, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0014265429927036166, "rewards/logprob_reward/std": 0.002197315450757742, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 680.96875, "completions/mean_terminated_length": 631.9642944335938, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.2685185185185186, "grad_norm": 2.180209869824383, "kl": 0.20733642578125, "learning_rate": 2.999059409189533e-07, "loss": -0.1038, "num_tokens": 20613861.0, "reward": 0.011373293586075306, "reward_std": 0.020540405064821243, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.002220326103270054, "rewards/logprob_reward/std": 0.00289405370131135, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 697.21875, "completions/mean_terminated_length": 663.413818359375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 2.271604938271605, "grad_norm": 1.948421055598178, "kl": 0.203125, "learning_rate": 2.9941596866716174e-07, "loss": -0.0492, "num_tokens": 20642348.0, "reward": 0.014036007225513458, "reward_std": 0.02121649496257305, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.001706675160676241, "rewards/logprob_reward/std": 0.002709642518311739, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 634.84375, "completions/mean_terminated_length": 622.290283203125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 2.2746913580246915, "grad_norm": 2.3766095069961355, "kl": 0.2034912109375, "learning_rate": 2.989257988028105e-07, "loss": -0.0669, "num_tokens": 20669211.0, "reward": 0.01749192178249359, "reward_std": 0.02235991507768631, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0020743580535054207, "rewards/logprob_reward/std": 0.0026480804663151503, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 696.71875, "completions/mean_terminated_length": 674.9000244140625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 2.2777777777777777, "grad_norm": 2.037501844544708, "kl": 0.1934814453125, "learning_rate": 2.984354332860702e-07, "loss": -0.0959, "num_tokens": 20697986.0, "reward": 0.01077186968177557, "reward_std": 0.015292023308575153, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.001552077243104577, "rewards/logprob_reward/std": 0.002849878277629614, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 738.875, "completions/mean_terminated_length": 673.0769653320312, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.2808641975308643, "grad_norm": 1.8398893931923475, "kl": 0.2205810546875, "learning_rate": 2.979448740778935e-07, "loss": 0.0243, "num_tokens": 20728650.0, "reward": 0.014752760529518127, "reward_std": 0.02185913547873497, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0025030672550201416, "rewards/logprob_reward/std": 0.003537602722644806, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 632.5625, "completions/mean_terminated_length": 632.5625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 2.2839506172839505, "grad_norm": 2.339917624735448, "kl": 0.2362060546875, "learning_rate": 2.9745412314000786e-07, "loss": -0.097, "num_tokens": 20755072.0, "reward": 0.022999752312898636, "reward_std": 0.033915817737579346, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0012497229035943747, "rewards/logprob_reward/std": 0.001911089289933443, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 670.40625, "completions/mean_terminated_length": 659.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 2.287037037037037, "grad_norm": 1.9800143544819198, "kl": 0.19384765625, "learning_rate": 2.9696318243490746e-07, "loss": -0.1877, "num_tokens": 20782865.0, "reward": 0.004813074134290218, "reward_std": 0.008067871443927288, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.001875637797638774, "rewards/logprob_reward/std": 0.0025465013459324837, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 701.1875, "completions/mean_terminated_length": 679.6666870117188, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 2.2901234567901234, "grad_norm": 2.1524480397410843, "kl": 0.2032470703125, "learning_rate": 2.9647205392584533e-07, "loss": -0.1965, "num_tokens": 20812199.0, "reward": 0.006285310722887516, "reward_std": 0.009405326098203659, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.00351145607419312, "rewards/logprob_reward/std": 0.0050127143040299416, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 636.5, "completions/mean_terminated_length": 610.6666870117188, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.29320987654321, "grad_norm": 2.1391421655950684, "kl": 0.2254638671875, "learning_rate": 2.959807395768255e-07, "loss": -0.1155, "num_tokens": 20839011.0, "reward": 0.010436173528432846, "reward_std": 0.01438200380653143, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0011790813878178596, "rewards/logprob_reward/std": 0.0016188162844628096, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 732.25, "completions/mean_terminated_length": 690.5714721679688, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 2.2962962962962963, "grad_norm": 1.6584251680821471, "kl": 0.184326171875, "learning_rate": 2.95489241352595e-07, "loss": -0.0456, "num_tokens": 20869183.0, "reward": 0.011500661261379719, "reward_std": 0.016072984784841537, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0023618454579263926, "rewards/logprob_reward/std": 0.0040468983352184296, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 625.3125, "completions/mean_terminated_length": 625.3125, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 2.299382716049383, "grad_norm": 2.3394841133481075, "kl": 0.2041015625, "learning_rate": 2.949975612186366e-07, "loss": -0.1045, "num_tokens": 20895609.0, "reward": 0.027013035491108894, "reward_std": 0.03591803461313248, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0022367057390511036, "rewards/logprob_reward/std": 0.0031810272485017776, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 652.53125, "completions/mean_terminated_length": 614.1034545898438, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.302469135802469, "grad_norm": 2.0041419047184204, "kl": 0.2049560546875, "learning_rate": 2.9450570114116014e-07, "loss": -0.1177, "num_tokens": 20922798.0, "reward": 0.027759265154600143, "reward_std": 0.03280960023403168, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0030658484902232885, "rewards/logprob_reward/std": 0.0035908985882997513, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 656.59375, "completions/mean_terminated_length": 656.59375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 2.3055555555555554, "grad_norm": 2.0626102908524633, "kl": 0.225830078125, "learning_rate": 2.9401366308709513e-07, "loss": -0.0954, "num_tokens": 20950261.0, "reward": 0.02691406011581421, "reward_std": 0.03383072093129158, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.002126733073964715, "rewards/logprob_reward/std": 0.0029292285908013582, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 694.34375, "completions/mean_terminated_length": 683.7096557617188, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.308641975308642, "grad_norm": 1.7855321890272653, "kl": 0.1859130859375, "learning_rate": 2.9352144902408296e-07, "loss": -0.0889, "num_tokens": 20979168.0, "reward": 0.020497024059295654, "reward_std": 0.03210761398077011, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001941136666573584, "rewards/logprob_reward/std": 0.0027287257835268974, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 694.40625, "completions/mean_terminated_length": 694.40625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 2.3117283950617282, "grad_norm": 2.0004801072760086, "kl": 0.173095703125, "learning_rate": 2.930290609204686e-07, "loss": -0.1498, "num_tokens": 21008037.0, "reward": 0.00931592471897602, "reward_std": 0.009273052215576172, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.003406583098694682, "rewards/logprob_reward/std": 0.0034958128817379475, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 654.53125, "completions/mean_terminated_length": 642.6128540039062, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.314814814814815, "grad_norm": 1.7895581006013852, "kl": 0.19036865234375, "learning_rate": 2.925365007452933e-07, "loss": -0.0906, "num_tokens": 21035402.0, "reward": 0.01693890616297722, "reward_std": 0.02178483083844185, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0014598959824070334, "rewards/logprob_reward/std": 0.0020789767149835825, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 612.4375, "completions/mean_terminated_length": 585.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 2.317901234567901, "grad_norm": 2.382818710179892, "kl": 0.2159423828125, "learning_rate": 2.920437704682861e-07, "loss": -0.0949, "num_tokens": 21061172.0, "reward": 0.026447393000125885, "reward_std": 0.040804147720336914, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0016082143411040306, "rewards/logprob_reward/std": 0.002333967015147209, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 636.53125, "completions/mean_terminated_length": 624.0322265625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.3209876543209877, "grad_norm": 1.8283921741212457, "kl": 0.1959228515625, "learning_rate": 2.915508720598566e-07, "loss": -0.0756, "num_tokens": 21087901.0, "reward": 0.012147173285484314, "reward_std": 0.014731649309396744, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0030801924876868725, "rewards/logprob_reward/std": 0.00503428652882576, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 651.6875, "completions/mean_terminated_length": 626.86669921875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 2.324074074074074, "grad_norm": 2.3386370058074952, "kl": 0.2027587890625, "learning_rate": 2.910578074910865e-07, "loss": -0.1045, "num_tokens": 21115203.0, "reward": 0.014293940737843513, "reward_std": 0.026815392076969147, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.001993267796933651, "rewards/logprob_reward/std": 0.003454646561294794, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 606.4375, "completions/mean_terminated_length": 606.4375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 2.3271604938271606, "grad_norm": 2.5045503605738797, "kl": 0.212890625, "learning_rate": 2.9056457873372213e-07, "loss": -0.2066, "num_tokens": 21140657.0, "reward": 0.013928234577178955, "reward_std": 0.025971507653594017, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.001586928148753941, "rewards/logprob_reward/std": 0.002250204561278224, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 674.125, "completions/mean_terminated_length": 674.125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.330246913580247, "grad_norm": 1.8750311313989472, "kl": 0.203125, "learning_rate": 2.9007118776016635e-07, "loss": 0.0243, "num_tokens": 21168265.0, "reward": 0.016981391236186028, "reward_std": 0.02731180191040039, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0015071008820086718, "rewards/logprob_reward/std": 0.0024619167670607567, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 688.46875, "completions/mean_terminated_length": 666.1000366210938, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 2.3333333333333335, "grad_norm": 2.0285058126328632, "kl": 0.17169189453125, "learning_rate": 2.895776365434706e-07, "loss": -0.101, "num_tokens": 21196824.0, "reward": 0.002055669669061899, "reward_std": 0.002533155959099531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00228407746180892, "rewards/logprob_reward/std": 0.003654703265056014, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 625.5, "completions/mean_terminated_length": 612.6451416015625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 2.3364197530864197, "grad_norm": 3.9096325385449853, "kl": 0.2318115234375, "learning_rate": 2.8908392705732724e-07, "loss": -0.2157, "num_tokens": 21222676.0, "reward": 0.014098500832915306, "reward_std": 0.016845598816871643, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0017761120107024908, "rewards/logprob_reward/std": 0.00431130500510335, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 693.09375, "completions/mean_terminated_length": 671.0333862304688, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.3395061728395063, "grad_norm": 2.0621807247628516, "kl": 0.19091796875, "learning_rate": 2.885900612760616e-07, "loss": -0.0017, "num_tokens": 21250915.0, "reward": 0.013744791969656944, "reward_std": 0.025930313393473625, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0013831022661179304, "rewards/logprob_reward/std": 0.0021195383742451668, "step": 758 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 723.59375, "completions/mean_terminated_length": 654.2692260742188, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 2.3425925925925926, "grad_norm": 2.130005701801698, "kl": NaN, "learning_rate": 2.8809604117462397e-07, "loss": -0.2003, "num_tokens": 21280958.0, "reward": 0.01306244358420372, "reward_std": 0.021564170718193054, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.004097159951925278, "rewards/logprob_reward/std": 0.0075044953264296055, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 624.96875, "completions/mean_terminated_length": 612.0967407226562, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.3456790123456788, "grad_norm": 2.2385336775800586, "kl": 0.22021484375, "learning_rate": 2.876018687285817e-07, "loss": -0.0829, "num_tokens": 21307321.0, "reward": 0.02025846391916275, "reward_std": 0.033152103424072266, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0016760698053985834, "rewards/logprob_reward/std": 0.0024185858201235533, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 668.625, "completions/mean_terminated_length": 668.625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 2.3487654320987654, "grad_norm": 2.222865996326742, "kl": 0.1905517578125, "learning_rate": 2.8710754591411147e-07, "loss": -0.1101, "num_tokens": 21335093.0, "reward": 0.020237958058714867, "reward_std": 0.02622999995946884, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0016532877925783396, "rewards/logprob_reward/std": 0.0030789622105658054, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 606.3125, "completions/mean_terminated_length": 606.3125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 2.351851851851852, "grad_norm": 2.6081438099890977, "kl": 0.21533203125, "learning_rate": 2.8661307470799114e-07, "loss": -0.2491, "num_tokens": 21360947.0, "reward": 0.039200570434331894, "reward_std": 0.05209661275148392, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0018895207904279232, "rewards/logprob_reward/std": 0.003134438069537282, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 683.09375, "completions/mean_terminated_length": 647.8275756835938, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 2.3549382716049383, "grad_norm": 1.9614533144375295, "kl": 0.19000244140625, "learning_rate": 2.861184570875921e-07, "loss": -0.0871, "num_tokens": 21389146.0, "reward": 0.014616047963500023, "reward_std": 0.026574091985821724, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0023511643521487713, "rewards/logprob_reward/std": 0.004739148076623678, "step": 763 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 627.0625, "completions/mean_terminated_length": 627.0625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.3580246913580245, "grad_norm": 1.7626662455342699, "kl": NaN, "learning_rate": 2.856236950308711e-07, "loss": -0.1747, "num_tokens": 21415380.0, "reward": 0.011208342388272285, "reward_std": 0.01498295646160841, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0020370474085211754, "rewards/logprob_reward/std": 0.0030780963134020567, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 684.09375, "completions/mean_terminated_length": 661.433349609375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 2.361111111111111, "grad_norm": 1.9784413179253255, "kl": 0.1986083984375, "learning_rate": 2.851287905163628e-07, "loss": -0.0547, "num_tokens": 21443735.0, "reward": 0.029958879575133324, "reward_std": 0.039155617356300354, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0020376425236463547, "rewards/logprob_reward/std": 0.0034716313239187002, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 635.15625, "completions/mean_terminated_length": 622.6129150390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.3641975308641974, "grad_norm": 3.031221502195575, "kl": 0.220947265625, "learning_rate": 2.8463374552317123e-07, "loss": -0.3255, "num_tokens": 21470648.0, "reward": 0.012188144959509373, "reward_std": 0.016201891005039215, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0031257164664566517, "rewards/logprob_reward/std": 0.0069906204007565975, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 681.71875, "completions/mean_terminated_length": 646.3103637695312, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.367283950617284, "grad_norm": 2.1293540852434014, "kl": 0.1748046875, "learning_rate": 2.8413856203096226e-07, "loss": -0.1594, "num_tokens": 21499035.0, "reward": 0.001801763428375125, "reward_std": 0.001622794196009636, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "rewards/logprob_reward/mean": 0.00200195936486125, "rewards/logprob_reward/std": 0.003044778248295188, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 659.0625, "completions/mean_terminated_length": 634.7333374023438, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 2.3703703703703702, "grad_norm": 1.7041509254321938, "kl": 0.19854736328125, "learning_rate": 2.836432420199557e-07, "loss": 0.0039, "num_tokens": 21526845.0, "reward": 0.024135377258062363, "reward_std": 0.03447246551513672, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.002511532511562109, "rewards/logprob_reward/std": 0.004762453492730856, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 668.2667236328125, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.373456790123457, "grad_norm": 1.9538570419641474, "kl": 0.189208984375, "learning_rate": 2.831477874709172e-07, "loss": -0.0272, "num_tokens": 21555429.0, "reward": 0.020408859476447105, "reward_std": 0.03313077613711357, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0018431773642078042, "rewards/logprob_reward/std": 0.0025917550083249807, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 633.5, "completions/mean_terminated_length": 633.5, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.376543209876543, "grad_norm": 2.1641262080253942, "kl": 0.2119140625, "learning_rate": 2.826522003651504e-07, "loss": -0.0668, "num_tokens": 21581841.0, "reward": 0.01095607504248619, "reward_std": 0.02049342915415764, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0017567494651302695, "rewards/logprob_reward/std": 0.002133831148967147, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 677.28125, "completions/mean_terminated_length": 654.1666870117188, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 2.3796296296296298, "grad_norm": 1.7747904490039623, "kl": 0.17822265625, "learning_rate": 2.8215648268448926e-07, "loss": -0.0883, "num_tokens": 21610214.0, "reward": 0.006160065997391939, "reward_std": 0.008916087448596954, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1767766922712326, "rewards/logprob_reward/mean": 0.0033722955267876387, "rewards/logprob_reward/std": 0.003799574449658394, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 601.59375, "completions/mean_terminated_length": 601.59375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 2.382716049382716, "grad_norm": 3.416045127994316, "kl": 0.19635009765625, "learning_rate": 2.8166063641128963e-07, "loss": -0.3115, "num_tokens": 21635393.0, "reward": 0.016752969473600388, "reward_std": 0.02682165801525116, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0012532987166196108, "rewards/logprob_reward/std": 0.0017938826931640506, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 642.25, "completions/mean_terminated_length": 587.7142944335938, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.3858024691358026, "grad_norm": 2.6297709005463186, "kl": 0.2100830078125, "learning_rate": 2.8116466352842165e-07, "loss": -0.139, "num_tokens": 21662481.0, "reward": 0.02676667645573616, "reward_std": 0.034649819135665894, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.001962975598871708, "rewards/logprob_reward/std": 0.0030351963359862566, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 674.875, "completions/mean_terminated_length": 625.0, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 2.388888888888889, "grad_norm": 2.7441401584018723, "kl": 0.208984375, "learning_rate": 2.80668566019262e-07, "loss": -0.1109, "num_tokens": 21690513.0, "reward": 0.020455727353692055, "reward_std": 0.02898206189274788, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0018952535465359688, "rewards/logprob_reward/std": 0.003361431183293462, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 640.6875, "completions/mean_terminated_length": 628.3225708007812, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.3919753086419755, "grad_norm": 2.423581119859638, "kl": 0.19287109375, "learning_rate": 2.8017234586768534e-07, "loss": -0.2178, "num_tokens": 21717463.0, "reward": 0.018702922388911247, "reward_std": 0.022699924185872078, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0034199138171970844, "rewards/logprob_reward/std": 0.0043744854629039764, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 683.21875, "completions/mean_terminated_length": 604.5769653320312, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.3950617283950617, "grad_norm": 2.218899276677366, "kl": 0.21234130859375, "learning_rate": 2.796760050580571e-07, "loss": -0.1398, "num_tokens": 21745822.0, "reward": 0.0223326925188303, "reward_std": 0.02641984634101391, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0005085471784695983, "rewards/logprob_reward/std": 0.0008823273237794638, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 633.59375, "completions/mean_terminated_length": 633.59375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 2.398148148148148, "grad_norm": 2.3842149240513777, "kl": 0.179443359375, "learning_rate": 2.7917954557522503e-07, "loss": -0.2196, "num_tokens": 21772169.0, "reward": 0.03312259167432785, "reward_std": 0.033782489597797394, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002080656588077545, "rewards/logprob_reward/std": 0.0032138347160071135, "step": 777 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 714.0, "completions/mean_terminated_length": 642.4615478515625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.4012345679012346, "grad_norm": 2.0869436884326116, "kl": NaN, "learning_rate": 2.786829694045116e-07, "loss": -0.0507, "num_tokens": 21802057.0, "reward": 0.01728595420718193, "reward_std": 0.02657473087310791, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0018455050885677338, "rewards/logprob_reward/std": 0.0020782107021659613, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 561.875, "completions/mean_terminated_length": 561.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.4043209876543212, "grad_norm": 2.100845147856409, "kl": 0.200927734375, "learning_rate": 2.7818627853170585e-07, "loss": -0.0829, "num_tokens": 21826125.0, "reward": 0.0206417478621006, "reward_std": 0.027341028675436974, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.002101942664012313, "rewards/logprob_reward/std": 0.0026918782386928797, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 691.03125, "completions/mean_terminated_length": 643.4642944335938, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 2.4074074074074074, "grad_norm": 2.1858736493189777, "kl": 0.20977783203125, "learning_rate": 2.7768947494305545e-07, "loss": -0.1138, "num_tokens": 21854738.0, "reward": 0.017663007602095604, "reward_std": 0.026448804885149002, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.002264453563839197, "rewards/logprob_reward/std": 0.0034026289358735085, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 665.71875, "completions/mean_terminated_length": 614.5357666015625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 2.4104938271604937, "grad_norm": 2.126362146413349, "kl": 0.21826171875, "learning_rate": 2.7719256062525884e-07, "loss": -0.0783, "num_tokens": 21882865.0, "reward": 0.023511648178100586, "reward_std": 0.020995650440454483, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.005290718283504248, "rewards/logprob_reward/std": 0.0067299045622348785, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 568.03125, "completions/mean_terminated_length": 568.03125, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.4135802469135803, "grad_norm": 2.45437341508095, "kl": 0.2205810546875, "learning_rate": 2.766955375654573e-07, "loss": -0.2238, "num_tokens": 21907334.0, "reward": 0.018289368599653244, "reward_std": 0.02329649031162262, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.002960409503430128, "rewards/logprob_reward/std": 0.005423862487077713, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 619.1875, "completions/mean_terminated_length": 606.1290283203125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 2.4166666666666665, "grad_norm": 2.077958899749018, "kl": 0.203369140625, "learning_rate": 2.7619840775122695e-07, "loss": -0.1114, "num_tokens": 21933496.0, "reward": 0.011950278654694557, "reward_std": 0.02096518874168396, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.002861420623958111, "rewards/logprob_reward/std": 0.0045641642063856125, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 622.90625, "completions/mean_terminated_length": 622.90625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 2.419753086419753, "grad_norm": 1.9110171745488564, "kl": 0.20361328125, "learning_rate": 2.7570117317057087e-07, "loss": -0.14, "num_tokens": 21960053.0, "reward": 0.018233656883239746, "reward_std": 0.02679438516497612, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0028985063545405865, "rewards/logprob_reward/std": 0.0035074797924607992, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 641.3125, "completions/mean_terminated_length": 641.3125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 2.4228395061728394, "grad_norm": 1.955324665689567, "kl": 0.2054443359375, "learning_rate": 2.7520383581191085e-07, "loss": -0.2198, "num_tokens": 21987327.0, "reward": 0.01924588903784752, "reward_std": 0.026994815096259117, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.00402320921421051, "rewards/logprob_reward/std": 0.005341153126209974, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 641.53125, "completions/mean_terminated_length": 601.9655151367188, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 2.425925925925926, "grad_norm": 1.967916964949879, "kl": 0.2139892578125, "learning_rate": 2.7470639766408003e-07, "loss": -0.0426, "num_tokens": 22014296.0, "reward": 0.017452334985136986, "reward_std": 0.02039685659110546, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0020303723867982626, "rewards/logprob_reward/std": 0.004384672734886408, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 663.3125, "completions/mean_terminated_length": 651.6774291992188, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 2.4290123456790123, "grad_norm": 2.049183593122295, "kl": 0.2091064453125, "learning_rate": 2.7420886071631455e-07, "loss": -0.1806, "num_tokens": 22041906.0, "reward": 0.017709145322442055, "reward_std": 0.02079470083117485, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0023157179821282625, "rewards/logprob_reward/std": 0.0030351937748491764, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 653.71875, "completions/mean_terminated_length": 641.774169921875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.432098765432099, "grad_norm": 2.005760721591516, "kl": 0.239501953125, "learning_rate": 2.7371122695824534e-07, "loss": -0.0891, "num_tokens": 22069113.0, "reward": 0.015844479203224182, "reward_std": 0.021281344816088676, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0037160879001021385, "rewards/logprob_reward/std": 0.004673448856920004, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 686.6875, "completions/mean_terminated_length": 675.8064575195312, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 2.435185185185185, "grad_norm": 2.271091804404587, "kl": 0.1875, "learning_rate": 2.732134983798907e-07, "loss": -0.1317, "num_tokens": 22098127.0, "reward": 0.026305291801691055, "reward_std": 0.03931676968932152, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.004922546446323395, "rewards/logprob_reward/std": 0.0043291207402944565, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 632.46875, "completions/mean_terminated_length": 632.46875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.4382716049382718, "grad_norm": 2.1275371201597326, "kl": 0.19384765625, "learning_rate": 2.727156769716482e-07, "loss": -0.0638, "num_tokens": 22124702.0, "reward": 0.012196826748549938, "reward_std": 0.01979811303317547, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0031353633385151625, "rewards/logprob_reward/std": 0.003473072312772274, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 624.46875, "completions/mean_terminated_length": 597.8333740234375, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.441358024691358, "grad_norm": 2.1957550340046064, "kl": 0.231201171875, "learning_rate": 2.722177647242863e-07, "loss": -0.0802, "num_tokens": 22150957.0, "reward": 0.020226188004016876, "reward_std": 0.03338390588760376, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0016402084147557616, "rewards/logprob_reward/std": 0.0022355131804943085, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 734.9375, "completions/mean_terminated_length": 654.0, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 2.4444444444444446, "grad_norm": 1.6726344757151257, "kl": 0.20458984375, "learning_rate": 2.717197636289373e-07, "loss": -0.0819, "num_tokens": 22181163.0, "reward": 0.01065262220799923, "reward_std": 0.014610957354307175, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0014195798430591822, "rewards/logprob_reward/std": 0.0018534038681536913, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 580.6875, "completions/mean_terminated_length": 580.6875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 2.447530864197531, "grad_norm": 2.398142707509085, "kl": 0.2135009765625, "learning_rate": 2.712216756770881e-07, "loss": -0.1378, "num_tokens": 22205769.0, "reward": 0.02064337022602558, "reward_std": 0.025822220370173454, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0021037464030086994, "rewards/logprob_reward/std": 0.0022530886344611645, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 648.09375, "completions/mean_terminated_length": 648.09375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.450617283950617, "grad_norm": 3.0934505561772836, "kl": 0.2025146484375, "learning_rate": 2.7072350286057354e-07, "loss": -0.1571, "num_tokens": 22232980.0, "reward": 0.025620292872190475, "reward_std": 0.02903948351740837, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.00416143424808979, "rewards/logprob_reward/std": 0.00495970668271184, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 701.15625, "completions/mean_terminated_length": 679.6333618164062, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 2.4537037037037037, "grad_norm": 2.1450314466149716, "kl": 0.1929931640625, "learning_rate": 2.7022524717156734e-07, "loss": -0.0682, "num_tokens": 22261977.0, "reward": 0.02857239916920662, "reward_std": 0.03375323861837387, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.003969332203269005, "rewards/logprob_reward/std": 0.004325473215430975, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 683.375, "completions/mean_terminated_length": 672.3870849609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 2.45679012345679, "grad_norm": 2.0853127274576515, "kl": 0.191162109375, "learning_rate": 2.6972691060257504e-07, "loss": -0.0678, "num_tokens": 22290609.0, "reward": 0.018175773322582245, "reward_std": 0.027988426387310028, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0028341934084892273, "rewards/logprob_reward/std": 0.005148232914507389, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 683.8125, "completions/mean_terminated_length": 661.1333618164062, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 2.4598765432098766, "grad_norm": 2.270771040940409, "kl": 0.221923828125, "learning_rate": 2.6922849514642524e-07, "loss": -0.1624, "num_tokens": 22319203.0, "reward": 0.028170034289360046, "reward_std": 0.035853706300258636, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.003522261045873165, "rewards/logprob_reward/std": 0.004629835020750761, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 701.125, "completions/mean_terminated_length": 690.7096557617188, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 2.462962962962963, "grad_norm": 2.3139829790799915, "kl": 0.19635009765625, "learning_rate": 2.687300027962624e-07, "loss": -0.0852, "num_tokens": 22348239.0, "reward": 0.008503954857587814, "reward_std": 0.0144048435613513, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.002504394156858325, "rewards/logprob_reward/std": 0.002979145385324955, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 663.0625, "completions/mean_terminated_length": 639.0000610351562, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.4660493827160495, "grad_norm": 1.9387656971456555, "kl": 0.1888427734375, "learning_rate": 2.682314355455381e-07, "loss": -0.0454, "num_tokens": 22375397.0, "reward": 0.03912090137600899, "reward_std": 0.04738355427980423, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0018010009080171585, "rewards/logprob_reward/std": 0.00249595008790493, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 647.1875, "completions/mean_terminated_length": 635.0322265625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.4691358024691357, "grad_norm": 1.9088482438239143, "kl": 0.206787109375, "learning_rate": 2.677327953880038e-07, "loss": -0.1478, "num_tokens": 22402259.0, "reward": 0.03538230061531067, "reward_std": 0.03400729224085808, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011192248202860355, "rewards/logprob_reward/std": 0.0020716898143291473, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 706.1875, "completions/mean_terminated_length": 660.7857666015625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.4722222222222223, "grad_norm": 2.2538632437104162, "kl": 0.2005615234375, "learning_rate": 2.6723408431770214e-07, "loss": -0.0677, "num_tokens": 22431425.0, "reward": 0.017258968204259872, "reward_std": 0.03199339658021927, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0018155192956328392, "rewards/logprob_reward/std": 0.003149349009618163, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 631.8125, "completions/mean_terminated_length": 619.1612548828125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 2.4753086419753085, "grad_norm": 3.179859102788341, "kl": 0.2052001953125, "learning_rate": 2.6673530432895957e-07, "loss": -0.2108, "num_tokens": 22458075.0, "reward": 0.021727275103330612, "reward_std": 0.03393279388546944, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0033080829307436943, "rewards/logprob_reward/std": 0.004199848975986242, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 626.46875, "completions/mean_terminated_length": 626.46875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 2.478395061728395, "grad_norm": 2.048019564442794, "kl": 0.20166015625, "learning_rate": 2.6623645741637815e-07, "loss": -0.0589, "num_tokens": 22484558.0, "reward": 0.01799776963889599, "reward_std": 0.027030106633901596, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0026364100631326437, "rewards/logprob_reward/std": 0.0041060373187065125, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 673.4375, "completions/mean_terminated_length": 662.1290283203125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 2.4814814814814814, "grad_norm": 2.267452814467895, "kl": 0.21453857421875, "learning_rate": 2.6573754557482746e-07, "loss": -0.2337, "num_tokens": 22512584.0, "reward": 0.023072022944688797, "reward_std": 0.02745337039232254, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.001330025726929307, "rewards/logprob_reward/std": 0.003039197064936161, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 659.25, "completions/mean_terminated_length": 647.4838256835938, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 2.484567901234568, "grad_norm": 2.2584024761330634, "kl": 0.1944580078125, "learning_rate": 2.652385707994369e-07, "loss": -0.1697, "num_tokens": 22540008.0, "reward": 0.019889328628778458, "reward_std": 0.028179999440908432, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0012659190688282251, "rewards/logprob_reward/std": 0.0020133298821747303, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 674.96875, "completions/mean_terminated_length": 663.7096557617188, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 2.4876543209876543, "grad_norm": 4.047315712520667, "kl": 0.1915283203125, "learning_rate": 2.6473953508558726e-07, "loss": -0.2301, "num_tokens": 22567819.0, "reward": 0.020216289907693863, "reward_std": 0.026102395728230476, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0016292117070406675, "rewards/logprob_reward/std": 0.0024890329223126173, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 629.0625, "completions/mean_terminated_length": 616.3225708007812, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.490740740740741, "grad_norm": 3.8122308876763054, "kl": 0.2374267578125, "learning_rate": 2.6424044042890334e-07, "loss": -0.2644, "num_tokens": 22594225.0, "reward": 0.04215528443455696, "reward_std": 0.04140203446149826, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017003151588141918, "rewards/logprob_reward/std": 0.0036566979251801968, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 634.53125, "completions/mean_terminated_length": 634.53125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 2.493827160493827, "grad_norm": 2.045740451584124, "kl": 0.2281494140625, "learning_rate": 2.6374128882524527e-07, "loss": -0.118, "num_tokens": 22620978.0, "reward": 0.030714647844433784, "reward_std": 0.04138539358973503, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002877388149499893, "rewards/logprob_reward/std": 0.003089474281296134, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 716.625, "completions/mean_terminated_length": 672.7142944335938, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 2.496913580246914, "grad_norm": 3.7451071903248905, "kl": 0.20947265625, "learning_rate": 2.6324208227070136e-07, "loss": -0.5801, "num_tokens": 22650606.0, "reward": 0.010864447802305222, "reward_std": 0.020829197019338608, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0016549410065636039, "rewards/logprob_reward/std": 0.003017139621078968, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 664.5, "completions/mean_terminated_length": 652.9031982421875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 2.5, "grad_norm": 2.2161602841140726, "kl": 0.2120361328125, "learning_rate": 2.6274282276157934e-07, "loss": -0.1445, "num_tokens": 22678430.0, "reward": 0.04231654107570648, "reward_std": 0.034627120941877365, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0018794930074363947, "rewards/logprob_reward/std": 0.0027959332801401615, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 712.5625, "completions/mean_terminated_length": 691.800048828125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.503086419753086, "grad_norm": 3.9039260299922915, "kl": 0.1927490234375, "learning_rate": 2.622435122943987e-07, "loss": -0.5333, "num_tokens": 22708060.0, "reward": 0.02357512339949608, "reward_std": 0.039907120168209076, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0018890247447416186, "rewards/logprob_reward/std": 0.0037410426884889603, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 733.875, "completions/mean_terminated_length": 666.923095703125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 2.506172839506173, "grad_norm": 7.871507714768179, "kl": 0.2376708984375, "learning_rate": 2.61744152865883e-07, "loss": -0.604, "num_tokens": 22738440.0, "reward": 0.01712499000132084, "reward_std": 0.02763485349714756, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0016666557639837265, "rewards/logprob_reward/std": 0.003386562690138817, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 709.96875, "completions/mean_terminated_length": 622.0399780273438, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.5092592592592595, "grad_norm": 2.550898122770325, "kl": 0.2625732421875, "learning_rate": 2.6124474647295137e-07, "loss": -0.1239, "num_tokens": 22767875.0, "reward": 0.027525482699275017, "reward_std": 0.03265233337879181, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.002806092379614711, "rewards/logprob_reward/std": 0.00510747404769063, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 688.28125, "completions/mean_terminated_length": 665.9000244140625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 2.5123456790123457, "grad_norm": 2.9279018313591623, "kl": 0.2076416015625, "learning_rate": 2.607452951127107e-07, "loss": -0.3979, "num_tokens": 22796600.0, "reward": 0.017985306680202484, "reward_std": 0.022175882011651993, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0026225619949400425, "rewards/logprob_reward/std": 0.0036902620922774076, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 793.40625, "completions/mean_terminated_length": 728.8399658203125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 2.515432098765432, "grad_norm": 1.9669109131970839, "kl": 0.171630859375, "learning_rate": 2.6024580078244777e-07, "loss": -0.081, "num_tokens": 22828749.0, "reward": 0.021482855081558228, "reward_std": 0.029232092201709747, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0030365039128810167, "rewards/logprob_reward/std": 0.0033765204716473818, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 715.15625, "completions/mean_terminated_length": 671.0357666015625, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 2.5185185185185186, "grad_norm": 2.721919174748601, "kl": 0.2049560546875, "learning_rate": 2.5974626547962127e-07, "loss": -0.2804, "num_tokens": 22859026.0, "reward": 0.008617338724434376, "reward_std": 0.00868584681302309, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24593468010425568, "rewards/logprob_reward/mean": 0.0026303762570023537, "rewards/logprob_reward/std": 0.003620925359427929, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 746.25, "completions/mean_terminated_length": 717.5172119140625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.521604938271605, "grad_norm": 2.222073599850611, "kl": 0.2247314453125, "learning_rate": 2.5924669120185373e-07, "loss": -0.2009, "num_tokens": 22890034.0, "reward": 0.023714452981948853, "reward_std": 0.027255091816186905, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0020438386127352715, "rewards/logprob_reward/std": 0.003529217327013612, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 661.71875, "completions/mean_terminated_length": 661.71875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 2.5246913580246915, "grad_norm": 2.088454467765236, "kl": 0.207275390625, "learning_rate": 2.5874707994692333e-07, "loss": -0.3222, "num_tokens": 22917609.0, "reward": 0.018701478838920593, "reward_std": 0.02833986096084118, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0034183121751993895, "rewards/logprob_reward/std": 0.0037787938490509987, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 702.78125, "completions/mean_terminated_length": 692.4193115234375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.5277777777777777, "grad_norm": 3.260481603818947, "kl": 0.2135009765625, "learning_rate": 2.582474337127564e-07, "loss": -0.2878, "num_tokens": 22946986.0, "reward": 0.027376752346754074, "reward_std": 0.028328103944659233, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.002640834078192711, "rewards/logprob_reward/std": 0.004320894833654165, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 715.75, "completions/mean_terminated_length": 695.2000122070312, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 2.5308641975308643, "grad_norm": 1.851039622210421, "kl": 0.1866455078125, "learning_rate": 2.5774775449741903e-07, "loss": -0.1446, "num_tokens": 22976878.0, "reward": 0.02392280474305153, "reward_std": 0.03221181035041809, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0022753365337848663, "rewards/logprob_reward/std": 0.003072317922487855, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 658.9375, "completions/mean_terminated_length": 658.9375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 2.5339506172839505, "grad_norm": 2.694360369586984, "kl": 0.2193603515625, "learning_rate": 2.572480442991092e-07, "loss": -0.2354, "num_tokens": 23004184.0, "reward": 0.030364010483026505, "reward_std": 0.04005251079797745, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0024877884425222874, "rewards/logprob_reward/std": 0.00370772578753531, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 664.03125, "completions/mean_terminated_length": 640.0333862304688, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.537037037037037, "grad_norm": 2.2410612913781907, "kl": 0.215087890625, "learning_rate": 2.567483051161487e-07, "loss": -0.0733, "num_tokens": 23031981.0, "reward": 0.03331879526376724, "reward_std": 0.03510475903749466, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0022986605763435364, "rewards/logprob_reward/std": 0.002696947194635868, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 738.875, "completions/mean_terminated_length": 709.3793334960938, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.5401234567901234, "grad_norm": 2.2639579831015446, "kl": 0.2003173828125, "learning_rate": 2.562485389469754e-07, "loss": -0.2896, "num_tokens": 23062461.0, "reward": 0.020506957545876503, "reward_std": 0.028530918061733246, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001952175865881145, "rewards/logprob_reward/std": 0.0033695560414344072, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 720.125, "completions/mean_terminated_length": 710.3225708007812, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 2.5432098765432096, "grad_norm": 2.476754768832393, "kl": 0.216796875, "learning_rate": 2.5574874779013494e-07, "loss": -0.3659, "num_tokens": 23091893.0, "reward": 0.010837879031896591, "reward_std": 0.02046077884733677, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0016254207585006952, "rewards/logprob_reward/std": 0.0024387193843722343, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 716.25, "completions/mean_terminated_length": 672.2857666015625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.5462962962962963, "grad_norm": 2.8502985601277833, "kl": 0.20654296875, "learning_rate": 2.5524893364427307e-07, "loss": -0.3099, "num_tokens": 23121433.0, "reward": 0.03331666439771652, "reward_std": 0.03405371308326721, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002296295017004013, "rewards/logprob_reward/std": 0.0034785487223416567, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 727.6875, "completions/mean_terminated_length": 718.1290283203125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.549382716049383, "grad_norm": 1.9951048258107709, "kl": 0.180908203125, "learning_rate": 2.547490985081272e-07, "loss": -0.2309, "num_tokens": 23151375.0, "reward": 0.017598789185285568, "reward_std": 0.031999118626117706, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0021930988878011703, "rewards/logprob_reward/std": 0.0025571477599442005, "step": 826 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 696.34375, "completions/mean_terminated_length": 674.5000610351562, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 2.552469135802469, "grad_norm": 2.8159746377764665, "kl": NaN, "learning_rate": 2.5424924438051896e-07, "loss": -0.2667, "num_tokens": 23180022.0, "reward": 0.029780089855194092, "reward_std": 0.044473059475421906, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0018389882752671838, "rewards/logprob_reward/std": 0.0028411380480974913, "step": 827 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 747.75, "completions/mean_terminated_length": 719.1724243164062, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 2.5555555555555554, "grad_norm": 4.467702325213489, "kl": NaN, "learning_rate": 2.5374937326034575e-07, "loss": -0.774, "num_tokens": 23210766.0, "reward": 0.014227262698113918, "reward_std": 0.022986872121691704, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0019191803876310587, "rewards/logprob_reward/std": 0.005398834589868784, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 669.15625, "completions/mean_terminated_length": 645.5000610351562, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 2.558641975308642, "grad_norm": 2.597076161518784, "kl": 0.2498779296875, "learning_rate": 2.5324948714657287e-07, "loss": -0.2013, "num_tokens": 23238795.0, "reward": 0.03445756062865257, "reward_std": 0.028805706650018692, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0035639547277241945, "rewards/logprob_reward/std": 0.0068948217667639256, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 705.4375, "completions/mean_terminated_length": 672.4827270507812, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 2.5617283950617287, "grad_norm": 3.653506477903024, "kl": 0.21246337890625, "learning_rate": 2.527495880382259e-07, "loss": -0.4878, "num_tokens": 23268021.0, "reward": 0.01364973746240139, "reward_std": 0.026206474751234055, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0012774853967130184, "rewards/logprob_reward/std": 0.0029412677977234125, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 724.375, "completions/mean_terminated_length": 724.375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 2.564814814814815, "grad_norm": 2.309741392394042, "kl": 0.2120361328125, "learning_rate": 2.522496779343819e-07, "loss": -0.1435, "num_tokens": 23297769.0, "reward": 0.03863968327641487, "reward_std": 0.05230659246444702, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012663148809224367, "rewards/logprob_reward/std": 0.002999991411343217, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 722.1875, "completions/mean_terminated_length": 690.9655151367188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.567901234567901, "grad_norm": 3.3358968568484353, "kl": 0.216064453125, "learning_rate": 2.5174975883416237e-07, "loss": -0.5637, "num_tokens": 23327459.0, "reward": 0.03565439581871033, "reward_std": 0.047280535101890564, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001421550172381103, "rewards/logprob_reward/std": 0.003314180066809058, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 715.46875, "completions/mean_terminated_length": 694.9000244140625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 2.5709876543209877, "grad_norm": 2.702658914359402, "kl": 0.212890625, "learning_rate": 2.512498327367245e-07, "loss": -0.4353, "num_tokens": 23356742.0, "reward": 0.03318271040916443, "reward_std": 0.04089856147766113, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002147453837096691, "rewards/logprob_reward/std": 0.003950577694922686, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 719.15625, "completions/mean_terminated_length": 698.8333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 2.574074074074074, "grad_norm": 3.3366556553222115, "kl": 0.224365234375, "learning_rate": 2.5074990164125355e-07, "loss": -0.4936, "num_tokens": 23386711.0, "reward": 0.020330991595983505, "reward_std": 0.03460453450679779, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001756657613441348, "rewards/logprob_reward/std": 0.003462655935436487, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 708.96875, "completions/mean_terminated_length": 676.3793334960938, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 2.5771604938271606, "grad_norm": 2.6519424685060122, "kl": 0.2386474609375, "learning_rate": 2.502499675469547e-07, "loss": -0.3995, "num_tokens": 23416294.0, "reward": 0.032521802932024, "reward_std": 0.04797051474452019, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014131118077784777, "rewards/logprob_reward/std": 0.003906686324626207, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 693.4375, "completions/mean_terminated_length": 682.774169921875, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 2.580246913580247, "grad_norm": 3.67101673480092, "kl": 0.23876953125, "learning_rate": 2.497500324530453e-07, "loss": -0.4025, "num_tokens": 23444636.0, "reward": 0.02910676598548889, "reward_std": 0.04685968905687332, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0010908524272963405, "rewards/logprob_reward/std": 0.002023884328082204, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 668.28125, "completions/mean_terminated_length": 656.8064575195312, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 2.5833333333333335, "grad_norm": 3.253404379602226, "kl": 0.2213134765625, "learning_rate": 2.4925009835874643e-07, "loss": -0.3962, "num_tokens": 23472589.0, "reward": 0.02016962505877018, "reward_std": 0.02743956819176674, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001577360788360238, "rewards/logprob_reward/std": 0.002759169088676572, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 671.65625, "completions/mean_terminated_length": 660.290283203125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 2.5864197530864197, "grad_norm": 2.1602120687780175, "kl": 0.2156982421875, "learning_rate": 2.4875016726327555e-07, "loss": -0.1269, "num_tokens": 23500206.0, "reward": 0.026714034378528595, "reward_std": 0.039035614579916, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.001904481090605259, "rewards/logprob_reward/std": 0.00641883397474885, "step": 838 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 740.625, "completions/mean_terminated_length": 700.1428833007812, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 2.5895061728395063, "grad_norm": 2.3325872334653863, "kl": NaN, "learning_rate": 2.482502411658376e-07, "loss": -0.3661, "num_tokens": 23530850.0, "reward": 0.0106188440695405, "reward_std": 0.015147138386964798, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.001382049173116684, "rewards/logprob_reward/std": 0.002947068540379405, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 782.4375, "completions/mean_terminated_length": 714.7999877929688, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 2.5925925925925926, "grad_norm": 2.544350818063004, "kl": 0.2039794921875, "learning_rate": 2.477503220656181e-07, "loss": -0.3391, "num_tokens": 23562580.0, "reward": 0.010148586705327034, "reward_std": 0.019542181864380836, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.2961445748806, "rewards/logprob_reward/mean": 0.0008595405961386859, "rewards/logprob_reward/std": 0.0020047107245773077, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 718.0, "completions/mean_terminated_length": 674.2857666015625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.5956790123456788, "grad_norm": 1.9610370735118756, "kl": 0.2322998046875, "learning_rate": 2.472504119617742e-07, "loss": -0.2352, "num_tokens": 23592020.0, "reward": 0.05466719716787338, "reward_std": 0.03430184721946716, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0017135508824139833, "rewards/logprob_reward/std": 0.00216072634793818, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 674.125, "completions/mean_terminated_length": 674.125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.5987654320987654, "grad_norm": 2.7523464152444657, "kl": 0.2215576171875, "learning_rate": 2.4675051285342716e-07, "loss": -0.4303, "num_tokens": 23619552.0, "reward": 0.026860298588871956, "reward_std": 0.03700836002826691, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.002066997578367591, "rewards/logprob_reward/std": 0.00514443451538682, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 703.84375, "completions/mean_terminated_length": 693.51611328125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 2.601851851851852, "grad_norm": 2.214071112523842, "kl": 0.224609375, "learning_rate": 2.462506267396543e-07, "loss": -0.2276, "num_tokens": 23648431.0, "reward": 0.025821613147854805, "reward_std": 0.039692893624305725, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0009129041573032737, "rewards/logprob_reward/std": 0.0021135048009455204, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 718.78125, "completions/mean_terminated_length": 698.433349609375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.6049382716049383, "grad_norm": 2.786763201629874, "kl": 0.2379150390625, "learning_rate": 2.45750755619481e-07, "loss": -0.4413, "num_tokens": 23677764.0, "reward": 0.03020767867565155, "reward_std": 0.04247771203517914, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0023140886332839727, "rewards/logprob_reward/std": 0.005226579960435629, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 813.84375, "completions/mean_terminated_length": 774.9259033203125, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 2.6080246913580245, "grad_norm": 1.7725783149245817, "kl": 0.221923828125, "learning_rate": 2.452509014918728e-07, "loss": -0.0991, "num_tokens": 23711003.0, "reward": 0.016469093039631844, "reward_std": 0.026728259399533272, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0009378820541314781, "rewards/logprob_reward/std": 0.0023578214459121227, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 756.65625, "completions/mean_terminated_length": 748.0322265625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.611111111111111, "grad_norm": 2.220694192968226, "kl": 0.22216796875, "learning_rate": 2.4475106635572696e-07, "loss": -0.1817, "num_tokens": 23741380.0, "reward": 0.01998922973871231, "reward_std": 0.0389016717672348, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001376922708004713, "rewards/logprob_reward/std": 0.003074995242059231, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 743.71875, "completions/mean_terminated_length": 734.6773681640625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.6141975308641974, "grad_norm": 2.4805680619076704, "kl": 0.2080078125, "learning_rate": 2.4425125220986503e-07, "loss": -0.4004, "num_tokens": 23771663.0, "reward": 0.03915968909859657, "reward_std": 0.05278148874640465, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0018440973944962025, "rewards/logprob_reward/std": 0.003013347741216421, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 736.8125, "completions/mean_terminated_length": 695.7857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.617283950617284, "grad_norm": 2.5417168398216003, "kl": 0.20831298828125, "learning_rate": 2.437514610530246e-07, "loss": -0.474, "num_tokens": 23801965.0, "reward": 0.033397383987903595, "reward_std": 0.03566242754459381, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0023859834764152765, "rewards/logprob_reward/std": 0.0035746481735259295, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 673.34375, "completions/mean_terminated_length": 662.0322265625, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 2.6203703703703702, "grad_norm": 2.3381554960645095, "kl": 0.223876953125, "learning_rate": 2.4325169488385137e-07, "loss": -0.2657, "num_tokens": 23829732.0, "reward": 0.032207190990448, "reward_std": 0.038205452263355255, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0010635434882715344, "rewards/logprob_reward/std": 0.001865375554189086, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 691.75, "completions/mean_terminated_length": 681.0322265625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 2.623456790123457, "grad_norm": 1.986900300570951, "kl": 0.249755859375, "learning_rate": 2.4275195570089083e-07, "loss": -0.1838, "num_tokens": 23858240.0, "reward": 0.05186006799340248, "reward_std": 0.046564631164073944, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0020667435601353645, "rewards/logprob_reward/std": 0.004226405173540115, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 594.40625, "completions/mean_terminated_length": 580.54833984375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.626543209876543, "grad_norm": 3.1367713294332105, "kl": 0.2412109375, "learning_rate": 2.42252245502581e-07, "loss": -0.2543, "num_tokens": 23883045.0, "reward": 0.03827198967337608, "reward_std": 0.054491765797138214, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0008577638072893023, "rewards/logprob_reward/std": 0.001706449780613184, "step": 851 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 743.625, "completions/mean_terminated_length": 714.6206665039062, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 2.6296296296296298, "grad_norm": 1.8859280096829483, "kl": NaN, "learning_rate": 2.417525662872436e-07, "loss": -0.0967, "num_tokens": 23913497.0, "reward": 0.030198421329259872, "reward_std": 0.03990299627184868, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002303801476955414, "rewards/logprob_reward/std": 0.0065719569101929665, "step": 852 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 749.875, "completions/mean_terminated_length": 721.5172119140625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 2.632716049382716, "grad_norm": 1.8682637632133128, "kl": NaN, "learning_rate": 2.412529200530767e-07, "loss": -0.0972, "num_tokens": 23944309.0, "reward": 0.020021267235279083, "reward_std": 0.03287472575902939, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0014125206507742405, "rewards/logprob_reward/std": 0.0031592841260135174, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 673.03125, "completions/mean_terminated_length": 622.8928833007812, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 2.6358024691358026, "grad_norm": 2.147540991945435, "kl": 0.2291259765625, "learning_rate": 2.407533087981463e-07, "loss": -0.1666, "num_tokens": 23971974.0, "reward": 0.04172077775001526, "reward_std": 0.047027163207530975, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.008161972276866436, "rewards/logprob_reward/std": 0.0175464004278183, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 744.46875, "completions/mean_terminated_length": 735.4515991210938, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.638888888888889, "grad_norm": 1.9951436098050117, "kl": 0.2181396484375, "learning_rate": 2.4025373452037865e-07, "loss": -0.0855, "num_tokens": 24002289.0, "reward": 0.02945689484477043, "reward_std": 0.040991902351379395, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0014798822812736034, "rewards/logprob_reward/std": 0.002451105508953333, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 772.28125, "completions/mean_terminated_length": 746.2413940429688, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.6419753086419755, "grad_norm": 2.1854130021683753, "kl": 0.199951171875, "learning_rate": 2.3975419921755215e-07, "loss": -0.1726, "num_tokens": 24033878.0, "reward": 0.038735829293727875, "reward_std": 0.03313373401761055, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0013731422368437052, "rewards/logprob_reward/std": 0.002503145020455122, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 691.5625, "completions/mean_terminated_length": 669.4000244140625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.6450617283950617, "grad_norm": 2.2495599348922664, "kl": 0.2222900390625, "learning_rate": 2.3925470488728935e-07, "loss": -0.1941, "num_tokens": 24062368.0, "reward": 0.036605220288038254, "reward_std": 0.04203525185585022, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0024780225940048695, "rewards/logprob_reward/std": 0.004517507739365101, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 662.5, "completions/mean_terminated_length": 662.5, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.648148148148148, "grad_norm": 2.343706151366428, "kl": 0.2281494140625, "learning_rate": 2.3875525352704866e-07, "loss": -0.1816, "num_tokens": 24090336.0, "reward": 0.050402089953422546, "reward_std": 0.039860792458057404, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003918987698853016, "rewards/logprob_reward/std": 0.0071139344945549965, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 803.0, "completions/mean_terminated_length": 780.137939453125, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 2.6512345679012346, "grad_norm": 2.090071290877971, "kl": 0.214111328125, "learning_rate": 2.38255847134117e-07, "loss": -0.2867, "num_tokens": 24123040.0, "reward": 0.039301685988903046, "reward_std": 0.04914182424545288, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0020018750801682472, "rewards/logprob_reward/std": 0.005134669132530689, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 733.625, "completions/mean_terminated_length": 714.2667236328125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.6543209876543212, "grad_norm": 2.1483396464727327, "kl": 0.2054443359375, "learning_rate": 2.3775648770560126e-07, "loss": -0.236, "num_tokens": 24153412.0, "reward": 0.023131113499403, "reward_std": 0.03914899379014969, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0013956804759800434, "rewards/logprob_reward/std": 0.004638405051082373, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 698.53125, "completions/mean_terminated_length": 698.53125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.6574074074074074, "grad_norm": 2.0154457417961686, "kl": 0.23779296875, "learning_rate": 2.3725717723842066e-07, "loss": -0.2103, "num_tokens": 24182273.0, "reward": 0.03179170563817024, "reward_std": 0.03794276341795921, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.004074117634445429, "rewards/logprob_reward/std": 0.007332983892410994, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 694.5625, "completions/mean_terminated_length": 660.4827270507812, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.6604938271604937, "grad_norm": 2.035982964371428, "kl": 0.2576904296875, "learning_rate": 2.3675791772929862e-07, "loss": -0.1806, "num_tokens": 24211475.0, "reward": 0.03286074101924896, "reward_std": 0.0356038361787796, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0017897089710459113, "rewards/logprob_reward/std": 0.0033139868173748255, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 687.03125, "completions/mean_terminated_length": 664.5667114257812, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 2.6635802469135803, "grad_norm": 3.2998292410758596, "kl": 0.2371826171875, "learning_rate": 2.3625871117475466e-07, "loss": -0.4995, "num_tokens": 24239900.0, "reward": 0.04468598589301109, "reward_std": 0.052429646253585815, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0010399832390248775, "rewards/logprob_reward/std": 0.002016523154452443, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 741.375, "completions/mean_terminated_length": 712.137939453125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 2.6666666666666665, "grad_norm": 2.566695161347095, "kl": 0.25341796875, "learning_rate": 2.357595595710967e-07, "loss": -0.2856, "num_tokens": 24270364.0, "reward": 0.019286027178168297, "reward_std": 0.025570526719093323, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0005955855594947934, "rewards/logprob_reward/std": 0.0013773522805422544, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 741.8125, "completions/mean_terminated_length": 723.0000610351562, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.669753086419753, "grad_norm": 2.1976330674905262, "kl": 0.2265625, "learning_rate": 2.3526046491441277e-07, "loss": -0.2576, "num_tokens": 24300518.0, "reward": 0.036166466772556305, "reward_std": 0.0490657202899456, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001990516670048237, "rewards/logprob_reward/std": 0.003708133241161704, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 716.40625, "completions/mean_terminated_length": 659.4444580078125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 2.6728395061728394, "grad_norm": 2.320208401868147, "kl": 0.2210693359375, "learning_rate": 2.3476142920056315e-07, "loss": -0.1642, "num_tokens": 24330003.0, "reward": 0.04185473173856735, "reward_std": 0.03403524309396744, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0013663668651133776, "rewards/logprob_reward/std": 0.0019420768367126584, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 679.96875, "completions/mean_terminated_length": 668.8709716796875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 2.675925925925926, "grad_norm": 2.3944614516156175, "kl": 0.2408447265625, "learning_rate": 2.3426245442517254e-07, "loss": -0.3096, "num_tokens": 24357998.0, "reward": 0.03848707303404808, "reward_std": 0.03939800336956978, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0010967478156089783, "rewards/logprob_reward/std": 0.002325907815247774, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 739.6875, "completions/mean_terminated_length": 720.7333984375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 2.6790123456790123, "grad_norm": 2.7347047246556753, "kl": 0.223388671875, "learning_rate": 2.3376354258362185e-07, "loss": -0.2714, "num_tokens": 24388320.0, "reward": 0.04037487506866455, "reward_std": 0.05308530479669571, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.003194308839738369, "rewards/logprob_reward/std": 0.006363058928400278, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 745.21875, "completions/mean_terminated_length": 726.6333618164062, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 2.682098765432099, "grad_norm": 1.995427564199402, "kl": 0.229248046875, "learning_rate": 2.3326469567104044e-07, "loss": -0.2556, "num_tokens": 24418655.0, "reward": 0.0455198772251606, "reward_std": 0.046911582350730896, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001966530457139015, "rewards/logprob_reward/std": 0.004726557061076164, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 731.84375, "completions/mean_terminated_length": 701.6206665039062, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 2.685185185185185, "grad_norm": 2.1619837369227914, "kl": 0.2369384765625, "learning_rate": 2.3276591568229787e-07, "loss": -0.1859, "num_tokens": 24448690.0, "reward": 0.031650789082050323, "reward_std": 0.03955671191215515, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00044532225001603365, "rewards/logprob_reward/std": 0.001193919568322599, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 691.03125, "completions/mean_terminated_length": 668.8333740234375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 2.6882716049382713, "grad_norm": 2.187276421691549, "kl": 0.258544921875, "learning_rate": 2.3226720461199626e-07, "loss": -0.2818, "num_tokens": 24477479.0, "reward": 0.04434952139854431, "reward_std": 0.0494009368121624, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0006661323131993413, "rewards/logprob_reward/std": 0.0018652883591130376, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 735.0625, "completions/mean_terminated_length": 715.800048828125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 2.691358024691358, "grad_norm": 3.1963819380802896, "kl": 0.20294189453125, "learning_rate": 2.3176856445446187e-07, "loss": -0.4824, "num_tokens": 24507285.0, "reward": 0.04160226508975029, "reward_std": 0.04009556770324707, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0010858499445021152, "rewards/logprob_reward/std": 0.0035910236183553934, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 655.5625, "completions/mean_terminated_length": 643.6774291992188, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 2.6944444444444446, "grad_norm": 2.311400684915605, "kl": 0.2564697265625, "learning_rate": 2.3126999720373757e-07, "loss": -0.2976, "num_tokens": 24534211.0, "reward": 0.044940121471881866, "reward_std": 0.04768058657646179, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0013223541900515556, "rewards/logprob_reward/std": 0.0023013753816485405, "step": 873 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 727.3125, "completions/mean_terminated_length": 717.741943359375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 2.697530864197531, "grad_norm": 2.054519713179906, "kl": NaN, "learning_rate": 2.3077150485357477e-07, "loss": -0.2575, "num_tokens": 24564097.0, "reward": 0.016626127064228058, "reward_std": 0.02705797366797924, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0011123641161248088, "rewards/logprob_reward/std": 0.0025833887048065662, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 692.96875, "completions/mean_terminated_length": 670.9000244140625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 2.700617283950617, "grad_norm": 2.910370041402984, "kl": 0.2257080078125, "learning_rate": 2.3027308939742502e-07, "loss": -0.4352, "num_tokens": 24592792.0, "reward": 0.04235769063234329, "reward_std": 0.04238816350698471, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0019252125639468431, "rewards/logprob_reward/std": 0.0034259967505931854, "step": 875 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 761.40625, "completions/mean_terminated_length": 687.8800048828125, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 2.7037037037037037, "grad_norm": 1.455819713215063, "kl": NaN, "learning_rate": 2.2977475282843266e-07, "loss": -0.2113, "num_tokens": 24623477.0, "reward": 0.019184719771146774, "reward_std": 0.02016684226691723, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0004830232937820256, "rewards/logprob_reward/std": 0.0013851220719516277, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 681.5, "completions/mean_terminated_length": 658.6666870117188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 2.7067901234567904, "grad_norm": 1.8719776724399728, "kl": 0.2705078125, "learning_rate": 2.292764971394265e-07, "loss": -0.2832, "num_tokens": 24652041.0, "reward": 0.031855180859565735, "reward_std": 0.03468414396047592, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0006724251434206963, "rewards/logprob_reward/std": 0.001641817856580019, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 685.75, "completions/mean_terminated_length": 685.75, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 2.7098765432098766, "grad_norm": 2.0589673508480195, "kl": 0.2218017578125, "learning_rate": 2.2877832432291188e-07, "loss": -0.1418, "num_tokens": 24680525.0, "reward": 0.033176496624946594, "reward_std": 0.04296921193599701, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002140550408512354, "rewards/logprob_reward/std": 0.004232995677739382, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 724.21875, "completions/mean_terminated_length": 668.7037353515625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 2.712962962962963, "grad_norm": 2.119981505262375, "kl": 0.23681640625, "learning_rate": 2.2828023637106273e-07, "loss": -0.1436, "num_tokens": 24710444.0, "reward": 0.020132753998041153, "reward_std": 0.03874822333455086, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001536392024718225, "rewards/logprob_reward/std": 0.0035032329615205526, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 752.5625, "completions/mean_terminated_length": 713.7857666015625, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 2.7160493827160495, "grad_norm": 3.2460424909186703, "kl": 0.2237548828125, "learning_rate": 2.2778223527571362e-07, "loss": -0.462, "num_tokens": 24741258.0, "reward": 0.03546038269996643, "reward_std": 0.03611791878938675, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0012059778673574328, "rewards/logprob_reward/std": 0.002610138850286603, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 711.5625, "completions/mean_terminated_length": 679.2413940429688, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 2.7191358024691357, "grad_norm": 1.6212524916976438, "kl": 0.27587890625, "learning_rate": 2.2728432302835183e-07, "loss": -0.0666, "num_tokens": 24770404.0, "reward": 0.016240714117884636, "reward_std": 0.02671341598033905, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0006841267459094524, "rewards/logprob_reward/std": 0.0019257022067904472, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 743.875, "completions/mean_terminated_length": 692.0, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 2.7222222222222223, "grad_norm": 2.0606874675581333, "kl": 0.203125, "learning_rate": 2.2678650162010937e-07, "loss": -0.1637, "num_tokens": 24800956.0, "reward": 0.025701280683279037, "reward_std": 0.04016866162419319, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0007791995303705335, "rewards/logprob_reward/std": 0.0015462575247511268, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 747.03125, "completions/mean_terminated_length": 728.5667114257812, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 2.7253086419753085, "grad_norm": 2.542278374391496, "kl": 0.2138671875, "learning_rate": 2.2628877304175472e-07, "loss": -0.1805, "num_tokens": 24831345.0, "reward": 0.039266541600227356, "reward_std": 0.03604422137141228, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0019628223963081837, "rewards/logprob_reward/std": 0.0035051226150244474, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 720.8125, "completions/mean_terminated_length": 677.5, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 2.728395061728395, "grad_norm": 1.6664721271423677, "kl": 0.21124267578125, "learning_rate": 2.2579113928368548e-07, "loss": -0.0488, "num_tokens": 24860695.0, "reward": 0.037032563239336014, "reward_std": 0.03367120772600174, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0029528478626161814, "rewards/logprob_reward/std": 0.005151271354407072, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 700.625, "completions/mean_terminated_length": 679.0667114257812, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 2.7314814814814814, "grad_norm": 2.40551840197583, "kl": 0.2275390625, "learning_rate": 2.2529360233591997e-07, "loss": -0.2004, "num_tokens": 24889427.0, "reward": 0.0368088036775589, "reward_std": 0.04108230024576187, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0027042264118790627, "rewards/logprob_reward/std": 0.004673474468290806, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 687.625, "completions/mean_terminated_length": 625.3333129882812, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.734567901234568, "grad_norm": 1.9828080778962243, "kl": 0.2723388671875, "learning_rate": 2.2479616418808915e-07, "loss": -0.1018, "num_tokens": 24918295.0, "reward": 0.041648536920547485, "reward_std": 0.04729478806257248, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0046094865538179874, "rewards/logprob_reward/std": 0.01932183839380741, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 709.3333740234375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 2.7376543209876543, "grad_norm": 3.843452394623579, "kl": 0.23779296875, "learning_rate": 2.242988268294292e-07, "loss": -0.3611, "num_tokens": 24948303.0, "reward": 0.038460731506347656, "reward_std": 0.04706891253590584, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0010674791410565376, "rewards/logprob_reward/std": 0.0017414541216567159, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 713.0, "completions/mean_terminated_length": 668.5714721679688, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 2.7407407407407405, "grad_norm": 1.9014996345376378, "kl": 0.23095703125, "learning_rate": 2.23801592248773e-07, "loss": -0.0997, "num_tokens": 24977355.0, "reward": 0.04267347976565361, "reward_std": 0.04828355461359024, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0022760871797800064, "rewards/logprob_reward/std": 0.00266113318502903, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 734.125, "completions/mean_terminated_length": 714.800048828125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 2.743827160493827, "grad_norm": 2.6287424716050847, "kl": 0.1951904296875, "learning_rate": 2.2330446243454265e-07, "loss": -0.4275, "num_tokens": 25007187.0, "reward": 0.04790825769305229, "reward_std": 0.05132868513464928, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001148065086454153, "rewards/logprob_reward/std": 0.002382042817771435, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 720.65625, "completions/mean_terminated_length": 700.433349609375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 2.746913580246914, "grad_norm": 2.1345033147595203, "kl": 0.2271728515625, "learning_rate": 2.228074393747412e-07, "loss": -0.2181, "num_tokens": 25036912.0, "reward": 0.03855757787823677, "reward_std": 0.04170295223593712, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00117508287075907, "rewards/logprob_reward/std": 0.002204425632953644, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 752.15625, "completions/mean_terminated_length": 724.0344848632812, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 2.75, "grad_norm": 2.1455061021486355, "kl": 0.222900390625, "learning_rate": 2.2231052505694458e-07, "loss": -0.0601, "num_tokens": 25067697.0, "reward": 0.02888527512550354, "reward_std": 0.03874707967042923, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0008447513682767749, "rewards/logprob_reward/std": 0.0017420414369553328, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 756.03125, "completions/mean_terminated_length": 706.4074096679688, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 2.753086419753086, "grad_norm": 1.9247640801259118, "kl": 0.2283935546875, "learning_rate": 2.2181372146829418e-07, "loss": -0.1753, "num_tokens": 25098878.0, "reward": 0.0382111594080925, "reward_std": 0.040418513119220734, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.000790177786257118, "rewards/logprob_reward/std": 0.002555250423029065, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 743.40625, "completions/mean_terminated_length": 724.7000122070312, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 2.756172839506173, "grad_norm": 3.2209369553822227, "kl": 0.2071533203125, "learning_rate": 2.213170305954884e-07, "loss": -0.5666, "num_tokens": 25129303.0, "reward": 0.045907195657491684, "reward_std": 0.049174029380083084, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0023968853056430817, "rewards/logprob_reward/std": 0.005156919360160828, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 685.5, "completions/mean_terminated_length": 674.5806274414062, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 2.7592592592592595, "grad_norm": 2.136325788844036, "kl": 0.22412109375, "learning_rate": 2.2082045442477497e-07, "loss": -0.0527, "num_tokens": 25157715.0, "reward": 0.06102164462208748, "reward_std": 0.046311501413583755, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0018296046182513237, "rewards/logprob_reward/std": 0.003303486853837967, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 695.0625, "completions/mean_terminated_length": 695.0625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 2.7623456790123457, "grad_norm": 2.0556344749647875, "kl": 0.2237548828125, "learning_rate": 2.2032399494194292e-07, "loss": -0.0831, "num_tokens": 25186289.0, "reward": 0.041386574506759644, "reward_std": 0.04076617956161499, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0008461924735456705, "rewards/logprob_reward/std": 0.0014869216829538345, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 734.375, "completions/mean_terminated_length": 704.413818359375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 2.765432098765432, "grad_norm": 2.7409042385817526, "kl": 0.2484130859375, "learning_rate": 2.1982765413231466e-07, "loss": -0.3198, "num_tokens": 25216233.0, "reward": 0.036085888743400574, "reward_std": 0.04144894704222679, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0019009875832125545, "rewards/logprob_reward/std": 0.0037570588756352663, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 746.65625, "completions/mean_terminated_length": 695.2963256835938, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 2.7685185185185186, "grad_norm": 1.8348602353578563, "kl": 0.2242431640625, "learning_rate": 2.1933143398073805e-07, "loss": -0.0908, "num_tokens": 25246846.0, "reward": 0.02923869527876377, "reward_std": 0.04143529012799263, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.001237439108081162, "rewards/logprob_reward/std": 0.002240176545456052, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 716.78125, "completions/mean_terminated_length": 716.78125, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 2.771604938271605, "grad_norm": 1.7030290514235649, "kl": 0.2056884765625, "learning_rate": 2.1883533647157828e-07, "loss": -0.0736, "num_tokens": 25276579.0, "reward": 0.05580006539821625, "reward_std": 0.047913894057273865, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0029722945764660835, "rewards/logprob_reward/std": 0.004684407729655504, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 748.9375, "completions/mean_terminated_length": 709.6428833007812, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 2.7746913580246915, "grad_norm": 1.980861417160589, "kl": 0.213134765625, "learning_rate": 2.1833936358871045e-07, "loss": -0.238, "num_tokens": 25306981.0, "reward": 0.04480702057480812, "reward_std": 0.045569345355033875, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011744651710614562, "rewards/logprob_reward/std": 0.002723806072026491, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 705.53125, "completions/mean_terminated_length": 672.586181640625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 2.7777777777777777, "grad_norm": 2.1573868648030263, "kl": 0.236328125, "learning_rate": 2.1784351731551077e-07, "loss": -0.0647, "num_tokens": 25335726.0, "reward": 0.03806355968117714, "reward_std": 0.0521421954035759, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0006261759554035962, "rewards/logprob_reward/std": 0.0018277488416060805, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 727.90625, "completions/mean_terminated_length": 697.27587890625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.7808641975308643, "grad_norm": 3.639874262202838, "kl": 0.2315673828125, "learning_rate": 2.1734779963484959e-07, "loss": -0.3523, "num_tokens": 25366059.0, "reward": 0.03266870230436325, "reward_std": 0.05293484777212143, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0015763344708830118, "rewards/logprob_reward/std": 0.0033819256350398064, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 744.6875, "completions/mean_terminated_length": 704.7857666015625, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 2.7839506172839505, "grad_norm": 2.2011254841985393, "kl": 0.223388671875, "learning_rate": 2.1685221252908282e-07, "loss": -0.3663, "num_tokens": 25396109.0, "reward": 0.03815517947077751, "reward_std": 0.04741198942065239, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007279774872586131, "rewards/logprob_reward/std": 0.0016141906380653381, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 789.53125, "completions/mean_terminated_length": 746.1111450195312, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 2.787037037037037, "grad_norm": 1.5647770721778727, "kl": 0.2010498046875, "learning_rate": 2.163567579800443e-07, "loss": 0.0294, "num_tokens": 25428246.0, "reward": 0.02891172468662262, "reward_std": 0.01902131177484989, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0008741386118344963, "rewards/logprob_reward/std": 0.002331784460693598, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 653.4375, "completions/mean_terminated_length": 641.4838256835938, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 2.7901234567901234, "grad_norm": 1.7748957652190511, "kl": 0.2374267578125, "learning_rate": 2.1586143796903775e-07, "loss": -0.1707, "num_tokens": 25455468.0, "reward": 0.04110602289438248, "reward_std": 0.04065663367509842, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0005344680976122618, "rewards/logprob_reward/std": 0.0008938212413340807, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 684.09375, "completions/mean_terminated_length": 673.1290283203125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 2.7932098765432096, "grad_norm": 2.0471774127617355, "kl": 0.234375, "learning_rate": 2.1536625447682877e-07, "loss": -0.1063, "num_tokens": 25483795.0, "reward": 0.02925974503159523, "reward_std": 0.034787438809871674, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0012608272954821587, "rewards/logprob_reward/std": 0.002463593613356352, "step": 905 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 690.53125, "completions/mean_terminated_length": 656.0344848632812, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 2.7962962962962963, "grad_norm": 2.0512353520988977, "kl": NaN, "learning_rate": 2.1487120948363713e-07, "loss": -0.06, "num_tokens": 25512088.0, "reward": 0.023405537009239197, "reward_std": 0.0352381132543087, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0017005959525704384, "rewards/logprob_reward/std": 0.006216020323336124, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 708.15625, "completions/mean_terminated_length": 687.1000366210938, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.799382716049383, "grad_norm": 1.8893150159550898, "kl": 0.24169921875, "learning_rate": 2.1437630496912889e-07, "loss": -0.1146, "num_tokens": 25540801.0, "reward": 0.04442692548036575, "reward_std": 0.034866563975811005, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0007521397201344371, "rewards/logprob_reward/std": 0.001434684731066227, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 721.09375, "completions/mean_terminated_length": 700.9000244140625, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 2.802469135802469, "grad_norm": 2.020542494468501, "kl": 0.2286376953125, "learning_rate": 2.1388154291240794e-07, "loss": -0.2745, "num_tokens": 25569920.0, "reward": 0.05134275183081627, "reward_std": 0.05257630720734596, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0014919439563527703, "rewards/logprob_reward/std": 0.0018537379801273346, "step": 908 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 707.40625, "completions/mean_terminated_length": 674.6551513671875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 2.8055555555555554, "grad_norm": 1.8797137190535893, "kl": NaN, "learning_rate": 2.133869252920089e-07, "loss": -0.1527, "num_tokens": 25599417.0, "reward": 0.03823522478342056, "reward_std": 0.04038812220096588, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0008169173379428685, "rewards/logprob_reward/std": 0.0017048893496394157, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 727.09375, "completions/mean_terminated_length": 696.3793334960938, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 2.808641975308642, "grad_norm": 1.8310669285933219, "kl": 0.2655029296875, "learning_rate": 2.128924540858885e-07, "loss": -0.0971, "num_tokens": 25629592.0, "reward": 0.019925329834222794, "reward_std": 0.033746667206287384, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001305922050960362, "rewards/logprob_reward/std": 0.002956134732812643, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 701.96875, "completions/mean_terminated_length": 691.5806274414062, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 2.8117283950617287, "grad_norm": 2.176586044774021, "kl": 0.244873046875, "learning_rate": 2.1239813127141828e-07, "loss": -0.1411, "num_tokens": 25658383.0, "reward": 0.022919094190001488, "reward_std": 0.039000414311885834, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.001160103245638311, "rewards/logprob_reward/std": 0.0021208145190030336, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 740.8125, "completions/mean_terminated_length": 700.357177734375, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 2.814814814814815, "grad_norm": 2.4825655831882703, "kl": 0.2076416015625, "learning_rate": 2.1190395882537598e-07, "loss": -0.1728, "num_tokens": 25688761.0, "reward": 0.03592553734779358, "reward_std": 0.043933987617492676, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0017228213837370276, "rewards/logprob_reward/std": 0.0035046476405113935, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 778.71875, "completions/mean_terminated_length": 753.3448486328125, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 2.817901234567901, "grad_norm": 1.8471597814389293, "kl": 0.1898193359375, "learning_rate": 2.1140993872393833e-07, "loss": -0.0453, "num_tokens": 25720608.0, "reward": 0.03514767065644264, "reward_std": 0.0339820571243763, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008585240575484931, "rewards/logprob_reward/std": 0.002222515409812331, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 770.125, "completions/mean_terminated_length": 723.1111450195312, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 2.8209876543209877, "grad_norm": 1.4505306512388436, "kl": 0.2109375, "learning_rate": 2.1091607294267269e-07, "loss": -0.0187, "num_tokens": 25751912.0, "reward": 0.020330917090177536, "reward_std": 0.02412344701588154, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0017565754242241383, "rewards/logprob_reward/std": 0.004126360174268484, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 644.8125, "completions/mean_terminated_length": 619.5333862304688, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.824074074074074, "grad_norm": 2.6444669946440027, "kl": 0.23681640625, "learning_rate": 2.1042236345652947e-07, "loss": -0.2378, "num_tokens": 25778326.0, "reward": 0.032222680747509, "reward_std": 0.026036633178591728, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0010807574726641178, "rewards/logprob_reward/std": 0.0017917260993272066, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 720.8125, "completions/mean_terminated_length": 700.6000366210938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.8271604938271606, "grad_norm": 2.3359892011606327, "kl": 0.2286376953125, "learning_rate": 2.0992881223983368e-07, "loss": -0.1425, "num_tokens": 25808068.0, "reward": 0.03367864713072777, "reward_std": 0.039221081882715225, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0026984962169080973, "rewards/logprob_reward/std": 0.0053538489155471325, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 743.4375, "completions/mean_terminated_length": 703.357177734375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.830246913580247, "grad_norm": 1.8391771393551033, "kl": 0.247802734375, "learning_rate": 2.0943542126627784e-07, "loss": -0.1331, "num_tokens": 25838754.0, "reward": 0.03146309033036232, "reward_std": 0.045010365545749664, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.00023676696582697332, "rewards/logprob_reward/std": 0.0008217405993491411, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 783.78125, "completions/mean_terminated_length": 749.4642944335938, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.8333333333333335, "grad_norm": 1.8408733626005627, "kl": 0.1705322265625, "learning_rate": 2.0894219250891352e-07, "loss": -0.1964, "num_tokens": 25870683.0, "reward": 0.04172273352742195, "reward_std": 0.04826078563928604, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0012197005562484264, "rewards/logprob_reward/std": 0.0017729178071022034, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 694.75, "completions/mean_terminated_length": 672.800048828125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 2.8364197530864197, "grad_norm": 2.1710575076842136, "kl": 0.2451171875, "learning_rate": 2.0844912794014341e-07, "loss": -0.2082, "num_tokens": 25899011.0, "reward": 0.019306883215904236, "reward_std": 0.02787940949201584, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0006187591934576631, "rewards/logprob_reward/std": 0.0014984877780079842, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 746.25, "completions/mean_terminated_length": 727.7333984375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 2.8395061728395063, "grad_norm": 2.0881691914331175, "kl": 0.2122802734375, "learning_rate": 2.079562295317139e-07, "loss": -0.137, "num_tokens": 25929343.0, "reward": 0.02632749453186989, "reward_std": 0.03549928218126297, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0014749924885109067, "rewards/logprob_reward/std": 0.0022526613902300596, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 685.4375, "completions/mean_terminated_length": 674.51611328125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.8425925925925926, "grad_norm": 2.000703787788389, "kl": 0.2445068359375, "learning_rate": 2.0746349925470672e-07, "loss": -0.072, "num_tokens": 25957837.0, "reward": 0.0353454053401947, "reward_std": 0.040932249277830124, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001078227418474853, "rewards/logprob_reward/std": 0.0023284656926989555, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 678.03125, "completions/mean_terminated_length": 654.9666748046875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.8456790123456788, "grad_norm": 2.779900339886693, "kl": 0.2435302734375, "learning_rate": 2.0697093907953134e-07, "loss": -0.2733, "num_tokens": 25986354.0, "reward": 0.02944917231798172, "reward_std": 0.04230036959052086, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0014713001437485218, "rewards/logprob_reward/std": 0.0037846944760531187, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 648.78125, "completions/mean_terminated_length": 623.7667236328125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 2.8487654320987654, "grad_norm": 2.106664401008419, "kl": 0.24365234375, "learning_rate": 2.0647855097591704e-07, "loss": -0.2234, "num_tokens": 26013543.0, "reward": 0.03965634107589722, "reward_std": 0.05002221092581749, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002395935822278261, "rewards/logprob_reward/std": 0.0039562019519507885, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 680.96875, "completions/mean_terminated_length": 658.1000366210938, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 2.851851851851852, "grad_norm": 2.096405505444134, "kl": 0.2164306640625, "learning_rate": 2.0598633691290485e-07, "loss": -0.1971, "num_tokens": 26042054.0, "reward": 0.04500027745962143, "reward_std": 0.056555259972810745, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0013891983544453979, "rewards/logprob_reward/std": 0.0026214104145765305, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 671.59375, "completions/mean_terminated_length": 635.137939453125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 2.8549382716049383, "grad_norm": 1.8687028382292379, "kl": 0.2330322265625, "learning_rate": 2.054942988588399e-07, "loss": -0.059, "num_tokens": 26070213.0, "reward": 0.03630334883928299, "reward_std": 0.027717553079128265, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0021426090970635414, "rewards/logprob_reward/std": 0.003982920199632645, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 740.9375, "completions/mean_terminated_length": 731.8064575195312, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.8580246913580245, "grad_norm": 1.6972618165454312, "kl": 0.237548828125, "learning_rate": 2.050024387813634e-07, "loss": -0.2142, "num_tokens": 26100399.0, "reward": 0.03215242922306061, "reward_std": 0.03464379906654358, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0010027001844719052, "rewards/logprob_reward/std": 0.002128482097759843, "step": 926 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 753.71875, "completions/mean_terminated_length": 735.7000122070312, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 2.861111111111111, "grad_norm": 1.7761275671751802, "kl": NaN, "learning_rate": 2.0451075864740496e-07, "loss": -0.1009, "num_tokens": 26130846.0, "reward": 0.029843822121620178, "reward_std": 0.035880472511053085, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0019098015036433935, "rewards/logprob_reward/std": 0.0037320053670555353, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 747.5625, "completions/mean_terminated_length": 718.9655151367188, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 2.8641975308641974, "grad_norm": 1.7724801359206694, "kl": 0.217529296875, "learning_rate": 2.0401926042317455e-07, "loss": -0.1598, "num_tokens": 26161596.0, "reward": 0.032547809183597565, "reward_std": 0.027523929253220558, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014420116785913706, "rewards/logprob_reward/std": 0.002904426772147417, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 734.15625, "completions/mean_terminated_length": 692.7500610351562, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.867283950617284, "grad_norm": 1.7664998792321178, "kl": 0.225830078125, "learning_rate": 2.0352794607415465e-07, "loss": -0.0817, "num_tokens": 26191673.0, "reward": 0.04807525873184204, "reward_std": 0.04088424891233444, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0013336197007447481, "rewards/logprob_reward/std": 0.0023917900398373604, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 659.1875, "completions/mean_terminated_length": 647.4193115234375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 2.8703703703703702, "grad_norm": 2.7436686524222655, "kl": 0.2562255859375, "learning_rate": 2.0303681756509254e-07, "loss": -0.2881, "num_tokens": 26219299.0, "reward": 0.0265082735568285, "reward_std": 0.036000318825244904, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0016758590936660767, "rewards/logprob_reward/std": 0.004514663480222225, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 692.4375, "completions/mean_terminated_length": 670.3333740234375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 2.873456790123457, "grad_norm": 2.264409669674971, "kl": 0.2442626953125, "learning_rate": 2.0254587685999215e-07, "loss": -0.1219, "num_tokens": 26248205.0, "reward": 0.04572978615760803, "reward_std": 0.03910953179001808, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021997597068548203, "rewards/logprob_reward/std": 0.003756813472136855, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 736.875, "completions/mean_terminated_length": 727.6128540039062, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.876543209876543, "grad_norm": 1.8927253489778453, "kl": 0.231689453125, "learning_rate": 2.020551259221066e-07, "loss": -0.0791, "num_tokens": 26277901.0, "reward": 0.050919897854328156, "reward_std": 0.04152818024158478, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0010221096454188228, "rewards/logprob_reward/std": 0.00207647611387074, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 700.75, "completions/mean_terminated_length": 654.5714721679688, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 2.8796296296296298, "grad_norm": 2.6095332054392095, "kl": 0.22802734375, "learning_rate": 2.0156456671392988e-07, "loss": -0.3138, "num_tokens": 26306925.0, "reward": 0.026051906868815422, "reward_std": 0.03594221919775009, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0011687844526022673, "rewards/logprob_reward/std": 0.0023048915900290012, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 738.71875, "completions/mean_terminated_length": 709.2069091796875, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 2.882716049382716, "grad_norm": 2.3273980833688, "kl": 0.2119140625, "learning_rate": 2.010742011971895e-07, "loss": -0.1946, "num_tokens": 26337436.0, "reward": 0.01686321385204792, "reward_std": 0.032536305487155914, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0013757927808910608, "rewards/logprob_reward/std": 0.002666281070560217, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 683.15625, "completions/mean_terminated_length": 683.15625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 2.8858024691358026, "grad_norm": 4.617620683870493, "kl": 0.2593994140625, "learning_rate": 2.005840313328383e-07, "loss": -0.2423, "num_tokens": 26365493.0, "reward": 0.02862159162759781, "reward_std": 0.032037027180194855, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0005517672398127615, "rewards/logprob_reward/std": 0.001306802616454661, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 632.53125, "completions/mean_terminated_length": 632.53125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 2.888888888888889, "grad_norm": 3.2772853980728667, "kl": 0.247314453125, "learning_rate": 2.0009405908104673e-07, "loss": -0.3443, "num_tokens": 26391638.0, "reward": 0.047143395990133286, "reward_std": 0.045782119035720825, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0002982149017043412, "rewards/logprob_reward/std": 0.0007833261624909937, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 763.75, "completions/mean_terminated_length": 746.4000244140625, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 2.8919753086419755, "grad_norm": 2.43808565296315, "kl": 0.20361328125, "learning_rate": 1.996042864011951e-07, "loss": -0.2154, "num_tokens": 26422678.0, "reward": 0.025890544056892395, "reward_std": 0.0403691790997982, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0009894927497953176, "rewards/logprob_reward/std": 0.0026666016783565283, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 673.53125, "completions/mean_terminated_length": 623.4642944335938, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 2.8950617283950617, "grad_norm": 2.351329091261145, "kl": 0.24432373046875, "learning_rate": 1.9911471525186534e-07, "loss": -0.1476, "num_tokens": 26450139.0, "reward": 0.03514251857995987, "reward_std": 0.03566879779100418, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008527971804141998, "rewards/logprob_reward/std": 0.0015333736082538962, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 718.5625, "completions/mean_terminated_length": 718.5625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.898148148148148, "grad_norm": 2.3260060500535107, "kl": 0.2093505859375, "learning_rate": 1.9862534759083379e-07, "loss": -0.2907, "num_tokens": 26479697.0, "reward": 0.03334007412195206, "reward_std": 0.04166783392429352, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0023223059251904488, "rewards/logprob_reward/std": 0.004669366870075464, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 687.46875, "completions/mean_terminated_length": 687.46875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 2.9012345679012346, "grad_norm": 2.299405399407316, "kl": 0.2261962890625, "learning_rate": 1.9813618537506302e-07, "loss": -0.1793, "num_tokens": 26508116.0, "reward": 0.038836508989334106, "reward_std": 0.04531483352184296, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001485014334321022, "rewards/logprob_reward/std": 0.0030897411052137613, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 695.40625, "completions/mean_terminated_length": 673.5000610351562, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 2.9043209876543212, "grad_norm": 1.9386965966121879, "kl": 0.218017578125, "learning_rate": 1.9764723056069365e-07, "loss": -0.1481, "num_tokens": 26536901.0, "reward": 0.026116058230400085, "reward_std": 0.02804435044527054, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0012400643900036812, "rewards/logprob_reward/std": 0.0018863547593355179, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 694.75, "completions/mean_terminated_length": 672.800048828125, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 2.9074074074074074, "grad_norm": 2.2908048302294777, "kl": 0.231201171875, "learning_rate": 1.9715848510303739e-07, "loss": -0.2014, "num_tokens": 26565441.0, "reward": 0.04806693643331528, "reward_std": 0.04172935336828232, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001324373995885253, "rewards/logprob_reward/std": 0.002286948962137103, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 668.8125, "completions/mean_terminated_length": 657.3547973632812, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.9104938271604937, "grad_norm": 2.797264602347024, "kl": 0.22705078125, "learning_rate": 1.966699509565685e-07, "loss": -0.2226, "num_tokens": 26592935.0, "reward": 0.03517685830593109, "reward_std": 0.04654119908809662, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008909569587558508, "rewards/logprob_reward/std": 0.0019737931434065104, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 724.46875, "completions/mean_terminated_length": 655.34619140625, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 2.9135802469135803, "grad_norm": 2.2749502960424355, "kl": 0.25048828125, "learning_rate": 1.961816300749163e-07, "loss": -0.1714, "num_tokens": 26622870.0, "reward": 0.0398334376513958, "reward_std": 0.045474015176296234, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002592710079625249, "rewards/logprob_reward/std": 0.005250102840363979, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 650.375, "completions/mean_terminated_length": 638.3225708007812, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 2.9166666666666665, "grad_norm": 2.321613512476675, "kl": 0.2291259765625, "learning_rate": 1.9569352441085712e-07, "loss": -0.1206, "num_tokens": 26650066.0, "reward": 0.054110296070575714, "reward_std": 0.027386318892240524, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0010947741102427244, "rewards/logprob_reward/std": 0.002825903706252575, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 713.6000366210938, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 2.919753086419753, "grad_norm": 2.2568177731349293, "kl": 0.21337890625, "learning_rate": 1.9520563591630686e-07, "loss": -0.179, "num_tokens": 26679926.0, "reward": 0.04479778930544853, "reward_std": 0.04891182482242584, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011642095632851124, "rewards/logprob_reward/std": 0.002152997301891446, "step": 946 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 747.5, "completions/mean_terminated_length": 696.2963256835938, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 2.9228395061728394, "grad_norm": 2.7880005014153393, "kl": NaN, "learning_rate": 1.9471796654231278e-07, "loss": -0.2831, "num_tokens": 26710714.0, "reward": 0.022786159068346024, "reward_std": 0.02745286375284195, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0010123990941792727, "rewards/logprob_reward/std": 0.003126228228211403, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 647.21875, "completions/mean_terminated_length": 622.1000366210938, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 2.925925925925926, "grad_norm": 2.3418864750573807, "kl": 0.2344970703125, "learning_rate": 1.9423051823904602e-07, "loss": -0.1891, "num_tokens": 26738029.0, "reward": 0.03349651023745537, "reward_std": 0.045034341514110565, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002496121684089303, "rewards/logprob_reward/std": 0.005302882753312588, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 736.125, "completions/mean_terminated_length": 716.933349609375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 2.9290123456790123, "grad_norm": 2.5584406905157295, "kl": 0.2247314453125, "learning_rate": 1.9374329295579372e-07, "loss": -0.0229, "num_tokens": 26768945.0, "reward": 0.03358815610408783, "reward_std": 0.033007651567459106, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0025979499332606792, "rewards/logprob_reward/std": 0.00609669741243124, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 694.34375, "completions/mean_terminated_length": 672.36669921875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 2.932098765432099, "grad_norm": 2.0999638328254715, "kl": 0.22021484375, "learning_rate": 1.9325629264095083e-07, "loss": -0.0932, "num_tokens": 26797632.0, "reward": 0.0319441556930542, "reward_std": 0.039987243711948395, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0007712829392403364, "rewards/logprob_reward/std": 0.001864199643023312, "step": 950 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 741.0, "completions/mean_terminated_length": 700.5714721679688, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 2.935185185185185, "grad_norm": 2.85673139460347, "kl": NaN, "learning_rate": 1.9276951924201304e-07, "loss": -0.3032, "num_tokens": 26828492.0, "reward": 0.03654161095619202, "reward_std": 0.022315043956041336, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.002407343592494726, "rewards/logprob_reward/std": 0.003987497184425592, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 692.25, "completions/mean_terminated_length": 692.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 2.9382716049382713, "grad_norm": 2.587864809945036, "kl": 0.2215576171875, "learning_rate": 1.922829747055684e-07, "loss": -0.3074, "num_tokens": 26857088.0, "reward": 0.04753436893224716, "reward_std": 0.04605057090520859, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.004204855766147375, "rewards/logprob_reward/std": 0.008981630206108093, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 655.125, "completions/mean_terminated_length": 655.125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.941358024691358, "grad_norm": 2.2185380658297977, "kl": 0.2635498046875, "learning_rate": 1.9179666097728982e-07, "loss": -0.1516, "num_tokens": 26884268.0, "reward": 0.02021958865225315, "reward_std": 0.03220298886299133, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.001632875413633883, "rewards/logprob_reward/std": 0.0028763532172888517, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 663.875, "completions/mean_terminated_length": 652.258056640625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 2.9444444444444446, "grad_norm": 2.514627285805861, "kl": 0.2249755859375, "learning_rate": 1.9131058000192726e-07, "loss": -0.1363, "num_tokens": 26911888.0, "reward": 0.044653862714767456, "reward_std": 0.0406486913561821, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0010042909998446703, "rewards/logprob_reward/std": 0.00209109578281641, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 658.3125, "completions/mean_terminated_length": 646.51611328125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 2.947530864197531, "grad_norm": 2.402558732506387, "kl": 0.22216796875, "learning_rate": 1.9082473372329983e-07, "loss": -0.1917, "num_tokens": 26939318.0, "reward": 0.035541120916604996, "reward_std": 0.02778727188706398, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0012956904247403145, "rewards/logprob_reward/std": 0.0030140208546072245, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 624.5, "completions/mean_terminated_length": 624.5, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 2.950617283950617, "grad_norm": 2.3041716856268715, "kl": 0.2430419921875, "learning_rate": 1.903391240842882e-07, "loss": -0.352, "num_tokens": 26965298.0, "reward": 0.04536670073866844, "reward_std": 0.04791241139173508, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017963348655030131, "rewards/logprob_reward/std": 0.0029619657434523106, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 740.0, "completions/mean_terminated_length": 710.6206665039062, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 2.9537037037037037, "grad_norm": 2.511745953771838, "kl": 0.20751953125, "learning_rate": 1.8985375302682654e-07, "loss": -0.2179, "num_tokens": 26995682.0, "reward": 0.034126099199056625, "reward_std": 0.04123295471072197, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.003195664379745722, "rewards/logprob_reward/std": 0.008202649652957916, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 638.4375, "completions/mean_terminated_length": 638.4375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 2.9567901234567904, "grad_norm": 2.0066421954219074, "kl": 0.2579345703125, "learning_rate": 1.8936862249189515e-07, "loss": -0.2235, "num_tokens": 27022396.0, "reward": 0.036057278513908386, "reward_std": 0.03543171286582947, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001869198982603848, "rewards/logprob_reward/std": 0.00807044468820095, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 726.6875, "completions/mean_terminated_length": 706.86669921875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 2.9598765432098766, "grad_norm": 2.5928406908012582, "kl": 0.210205078125, "learning_rate": 1.8888373441951228e-07, "loss": -0.2476, "num_tokens": 27052470.0, "reward": 0.016658205538988113, "reward_std": 0.02684372290968895, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0011480040848255157, "rewards/logprob_reward/std": 0.0028190447483211756, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 733.375, "completions/mean_terminated_length": 714.0000610351562, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 2.962962962962963, "grad_norm": 2.2541048802611576, "kl": 0.21533203125, "learning_rate": 1.8839909074872675e-07, "loss": -0.2485, "num_tokens": 27082594.0, "reward": 0.03550969064235687, "reward_std": 0.04626007378101349, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0012607639655470848, "rewards/logprob_reward/std": 0.0024260045029222965, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 702.96875, "completions/mean_terminated_length": 681.5667114257812, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 2.9660493827160495, "grad_norm": 1.8861370870638634, "kl": 0.2171630859375, "learning_rate": 1.8791469341761e-07, "loss": -0.137, "num_tokens": 27111269.0, "reward": 0.04205770790576935, "reward_std": 0.041784629225730896, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015919003635644913, "rewards/logprob_reward/std": 0.0029006951954215765, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 694.3125, "completions/mean_terminated_length": 694.3125, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 2.9691358024691357, "grad_norm": 2.4936754152100336, "kl": 0.2056884765625, "learning_rate": 1.8743054436324835e-07, "loss": -0.4614, "num_tokens": 27140059.0, "reward": 0.03833600506186485, "reward_std": 0.047480180859565735, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0009288942092098296, "rewards/logprob_reward/std": 0.0017160874558612704, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 686.21875, "completions/mean_terminated_length": 686.21875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 2.9722222222222223, "grad_norm": 2.1449705143275857, "kl": 0.21044921875, "learning_rate": 1.8694664552173529e-07, "loss": -0.0667, "num_tokens": 27168474.0, "reward": 0.0475720576941967, "reward_std": 0.03860652074217796, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0007745065959170461, "rewards/logprob_reward/std": 0.0016013880958780646, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 712.625, "completions/mean_terminated_length": 680.413818359375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 2.9753086419753085, "grad_norm": 3.0540231894763217, "kl": 0.2491455078125, "learning_rate": 1.8646299882816358e-07, "loss": -0.2756, "num_tokens": 27197718.0, "reward": 0.04453630745410919, "reward_std": 0.04714227467775345, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008736720774322748, "rewards/logprob_reward/std": 0.0018073354149237275, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 679.3125, "completions/mean_terminated_length": 679.3125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 2.978395061728395, "grad_norm": 2.0565833101266557, "kl": 0.213623046875, "learning_rate": 1.859796062166178e-07, "loss": -0.081, "num_tokens": 27226264.0, "reward": 0.046038296073675156, "reward_std": 0.03503313288092613, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002542553935199976, "rewards/logprob_reward/std": 0.004069070797413588, "step": 965 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 752.90625, "completions/mean_terminated_length": 724.862060546875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 2.9814814814814814, "grad_norm": 1.7593721687065136, "kl": NaN, "learning_rate": 1.854964696201666e-07, "loss": -0.0906, "num_tokens": 27257273.0, "reward": 0.025743938982486725, "reward_std": 0.03171555697917938, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0008265995420515537, "rewards/logprob_reward/std": 0.0023957984521985054, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 655.25, "completions/mean_terminated_length": 655.25, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 2.984567901234568, "grad_norm": 1.8427884565955073, "kl": 0.2392578125, "learning_rate": 1.850135909708544e-07, "loss": -0.0342, "num_tokens": 27284561.0, "reward": 0.04149714112281799, "reward_std": 0.046559251844882965, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0009690446313470602, "rewards/logprob_reward/std": 0.002075807424262166, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 639.34375, "completions/mean_terminated_length": 639.34375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 2.9876543209876543, "grad_norm": 2.3890344483833665, "kl": 0.2347412109375, "learning_rate": 1.8453097219969448e-07, "loss": -0.1873, "num_tokens": 27311436.0, "reward": 0.042143747210502625, "reward_std": 0.04867733269929886, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0016874989960342646, "rewards/logprob_reward/std": 0.0028819881845265627, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 701.59375, "completions/mean_terminated_length": 691.1935424804688, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 2.9907407407407405, "grad_norm": 2.2579359892038533, "kl": 0.2457275390625, "learning_rate": 1.8404861523666073e-07, "loss": -0.1065, "num_tokens": 27340667.0, "reward": 0.032514940947294235, "reward_std": 0.04773382470011711, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014054876519367099, "rewards/logprob_reward/std": 0.0029887219425290823, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 641.875, "completions/mean_terminated_length": 629.54833984375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 2.993827160493827, "grad_norm": 2.759877169084418, "kl": 0.2598876953125, "learning_rate": 1.8356652201068024e-07, "loss": -0.2884, "num_tokens": 27367331.0, "reward": 0.04795064032077789, "reward_std": 0.03241584450006485, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001195153803564608, "rewards/logprob_reward/std": 0.0020508323796093464, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 690.9375, "completions/mean_terminated_length": 690.9375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 2.996913580246914, "grad_norm": 2.0833704877638564, "kl": 0.216796875, "learning_rate": 1.830846944496251e-07, "loss": -0.1624, "num_tokens": 27395669.0, "reward": 0.04163329675793648, "reward_std": 0.03980160504579544, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0011203312315046787, "rewards/logprob_reward/std": 0.0018936976557597518, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 757.625, "completions/mean_terminated_length": 708.2963256835938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.0, "grad_norm": 1.676104430337265, "kl": 0.2392578125, "learning_rate": 1.826031344803053e-07, "loss": -0.0363, "num_tokens": 27426409.0, "reward": 0.03207068517804146, "reward_std": 0.04026733711361885, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0009118721354752779, "rewards/logprob_reward/std": 0.0016733306692913175, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 727.25, "completions/mean_terminated_length": 684.857177734375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.003086419753086, "grad_norm": 2.085222805406508, "kl": 0.219482421875, "learning_rate": 1.8212184402846064e-07, "loss": -0.0639, "num_tokens": 27456689.0, "reward": 0.02974565513432026, "reward_std": 0.03855707868933678, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0018007296603173018, "rewards/logprob_reward/std": 0.00419599749147892, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 725.6875, "completions/mean_terminated_length": 683.0714721679688, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 3.006172839506173, "grad_norm": 2.1373416146080815, "kl": 0.2381591796875, "learning_rate": 1.8164082501875326e-07, "loss": -0.1113, "num_tokens": 27486407.0, "reward": 0.01286717876791954, "reward_std": 0.025322534143924713, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.0004079767386429012, "rewards/logprob_reward/std": 0.0014256259892135859, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 692.15625, "completions/mean_terminated_length": 692.15625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.009259259259259, "grad_norm": 2.240096866392538, "kl": 0.224609375, "learning_rate": 1.8116007937475947e-07, "loss": -0.1386, "num_tokens": 27515024.0, "reward": 0.04009921848773956, "reward_std": 0.05619942396879196, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002888019662350416, "rewards/logprob_reward/std": 0.008251198567450047, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 688.0625, "completions/mean_terminated_length": 677.2257690429688, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 3.0123456790123457, "grad_norm": 2.0895995813476627, "kl": 0.21240234375, "learning_rate": 1.8067960901896278e-07, "loss": -0.2006, "num_tokens": 27543946.0, "reward": 0.02907356433570385, "reward_std": 0.03428079932928085, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0010539607610553503, "rewards/logprob_reward/std": 0.002084154635667801, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 711.6875, "completions/mean_terminated_length": 701.6128540039062, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 3.015432098765432, "grad_norm": 2.002426319069105, "kl": 0.2469482421875, "learning_rate": 1.8019941587274565e-07, "loss": -0.2033, "num_tokens": 27573084.0, "reward": 0.03858362138271332, "reward_std": 0.04214697331190109, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012040241854265332, "rewards/logprob_reward/std": 0.001954335253685713, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 648.125, "completions/mean_terminated_length": 648.125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 3.0185185185185186, "grad_norm": 2.352263155111765, "kl": 0.231201171875, "learning_rate": 1.7971950185638195e-07, "loss": -0.1319, "num_tokens": 27599792.0, "reward": 0.0429069846868515, "reward_std": 0.03892037644982338, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0025355410762131214, "rewards/logprob_reward/std": 0.0033677336759865284, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 772.90625, "completions/mean_terminated_length": 714.9615478515625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 3.021604938271605, "grad_norm": 2.8889595705531117, "kl": 0.206298828125, "learning_rate": 1.7923986888902948e-07, "loss": -0.1802, "num_tokens": 27630777.0, "reward": 0.03198745846748352, "reward_std": 0.03233847767114639, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.000819399137981236, "rewards/logprob_reward/std": 0.0019720583222806454, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 716.96875, "completions/mean_terminated_length": 660.1111450195312, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.0246913580246915, "grad_norm": 2.0156040990052833, "kl": 0.241943359375, "learning_rate": 1.78760518888722e-07, "loss": -0.1204, "num_tokens": 27660368.0, "reward": 0.03233620151877403, "reward_std": 0.027743151411414146, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0012068887008354068, "rewards/logprob_reward/std": 0.0026189556811004877, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 631.84375, "completions/mean_terminated_length": 619.1935424804688, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 3.0277777777777777, "grad_norm": 1.9160075974092692, "kl": 0.2626953125, "learning_rate": 1.782814537723617e-07, "loss": -0.0237, "num_tokens": 27686723.0, "reward": 0.05130518600344658, "reward_std": 0.03933805599808693, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001450207200832665, "rewards/logprob_reward/std": 0.0026596777606755495, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 738.125, "completions/mean_terminated_length": 708.5516967773438, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 3.0308641975308643, "grad_norm": 3.7638909857862006, "kl": 0.2254638671875, "learning_rate": 1.7780267545571175e-07, "loss": -0.2725, "num_tokens": 27716895.0, "reward": 0.04446043074131012, "reward_std": 0.04653332382440567, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0007893663714639843, "rewards/logprob_reward/std": 0.002341961720958352, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 690.03125, "completions/mean_terminated_length": 679.258056640625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 3.0339506172839505, "grad_norm": 2.510787819377592, "kl": 0.2392578125, "learning_rate": 1.7732418585338804e-07, "loss": -0.1958, "num_tokens": 27745608.0, "reward": 0.035718273371458054, "reward_std": 0.044114019721746445, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014925259165465832, "rewards/logprob_reward/std": 0.0029040889348834753, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 751.5625, "completions/mean_terminated_length": 723.3793334960938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.037037037037037, "grad_norm": 2.3752642757269244, "kl": 0.2181396484375, "learning_rate": 1.7684598687885216e-07, "loss": -0.2773, "num_tokens": 27776058.0, "reward": 0.0384933166205883, "reward_std": 0.04638366401195526, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001103683258406818, "rewards/logprob_reward/std": 0.002340720733627677, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 672.15625, "completions/mean_terminated_length": 660.8064575195312, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 3.0401234567901234, "grad_norm": 2.0222497224829206, "kl": 0.2366943359375, "learning_rate": 1.7636808044440344e-07, "loss": -0.1634, "num_tokens": 27804003.0, "reward": 0.051867835223674774, "reward_std": 0.04846703261137009, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0020753692369908094, "rewards/logprob_reward/std": 0.004790341481566429, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 719.40625, "completions/mean_terminated_length": 699.1000366210938, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 3.04320987654321, "grad_norm": 2.5950314072937557, "kl": 0.223388671875, "learning_rate": 1.7589046846117132e-07, "loss": -0.2739, "num_tokens": 27833460.0, "reward": 0.045012228190898895, "reward_std": 0.047928906977176666, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0014024768024682999, "rewards/logprob_reward/std": 0.0025697017554193735, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 684.78125, "completions/mean_terminated_length": 662.1666870117188, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 3.0462962962962963, "grad_norm": 2.3693857370894746, "kl": 0.2379150390625, "learning_rate": 1.754131528391078e-07, "loss": -0.2839, "num_tokens": 27861725.0, "reward": 0.04861406981945038, "reward_std": 0.049128152430057526, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019322949228808284, "rewards/logprob_reward/std": 0.0037612488958984613, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 663.03125, "completions/mean_terminated_length": 638.9666748046875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 3.049382716049383, "grad_norm": 2.3964764203097415, "kl": 0.2425537109375, "learning_rate": 1.7493613548697966e-07, "loss": -0.1999, "num_tokens": 27889606.0, "reward": 0.044594209641218185, "reward_std": 0.04765457659959793, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009380142437294126, "rewards/logprob_reward/std": 0.0017897032666951418, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 706.125, "completions/mean_terminated_length": 673.2413940429688, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 3.052469135802469, "grad_norm": 3.0514295695731266, "kl": 0.2366943359375, "learning_rate": 1.744594183123611e-07, "loss": -0.2699, "num_tokens": 27919190.0, "reward": 0.0392814502120018, "reward_std": 0.043264396488666534, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0019793917890638113, "rewards/logprob_reward/std": 0.003085510805249214, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 618.75, "completions/mean_terminated_length": 618.75, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 3.0555555555555554, "grad_norm": 1.9141571541766174, "kl": 0.227783203125, "learning_rate": 1.7398300322162563e-07, "loss": -0.1182, "num_tokens": 27945302.0, "reward": 0.03834158182144165, "reward_std": 0.04912228882312775, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0009350912878289819, "rewards/logprob_reward/std": 0.0020096460357308388, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 685.78125, "completions/mean_terminated_length": 663.2333984375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 3.058641975308642, "grad_norm": 2.431587339854599, "kl": 0.2183837890625, "learning_rate": 1.7350689211993902e-07, "loss": -0.296, "num_tokens": 27973827.0, "reward": 0.03231380879878998, "reward_std": 0.03982333093881607, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0011820093495771289, "rewards/logprob_reward/std": 0.0032097590155899525, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 698.40625, "completions/mean_terminated_length": 676.7000122070312, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 3.0617283950617282, "grad_norm": 2.468386918464936, "kl": 0.22802734375, "learning_rate": 1.7303108691125107e-07, "loss": -0.1473, "num_tokens": 28002588.0, "reward": 0.03651568293571472, "reward_std": 0.03315194696187973, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0023785370867699385, "rewards/logprob_reward/std": 0.002841934096068144, "step": 992 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 818.15625, "completions/mean_terminated_length": 760.5199584960938, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 3.064814814814815, "grad_norm": 2.054287674639405, "kl": NaN, "learning_rate": 1.725555894982887e-07, "loss": -0.2509, "num_tokens": 28036217.0, "reward": 0.025921491906046867, "reward_std": 0.03331119567155838, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0010238795075565577, "rewards/logprob_reward/std": 0.0026635329704731703, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 718.1875, "completions/mean_terminated_length": 708.3225708007812, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 3.067901234567901, "grad_norm": 1.999015239923265, "kl": 0.2410888671875, "learning_rate": 1.7208040178254768e-07, "loss": -0.0784, "num_tokens": 28065959.0, "reward": 0.03385654091835022, "reward_std": 0.034333594143390656, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0028961554635316133, "rewards/logprob_reward/std": 0.005538558587431908, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 695.21875, "completions/mean_terminated_length": 695.21875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.0709876543209877, "grad_norm": 3.0853137953398457, "kl": 0.2315673828125, "learning_rate": 1.716055256642855e-07, "loss": -0.3443, "num_tokens": 28095138.0, "reward": 0.0421161986887455, "reward_std": 0.047297537326812744, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0016568891005590558, "rewards/logprob_reward/std": 0.003244540421292186, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 701.53125, "completions/mean_terminated_length": 691.1290283203125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.074074074074074, "grad_norm": 2.3533149446198283, "kl": 0.20751953125, "learning_rate": 1.711309630425135e-07, "loss": -0.364, "num_tokens": 28123683.0, "reward": 0.05214660242199898, "reward_std": 0.041597574949264526, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0023851157166063786, "rewards/logprob_reward/std": 0.008369777351617813, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 646.0625, "completions/mean_terminated_length": 620.86669921875, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.0771604938271606, "grad_norm": 2.593518711271626, "kl": 0.2825927734375, "learning_rate": 1.7065671581498936e-07, "loss": -0.2214, "num_tokens": 28150341.0, "reward": 0.02638131007552147, "reward_std": 0.032486990094184875, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0015347886364907026, "rewards/logprob_reward/std": 0.003711605677381158, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 702.03125, "completions/mean_terminated_length": 668.72412109375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 3.080246913580247, "grad_norm": 2.2021572076631157, "kl": 0.26025390625, "learning_rate": 1.701827858782095e-07, "loss": -0.1954, "num_tokens": 28178682.0, "reward": 0.029635073617100716, "reward_std": 0.02696829102933407, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0016778578283265233, "rewards/logprob_reward/std": 0.0037237918004393578, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 738.875, "completions/mean_terminated_length": 729.6773681640625, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 3.0833333333333335, "grad_norm": 2.0928816058538193, "kl": 0.2159423828125, "learning_rate": 1.697091751274016e-07, "loss": -0.1492, "num_tokens": 28209202.0, "reward": 0.041847534477710724, "reward_std": 0.04098542034626007, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0013583700638264418, "rewards/logprob_reward/std": 0.0031691337935626507, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 722.3125, "completions/mean_terminated_length": 722.3125, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 3.0864197530864197, "grad_norm": 1.712853241916546, "kl": 0.2030029296875, "learning_rate": 1.6923588545651672e-07, "loss": -0.0253, "num_tokens": 28239016.0, "reward": 0.04628859832882881, "reward_std": 0.03862706571817398, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002820664783939719, "rewards/logprob_reward/std": 0.0033508921042084694, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 738.65625, "completions/mean_terminated_length": 709.137939453125, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 3.0895061728395063, "grad_norm": 2.1370810475376314, "kl": 0.22802734375, "learning_rate": 1.687629187582221e-07, "loss": -0.1515, "num_tokens": 28269029.0, "reward": 0.012849628925323486, "reward_std": 0.025137685239315033, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.00038847571704536676, "rewards/logprob_reward/std": 0.001013743574731052, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 695.53125, "completions/mean_terminated_length": 684.9354858398438, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 3.0925925925925926, "grad_norm": 1.8238148773066314, "kl": 0.242431640625, "learning_rate": 1.6829027692389343e-07, "loss": -0.0121, "num_tokens": 28297110.0, "reward": 0.04439171403646469, "reward_std": 0.04510121047496796, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0007130148005671799, "rewards/logprob_reward/std": 0.001146567054092884, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 755.6875, "completions/mean_terminated_length": 680.5599975585938, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 3.095679012345679, "grad_norm": 1.855925683973445, "kl": 0.23095703125, "learning_rate": 1.678179618436073e-07, "loss": -0.0604, "num_tokens": 28327868.0, "reward": 0.02557981386780739, "reward_std": 0.0329207107424736, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0006442366866394877, "rewards/logprob_reward/std": 0.0016100271604955196, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 700.3125, "completions/mean_terminated_length": 678.7333984375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.0987654320987654, "grad_norm": 1.8113067304562025, "kl": 0.2279052734375, "learning_rate": 1.6734597540613344e-07, "loss": -0.101, "num_tokens": 28356466.0, "reward": 0.044626496732234955, "reward_std": 0.04724197834730148, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009738857625052333, "rewards/logprob_reward/std": 0.0015569920651614666, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 733.46875, "completions/mean_terminated_length": 703.413818359375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.1018518518518516, "grad_norm": 3.41024478756569, "kl": 0.2373046875, "learning_rate": 1.6687431949892753e-07, "loss": -0.4791, "num_tokens": 28386333.0, "reward": 0.03247016295790672, "reward_std": 0.045533549040555954, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.001355737796984613, "rewards/logprob_reward/std": 0.0025667925365269184, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 732.5, "completions/mean_terminated_length": 702.3448486328125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.1049382716049383, "grad_norm": 2.131559011515287, "kl": 0.22900390625, "learning_rate": 1.664029960081234e-07, "loss": -0.1937, "num_tokens": 28416165.0, "reward": 0.0485088974237442, "reward_std": 0.034199196845293045, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001815442810766399, "rewards/logprob_reward/std": 0.0028548906557261944, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 677.53125, "completions/mean_terminated_length": 641.6896362304688, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 3.1080246913580245, "grad_norm": 2.070636568687511, "kl": 0.23876953125, "learning_rate": 1.6593200681852574e-07, "loss": -0.1072, "num_tokens": 28444226.0, "reward": 0.035720594227313995, "reward_std": 0.04157908633351326, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014951014891266823, "rewards/logprob_reward/std": 0.0026378934271633625, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 731.34375, "completions/mean_terminated_length": 701.0689697265625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.111111111111111, "grad_norm": 1.9857547587384312, "kl": 0.232421875, "learning_rate": 1.6546135381360194e-07, "loss": -0.1387, "num_tokens": 28474009.0, "reward": 0.03834307938814163, "reward_std": 0.03995034843683243, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0009367556776851416, "rewards/logprob_reward/std": 0.0020925123244524, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 737.125, "completions/mean_terminated_length": 718.0000610351562, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.1141975308641974, "grad_norm": 4.697781797002306, "kl": 0.2415771484375, "learning_rate": 1.6499103887547544e-07, "loss": -0.2556, "num_tokens": 28504245.0, "reward": 0.0329495370388031, "reward_std": 0.040524132549762726, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0018883757293224335, "rewards/logprob_reward/std": 0.003306453814730048, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 749.78125, "completions/mean_terminated_length": 721.413818359375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 3.117283950617284, "grad_norm": 1.8396823859155615, "kl": 0.202880859375, "learning_rate": 1.6452106388491762e-07, "loss": -0.3582, "num_tokens": 28535226.0, "reward": 0.03948305547237396, "reward_std": 0.04296427220106125, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002203392330557108, "rewards/logprob_reward/std": 0.004145064856857061, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 717.28125, "completions/mean_terminated_length": 696.8333740234375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 3.1203703703703702, "grad_norm": 2.3819930186029965, "kl": 0.24072265625, "learning_rate": 1.6405143072134031e-07, "loss": -0.1811, "num_tokens": 28564647.0, "reward": 0.04897215962409973, "reward_std": 0.04186669737100601, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002330177929252386, "rewards/logprob_reward/std": 0.004359105136245489, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 728.59375, "completions/mean_terminated_length": 708.9000244140625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 3.123456790123457, "grad_norm": 2.987956025095811, "kl": 0.238525390625, "learning_rate": 1.6358214126278855e-07, "loss": -0.4203, "num_tokens": 28594550.0, "reward": 0.03603680804371834, "reward_std": 0.027981655672192574, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0018464522436261177, "rewards/logprob_reward/std": 0.003124150214716792, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 685.65625, "completions/mean_terminated_length": 637.3214721679688, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 3.126543209876543, "grad_norm": 1.7103195077043507, "kl": 0.25830078125, "learning_rate": 1.6311319738593281e-07, "loss": -0.0873, "num_tokens": 28622875.0, "reward": 0.04240027815103531, "reward_std": 0.02873365208506584, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00197252887301147, "rewards/logprob_reward/std": 0.0038132520858198404, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 701.125, "completions/mean_terminated_length": 690.7096557617188, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 3.1296296296296298, "grad_norm": 2.042134028680897, "kl": 0.235107421875, "learning_rate": 1.6264460096606169e-07, "loss": -0.275, "num_tokens": 28651531.0, "reward": 0.04480944573879242, "reward_std": 0.04709518328309059, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0011771632125601172, "rewards/logprob_reward/std": 0.0018997747683897614, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 683.0625, "completions/mean_terminated_length": 660.3333740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.132716049382716, "grad_norm": 2.3774079561933714, "kl": 0.236572265625, "learning_rate": 1.621763538770743e-07, "loss": -0.195, "num_tokens": 28679949.0, "reward": 0.016525961458683014, "reward_std": 0.026975713670253754, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.001001067110337317, "rewards/logprob_reward/std": 0.0017371024005115032, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 773.75, "completions/mean_terminated_length": 738.0000610351562, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 3.1358024691358026, "grad_norm": 2.4172906761602793, "kl": 0.1939697265625, "learning_rate": 1.6170845799147266e-07, "loss": -0.2543, "num_tokens": 28711293.0, "reward": 0.02256914973258972, "reward_std": 0.039598651230335236, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0007712763035669923, "rewards/logprob_reward/std": 0.0014853577595204115, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 745.03125, "completions/mean_terminated_length": 680.6538696289062, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 3.138888888888889, "grad_norm": 2.2310177139275753, "kl": 0.211669921875, "learning_rate": 1.6124091518035443e-07, "loss": -0.2689, "num_tokens": 28741122.0, "reward": 0.0354393795132637, "reward_std": 0.051835887134075165, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011826427653431892, "rewards/logprob_reward/std": 0.0019029693212360144, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 700.5625, "completions/mean_terminated_length": 679.0000610351562, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 3.1419753086419755, "grad_norm": 3.6628987143442293, "kl": 0.253662109375, "learning_rate": 1.607737273134054e-07, "loss": -0.2316, "num_tokens": 28769980.0, "reward": 0.05108967423439026, "reward_std": 0.04814573749899864, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012107489164918661, "rewards/logprob_reward/std": 0.0027121107559651136, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 754.25, "completions/mean_terminated_length": 726.3448486328125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 3.1450617283950617, "grad_norm": 2.639872367878862, "kl": 0.22802734375, "learning_rate": 1.603068962588918e-07, "loss": -0.3494, "num_tokens": 28800628.0, "reward": 0.02560967206954956, "reward_std": 0.04000038653612137, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0006774109788239002, "rewards/logprob_reward/std": 0.0017976548988372087, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 698.03125, "completions/mean_terminated_length": 664.3103637695312, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.148148148148148, "grad_norm": 2.3527739878713354, "kl": 0.2325439453125, "learning_rate": 1.598404238836532e-07, "loss": -0.1972, "num_tokens": 28829805.0, "reward": 0.047809161245822906, "reward_std": 0.047749098390340805, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0010379606392234564, "rewards/logprob_reward/std": 0.0016769595677033067, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 760.4375, "completions/mean_terminated_length": 742.86669921875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 3.1512345679012346, "grad_norm": 2.0403487129467446, "kl": 0.2138671875, "learning_rate": 1.5937431205309465e-07, "loss": -0.1521, "num_tokens": 28861047.0, "reward": 0.03904839605093002, "reward_std": 0.038639068603515625, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0017204422038048506, "rewards/logprob_reward/std": 0.0029273114632815123, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 760.4375, "completions/mean_terminated_length": 742.86669921875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 3.154320987654321, "grad_norm": 1.7217313677726709, "kl": 0.245849609375, "learning_rate": 1.589085626311795e-07, "loss": -0.0634, "num_tokens": 28891781.0, "reward": 0.03565390780568123, "reward_std": 0.03870029002428055, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014210063964128494, "rewards/logprob_reward/std": 0.0025313065852969885, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 731.03125, "completions/mean_terminated_length": 700.72412109375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 3.1574074074074074, "grad_norm": 2.2566933995265686, "kl": 0.2421875, "learning_rate": 1.5844317748042167e-07, "loss": -0.1424, "num_tokens": 28921954.0, "reward": 0.04573764652013779, "reward_std": 0.03755033761262894, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0022084980737417936, "rewards/logprob_reward/std": 0.003018285147845745, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 775.8125, "completions/mean_terminated_length": 718.5385131835938, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 3.1604938271604937, "grad_norm": 1.9696639810162537, "kl": 0.2086181640625, "learning_rate": 1.5797815846187868e-07, "loss": -0.2129, "num_tokens": 28953320.0, "reward": 0.0224867295473814, "reward_std": 0.03390219807624817, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0006796991219744086, "rewards/logprob_reward/std": 0.0016114730387926102, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 718.1875, "completions/mean_terminated_length": 718.1875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 3.1635802469135803, "grad_norm": 2.1648531153637016, "kl": 0.2083740234375, "learning_rate": 1.575135074351435e-07, "loss": -0.1379, "num_tokens": 28982930.0, "reward": 0.034196820110082626, "reward_std": 0.03071579523384571, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0032742430921643972, "rewards/logprob_reward/std": 0.0059987688437104225, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 736.125, "completions/mean_terminated_length": 706.3448486328125, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 3.1666666666666665, "grad_norm": 2.182235340928639, "kl": 0.2423095703125, "learning_rate": 1.5704922625833784e-07, "loss": -0.1535, "num_tokens": 29013358.0, "reward": 0.022522062063217163, "reward_std": 0.027043446898460388, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0007189557072706521, "rewards/logprob_reward/std": 0.0018420717678964138, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 712.375, "completions/mean_terminated_length": 680.137939453125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 3.169753086419753, "grad_norm": 2.301605215830569, "kl": 0.226806640625, "learning_rate": 1.565853167881042e-07, "loss": -0.2366, "num_tokens": 29042442.0, "reward": 0.041161030530929565, "reward_std": 0.040761448442935944, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0005955874803476036, "rewards/logprob_reward/std": 0.0013528020353987813, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 690.78125, "completions/mean_terminated_length": 668.5667114257812, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 3.1728395061728394, "grad_norm": 1.7288588696954923, "kl": 0.2275390625, "learning_rate": 1.5612178087959887e-07, "loss": -0.0607, "num_tokens": 29070931.0, "reward": 0.05454988032579422, "reward_std": 0.03911234438419342, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0015832000644877553, "rewards/logprob_reward/std": 0.0027248659171164036, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 675.4375, "completions/mean_terminated_length": 664.1935424804688, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 3.175925925925926, "grad_norm": 2.007448995328529, "kl": 0.25830078125, "learning_rate": 1.556586203864841e-07, "loss": -0.237, "num_tokens": 29098761.0, "reward": 0.03903896361589432, "reward_std": 0.04599328339099884, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0017099580727517605, "rewards/logprob_reward/std": 0.0027005146257579327, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 767.78125, "completions/mean_terminated_length": 731.1785888671875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.1790123456790123, "grad_norm": 2.224659062172624, "kl": 0.2608642578125, "learning_rate": 1.5519583716092077e-07, "loss": -0.1733, "num_tokens": 29130854.0, "reward": 0.02665591612458229, "reward_std": 0.033299244940280914, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0018399073742330074, "rewards/logprob_reward/std": 0.003487040288746357, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 797.25, "completions/mean_terminated_length": 782.1333618164062, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 3.182098765432099, "grad_norm": 2.2077104602188977, "kl": 0.1951904296875, "learning_rate": 1.5473343305356136e-07, "loss": -0.1501, "num_tokens": 29163074.0, "reward": 0.03273075073957443, "reward_std": 0.04101334884762764, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0016452809795737267, "rewards/logprob_reward/std": 0.003429353702813387, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 682.65625, "completions/mean_terminated_length": 671.6451416015625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 3.185185185185185, "grad_norm": 2.987474498192784, "kl": 0.22998046875, "learning_rate": 1.5427140991354215e-07, "loss": -0.3532, "num_tokens": 29191467.0, "reward": 0.016893979161977768, "reward_std": 0.022237852215766907, "rewards/format_reward_func/mean": 0.15625, "rewards/format_reward_func/std": 0.3689020276069641, "rewards/logprob_reward/mean": 0.0014099783729761839, "rewards/logprob_reward/std": 0.0024758463259786367, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 713.59375, "completions/mean_terminated_length": 681.4827270507812, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 3.1882716049382718, "grad_norm": 2.205118304246579, "kl": 0.22509765625, "learning_rate": 1.5380976958847572e-07, "loss": -0.2685, "num_tokens": 29220790.0, "reward": 0.038550931960344315, "reward_std": 0.04296690225601196, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0011676998110488057, "rewards/logprob_reward/std": 0.0033502120058983564, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 830.875, "completions/mean_terminated_length": 766.5, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 3.191358024691358, "grad_norm": 2.122485568618202, "kl": 0.2191162109375, "learning_rate": 1.5334851392444412e-07, "loss": -0.2083, "num_tokens": 29254654.0, "reward": 0.03013978898525238, "reward_std": 0.04280904680490494, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0022386545315384865, "rewards/logprob_reward/std": 0.003554455004632473, "step": 1034 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 736.375, "completions/mean_terminated_length": 717.2000122070312, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 3.1944444444444446, "grad_norm": 2.1343672273445065, "kl": NaN, "learning_rate": 1.5288764476599102e-07, "loss": -0.1414, "num_tokens": 29284462.0, "reward": 0.028765369206666946, "reward_std": 0.033886875957250595, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0007115213084034622, "rewards/logprob_reward/std": 0.0022839908488094807, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 697.90625, "completions/mean_terminated_length": 676.1666870117188, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 3.197530864197531, "grad_norm": 3.3306503208284592, "kl": 0.2279052734375, "learning_rate": 1.524271639561145e-07, "loss": -0.2411, "num_tokens": 29313527.0, "reward": 0.050074078142642975, "reward_std": 0.03632235527038574, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003554530907422304, "rewards/logprob_reward/std": 0.006470814347267151, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 710.0625, "completions/mean_terminated_length": 677.586181640625, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 3.200617283950617, "grad_norm": 2.07390040032122, "kl": 0.2005615234375, "learning_rate": 1.5196707333625959e-07, "loss": -0.1393, "num_tokens": 29342629.0, "reward": 0.03282039985060692, "reward_std": 0.04835759848356247, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0017448903527110815, "rewards/logprob_reward/std": 0.0045485785230994225, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 750.8125, "completions/mean_terminated_length": 742.0, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.2037037037037037, "grad_norm": 1.887487126054076, "kl": 0.2208251953125, "learning_rate": 1.5150737474631092e-07, "loss": -0.1085, "num_tokens": 29372939.0, "reward": 0.04539516195654869, "reward_std": 0.049080513417720795, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001827956992201507, "rewards/logprob_reward/std": 0.0025081937201321125, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 685.28125, "completions/mean_terminated_length": 685.28125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 3.20679012345679, "grad_norm": 2.1564246168720596, "kl": 0.2523193359375, "learning_rate": 1.5104807002458564e-07, "loss": -0.0752, "num_tokens": 29400848.0, "reward": 0.04461712762713432, "reward_std": 0.05231209099292755, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009634761372581124, "rewards/logprob_reward/std": 0.0015501893358305097, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 699.9375, "completions/mean_terminated_length": 689.4838256835938, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.2098765432098766, "grad_norm": 2.0727946114675793, "kl": 0.2550048828125, "learning_rate": 1.5058916100782555e-07, "loss": -0.146, "num_tokens": 29429434.0, "reward": 0.03488758206367493, "reward_std": 0.04631597176194191, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0005695360014215112, "rewards/logprob_reward/std": 0.0011281168553978205, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 651.09375, "completions/mean_terminated_length": 651.09375, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 3.212962962962963, "grad_norm": 1.7391712881031045, "kl": 0.2294921875, "learning_rate": 1.5013064953119036e-07, "loss": -0.1336, "num_tokens": 29456773.0, "reward": 0.04492807388305664, "reward_std": 0.040203481912612915, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0013089715503156185, "rewards/logprob_reward/std": 0.0030327115673571825, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 648.84375, "completions/mean_terminated_length": 636.741943359375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.2160493827160495, "grad_norm": 2.5413915392621638, "kl": 0.23828125, "learning_rate": 1.4967253742824962e-07, "loss": -0.122, "num_tokens": 29483516.0, "reward": 0.05759654939174652, "reward_std": 0.047098174691200256, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0014961676206439734, "rewards/logprob_reward/std": 0.002962657017633319, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 712.5625, "completions/mean_terminated_length": 668.0714721679688, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 3.2191358024691357, "grad_norm": 1.930893478817174, "kl": 0.2281494140625, "learning_rate": 1.4921482653097614e-07, "loss": -0.1961, "num_tokens": 29512698.0, "reward": 0.029292644932866096, "reward_std": 0.027006058022379875, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0012973814737051725, "rewards/logprob_reward/std": 0.0025804650504142046, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 655.09375, "completions/mean_terminated_length": 655.09375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 3.2222222222222223, "grad_norm": 2.4608711727614896, "kl": 0.23681640625, "learning_rate": 1.487575186697381e-07, "loss": -0.2578, "num_tokens": 29539953.0, "reward": 0.050926871597766876, "reward_std": 0.0479266420006752, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0010298553388565779, "rewards/logprob_reward/std": 0.0023105530999600887, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 729.4375, "completions/mean_terminated_length": 709.800048828125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 3.2253086419753085, "grad_norm": 2.16844550061594, "kl": 0.2325439453125, "learning_rate": 1.4830061567329223e-07, "loss": -0.2563, "num_tokens": 29569231.0, "reward": 0.034893713891506195, "reward_std": 0.03588524088263512, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0005763471708633006, "rewards/logprob_reward/std": 0.0013616306241601706, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 753.0625, "completions/mean_terminated_length": 744.3225708007812, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 3.228395061728395, "grad_norm": 2.2636305900387246, "kl": 0.228759765625, "learning_rate": 1.4784411936877596e-07, "loss": -0.007, "num_tokens": 29600005.0, "reward": 0.036059677600860596, "reward_std": 0.04184982180595398, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0018718604696914554, "rewards/logprob_reward/std": 0.002798425266519189, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 811.0, "completions/mean_terminated_length": 771.5555419921875, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.2314814814814814, "grad_norm": 2.014576773630118, "kl": 0.21826171875, "learning_rate": 1.4738803158170043e-07, "loss": -0.0861, "num_tokens": 29632581.0, "reward": 0.031759947538375854, "reward_std": 0.04669643193483353, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.00751105509698391, "rewards/logprob_reward/std": 0.020503010600805283, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 748.21875, "completions/mean_terminated_length": 708.8214721679688, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 3.234567901234568, "grad_norm": 3.8583906524450953, "kl": 0.20849609375, "learning_rate": 1.469323541359433e-07, "loss": -0.4765, "num_tokens": 29662948.0, "reward": 0.038361284881830215, "reward_std": 0.047246262431144714, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0009569848771207035, "rewards/logprob_reward/std": 0.0018392475321888924, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 747.96875, "completions/mean_terminated_length": 719.413818359375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.2376543209876543, "grad_norm": 1.7135235071658506, "kl": 0.2265625, "learning_rate": 1.4647708885374105e-07, "loss": -0.0541, "num_tokens": 29693435.0, "reward": 0.019259363412857056, "reward_std": 0.03267936408519745, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0005659585003741086, "rewards/logprob_reward/std": 0.0018487462075427175, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 699.5625, "completions/mean_terminated_length": 666.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.240740740740741, "grad_norm": 2.0112628842864004, "kl": 0.225830078125, "learning_rate": 1.4602223755568212e-07, "loss": -0.0747, "num_tokens": 29721841.0, "reward": 0.030123358592391014, "reward_std": 0.03439383953809738, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0022203982807695866, "rewards/logprob_reward/std": 0.004442084114998579, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 751.3125, "completions/mean_terminated_length": 723.1034545898438, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 3.243827160493827, "grad_norm": 2.0339949808550184, "kl": 0.2362060546875, "learning_rate": 1.4556780206069925e-07, "loss": -0.1813, "num_tokens": 29752639.0, "reward": 0.036101654171943665, "reward_std": 0.0347515270113945, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0019185070414096117, "rewards/logprob_reward/std": 0.003167710965499282, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 671.9375, "completions/mean_terminated_length": 606.74072265625, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.246913580246914, "grad_norm": 1.8769141810596117, "kl": 0.25146484375, "learning_rate": 1.4511378418606272e-07, "loss": -0.013, "num_tokens": 29780501.0, "reward": 0.05136559158563614, "reward_std": 0.027284620329737663, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0015173237770795822, "rewards/logprob_reward/std": 0.0020197611302137375, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 692.25, "completions/mean_terminated_length": 670.1333618164062, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 3.25, "grad_norm": 2.0115822847084726, "kl": 0.2269287109375, "learning_rate": 1.4466018574737236e-07, "loss": -0.1848, "num_tokens": 29808813.0, "reward": 0.051165372133255005, "reward_std": 0.053093306720256805, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012948578223586082, "rewards/logprob_reward/std": 0.0020043007098138332, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 736.6875, "completions/mean_terminated_length": 683.4815063476562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 3.253086419753086, "grad_norm": 1.7710297363346417, "kl": 0.227294921875, "learning_rate": 1.4420700855855093e-07, "loss": -0.1326, "num_tokens": 29838807.0, "reward": 0.04775507003068924, "reward_std": 0.05113762617111206, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.000977854710072279, "rewards/logprob_reward/std": 0.0016442429041489959, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 706.40625, "completions/mean_terminated_length": 685.2333984375, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.256172839506173, "grad_norm": 3.2144726880549355, "kl": 0.22900390625, "learning_rate": 1.4375425443183675e-07, "loss": -0.3769, "num_tokens": 29867828.0, "reward": 0.039484698325395584, "reward_std": 0.05072614923119545, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0022052209824323654, "rewards/logprob_reward/std": 0.0035095112398266792, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 697.84375, "completions/mean_terminated_length": 687.3225708007812, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.259259259259259, "grad_norm": 2.187334941470878, "kl": 0.2265625, "learning_rate": 1.43301925177776e-07, "loss": -0.0828, "num_tokens": 29896563.0, "reward": 0.04186723381280899, "reward_std": 0.043973658233881, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001380261266604066, "rewards/logprob_reward/std": 0.001648860750719905, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 728.71875, "completions/mean_terminated_length": 698.1724243164062, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 3.2623456790123457, "grad_norm": 1.792696979967954, "kl": 0.2396240234375, "learning_rate": 1.4285002260521617e-07, "loss": -0.1215, "num_tokens": 29926834.0, "reward": 0.04456322640180588, "reward_std": 0.028252314776182175, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009035864495672286, "rewards/logprob_reward/std": 0.0018524707993492484, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 702.6875, "completions/mean_terminated_length": 681.2667236328125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.265432098765432, "grad_norm": 1.8841740076052507, "kl": 0.3082275390625, "learning_rate": 1.4239854852129807e-07, "loss": 0.0148, "num_tokens": 29955612.0, "reward": 0.057417526841163635, "reward_std": 0.039684195071458817, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0012972512049600482, "rewards/logprob_reward/std": 0.0016004204517230392, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 727.03125, "completions/mean_terminated_length": 672.0370483398438, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 3.2685185185185186, "grad_norm": 1.9088485363841312, "kl": 0.2410888671875, "learning_rate": 1.419475047314493e-07, "loss": -0.129, "num_tokens": 29985173.0, "reward": 0.03619737550616264, "reward_std": 0.04631843417882919, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0020248633809387684, "rewards/logprob_reward/std": 0.003684094874188304, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 761.15625, "completions/mean_terminated_length": 743.6333618164062, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.271604938271605, "grad_norm": 2.005587177258118, "kl": 0.21630859375, "learning_rate": 1.4149689303937662e-07, "loss": -0.0956, "num_tokens": 30016106.0, "reward": 0.041586898267269135, "reward_std": 0.04829026758670807, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0010687728645280004, "rewards/logprob_reward/std": 0.002006959868595004, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 722.34375, "completions/mean_terminated_length": 712.6128540039062, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 3.2746913580246915, "grad_norm": 1.94952303374902, "kl": 0.2384033203125, "learning_rate": 1.4104671524705892e-07, "loss": -0.1065, "num_tokens": 30045733.0, "reward": 0.060765333473682404, "reward_std": 0.05329817533493042, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015448182821273804, "rewards/logprob_reward/std": 0.0021833241917192936, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 700.15625, "completions/mean_terminated_length": 689.7096557617188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.2777777777777777, "grad_norm": 1.9105227496199337, "kl": 0.2269287109375, "learning_rate": 1.4059697315473988e-07, "loss": -0.1209, "num_tokens": 30074278.0, "reward": 0.035946477204561234, "reward_std": 0.033595044165849686, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0017460857052356005, "rewards/logprob_reward/std": 0.00294461939483881, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 696.34375, "completions/mean_terminated_length": 685.774169921875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.2808641975308643, "grad_norm": 3.7925798183141866, "kl": 0.2171630859375, "learning_rate": 1.4014766856092081e-07, "loss": -0.3987, "num_tokens": 30102937.0, "reward": 0.029342934489250183, "reward_std": 0.03597993403673172, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0013532602461054921, "rewards/logprob_reward/std": 0.0027918131090700626, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 772.53125, "completions/mean_terminated_length": 714.5, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 3.2839506172839505, "grad_norm": 2.6087865978155644, "kl": 0.2313232421875, "learning_rate": 1.3969880326235362e-07, "loss": -0.3567, "num_tokens": 30134278.0, "reward": 0.03363805264234543, "reward_std": 0.04130767285823822, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002653393428772688, "rewards/logprob_reward/std": 0.0037167624104768038, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 678.21875, "completions/mean_terminated_length": 678.21875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.287037037037037, "grad_norm": 2.36546647548035, "kl": 0.2498779296875, "learning_rate": 1.3925037905403324e-07, "loss": -0.2612, "num_tokens": 30162613.0, "reward": 0.03613483905792236, "reward_std": 0.04617026448249817, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0019553746096789837, "rewards/logprob_reward/std": 0.004605346359312534, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 697.3125, "completions/mean_terminated_length": 686.774169921875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 3.2901234567901234, "grad_norm": 2.2890774861387446, "kl": 0.229248046875, "learning_rate": 1.38802397729191e-07, "loss": -0.0201, "num_tokens": 30191727.0, "reward": 0.042044222354888916, "reward_std": 0.03896318003535271, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015769141027703881, "rewards/logprob_reward/std": 0.0022949024569243193, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 747.0625, "completions/mean_terminated_length": 728.6000366210938, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.29320987654321, "grad_norm": 3.3077592750464677, "kl": 0.2061767578125, "learning_rate": 1.3835486107928678e-07, "loss": -0.5235, "num_tokens": 30222065.0, "reward": 0.048837028443813324, "reward_std": 0.03456879034638405, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0021800282411277294, "rewards/logprob_reward/std": 0.0033928067423403263, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 687.9375, "completions/mean_terminated_length": 687.9375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 3.2962962962962963, "grad_norm": 1.8257078762712673, "kl": 0.23486328125, "learning_rate": 1.3790777089400262e-07, "loss": -0.1296, "num_tokens": 30250055.0, "reward": 0.0545763298869133, "reward_std": 0.038820892572402954, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001612587017007172, "rewards/logprob_reward/std": 0.0026066070422530174, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 759.0, "completions/mean_terminated_length": 741.3333740234375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 3.299382716049383, "grad_norm": 2.3746136697516618, "kl": 0.2366943359375, "learning_rate": 1.3746112896123494e-07, "loss": -0.2558, "num_tokens": 30281047.0, "reward": 0.05230496823787689, "reward_std": 0.049082618206739426, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002561077941209078, "rewards/logprob_reward/std": 0.0044354889541864395, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 685.65625, "completions/mean_terminated_length": 685.65625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 3.302469135802469, "grad_norm": 1.94869486964416, "kl": 0.244873046875, "learning_rate": 1.3701493706708768e-07, "loss": -0.1714, "num_tokens": 30309364.0, "reward": 0.04212292283773422, "reward_std": 0.03466665744781494, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0016643566777929664, "rewards/logprob_reward/std": 0.00276436610147357, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 717.09375, "completions/mean_terminated_length": 673.25, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.3055555555555554, "grad_norm": 2.319787808212525, "kl": 0.240966796875, "learning_rate": 1.3656919699586503e-07, "loss": -0.2518, "num_tokens": 30338719.0, "reward": 0.05116596817970276, "reward_std": 0.054804928600788116, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012955210404470563, "rewards/logprob_reward/std": 0.0032878448255360126, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 717.875, "completions/mean_terminated_length": 717.875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 3.308641975308642, "grad_norm": 1.8546660989109238, "kl": 0.243896484375, "learning_rate": 1.3612391053006446e-07, "loss": -0.0398, "num_tokens": 30368087.0, "reward": 0.02947724051773548, "reward_std": 0.03402159735560417, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0015024887397885323, "rewards/logprob_reward/std": 0.0030562926549464464, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 727.3125, "completions/mean_terminated_length": 727.3125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 3.3117283950617282, "grad_norm": 1.8364315932873658, "kl": 0.226318359375, "learning_rate": 1.356790794503694e-07, "loss": -0.0978, "num_tokens": 30398241.0, "reward": 0.05554766207933426, "reward_std": 0.04061705991625786, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0026918482035398483, "rewards/logprob_reward/std": 0.0036417266819626093, "step": 1073 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 746.40625, "completions/mean_terminated_length": 706.7500610351562, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.314814814814815, "grad_norm": 1.750053847049476, "kl": NaN, "learning_rate": 1.3523470553564238e-07, "loss": -0.0674, "num_tokens": 30429078.0, "reward": 0.025996271520853043, "reward_std": 0.020551001653075218, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0011069679167121649, "rewards/logprob_reward/std": 0.002348089125007391, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 738.96875, "completions/mean_terminated_length": 729.774169921875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 3.317901234567901, "grad_norm": 2.5657590551853753, "kl": 0.229736328125, "learning_rate": 1.3479079056291738e-07, "loss": -0.2523, "num_tokens": 30459509.0, "reward": 0.045897215604782104, "reward_std": 0.04792949557304382, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002385790692642331, "rewards/logprob_reward/std": 0.003764881519600749, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 672.5, "completions/mean_terminated_length": 661.1612548828125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 3.3209876543209877, "grad_norm": 2.0646556550799566, "kl": 0.231201171875, "learning_rate": 1.3434733630739345e-07, "loss": -0.0329, "num_tokens": 30487429.0, "reward": 0.046139415353536606, "reward_std": 0.048572394996881485, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0026549044996500015, "rewards/logprob_reward/std": 0.005118787754327059, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 717.28125, "completions/mean_terminated_length": 696.8333740234375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.324074074074074, "grad_norm": 2.222405698943655, "kl": 0.257080078125, "learning_rate": 1.3390434454242704e-07, "loss": -0.1534, "num_tokens": 30517054.0, "reward": 0.051761068403720856, "reward_std": 0.03332982957363129, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.00195673992857337, "rewards/logprob_reward/std": 0.003259633667767048, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 751.625, "completions/mean_terminated_length": 723.4483032226562, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 3.3271604938271606, "grad_norm": 1.8774439793909277, "kl": 0.217041015625, "learning_rate": 1.334618170395254e-07, "loss": -0.1122, "num_tokens": 30547698.0, "reward": 0.03618477284908295, "reward_std": 0.04643512889742851, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.002010858617722988, "rewards/logprob_reward/std": 0.003403040813282132, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 683.46875, "completions/mean_terminated_length": 672.4838256835938, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.330246913580247, "grad_norm": 2.3012361545215336, "kl": 0.24853515625, "learning_rate": 1.3301975556833872e-07, "loss": -0.2881, "num_tokens": 30575953.0, "reward": 0.04187607765197754, "reward_std": 0.04695266857743263, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0013900839257985353, "rewards/logprob_reward/std": 0.0022216225042939186, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 734.90625, "completions/mean_terminated_length": 715.6333618164062, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.3333333333333335, "grad_norm": 2.102076853054117, "kl": 0.223388671875, "learning_rate": 1.3257816189665398e-07, "loss": -0.2217, "num_tokens": 30605794.0, "reward": 0.03226393833756447, "reward_std": 0.04563473165035248, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0011265964712947607, "rewards/logprob_reward/std": 0.002209179336205125, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 707.59375, "completions/mean_terminated_length": 697.3870849609375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 3.3364197530864197, "grad_norm": 1.9624133298170543, "kl": 0.241943359375, "learning_rate": 1.3213703779038726e-07, "loss": -0.1519, "num_tokens": 30635041.0, "reward": 0.04342280328273773, "reward_std": 0.04256758093833923, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0031086672097444534, "rewards/logprob_reward/std": 0.007012155372649431, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 760.5625, "completions/mean_terminated_length": 733.3103637695312, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 3.3395061728395063, "grad_norm": 1.867376771617639, "kl": 0.22021484375, "learning_rate": 1.3169638501357697e-07, "loss": -0.1062, "num_tokens": 30666043.0, "reward": 0.020213942974805832, "reward_std": 0.0332123339176178, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0016266022576019168, "rewards/logprob_reward/std": 0.0027723326347768307, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 731.28125, "completions/mean_terminated_length": 689.4642944335938, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 3.3425925925925926, "grad_norm": 2.2493488637163948, "kl": 0.2119140625, "learning_rate": 1.3125620532837667e-07, "loss": -0.2133, "num_tokens": 30695724.0, "reward": 0.03854992985725403, "reward_std": 0.047263868153095245, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0011665882775560021, "rewards/logprob_reward/std": 0.002320114290341735, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 711.5, "completions/mean_terminated_length": 690.6666870117188, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 3.3456790123456788, "grad_norm": 1.7566383739325062, "kl": 0.234375, "learning_rate": 1.3081650049504784e-07, "loss": -0.2941, "num_tokens": 30724772.0, "reward": 0.05443764105439186, "reward_std": 0.046541810035705566, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001458491780795157, "rewards/logprob_reward/std": 0.0019860758911818266, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 677.4375, "completions/mean_terminated_length": 666.258056640625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 3.3487654320987654, "grad_norm": 2.144118656152997, "kl": 0.2432861328125, "learning_rate": 1.3037727227195333e-07, "loss": -0.0905, "num_tokens": 30752434.0, "reward": 0.05798827111721039, "reward_std": 0.04746585339307785, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0019314105156809092, "rewards/logprob_reward/std": 0.0033570851664990187, "step": 1085 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 710.09375, "completions/mean_terminated_length": 637.6538696289062, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 3.351851851851852, "grad_norm": 2.748219497473315, "kl": NaN, "learning_rate": 1.2993852241554986e-07, "loss": -0.259, "num_tokens": 30781593.0, "reward": 0.035829655826091766, "reward_std": 0.047223057597875595, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.00161628401838243, "rewards/logprob_reward/std": 0.0030803345143795013, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 709.71875, "completions/mean_terminated_length": 688.7667236328125, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 3.3549382716049383, "grad_norm": 2.1144851623470213, "kl": 0.2332763671875, "learning_rate": 1.295002526803813e-07, "loss": -0.1069, "num_tokens": 30810588.0, "reward": 0.03129696473479271, "reward_std": 0.03686773404479027, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0035244047176092863, "rewards/logprob_reward/std": 0.006848264019936323, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 715.375, "completions/mean_terminated_length": 694.800048828125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.3580246913580245, "grad_norm": 2.9618576730531583, "kl": 0.242919921875, "learning_rate": 1.2906246481907145e-07, "loss": -0.4702, "num_tokens": 30839848.0, "reward": 0.03286594897508621, "reward_std": 0.04149939864873886, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0017954995855689049, "rewards/logprob_reward/std": 0.0030578349251300097, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 790.3125, "completions/mean_terminated_length": 724.8800048828125, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 3.361111111111111, "grad_norm": 4.720990537666491, "kl": 0.255859375, "learning_rate": 1.2862516058231718e-07, "loss": -0.4049, "num_tokens": 30871898.0, "reward": 0.03222402185201645, "reward_std": 0.0213757511228323, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.001082246657460928, "rewards/logprob_reward/std": 0.00286446837708354, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 785.15625, "completions/mean_terminated_length": 718.2799682617188, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 3.3641975308641974, "grad_norm": 3.713024794346732, "kl": 0.2628173828125, "learning_rate": 1.2818834171888136e-07, "loss": -0.2556, "num_tokens": 30903675.0, "reward": 0.022724945098161697, "reward_std": 0.02004055865108967, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0009443819872103631, "rewards/logprob_reward/std": 0.002085645915940404, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 691.09375, "completions/mean_terminated_length": 680.3547973632812, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 3.367283950617284, "grad_norm": 2.197632557539477, "kl": 0.2286376953125, "learning_rate": 1.277520099755857e-07, "loss": -0.1771, "num_tokens": 30932190.0, "reward": 0.03833567351102829, "reward_std": 0.04985808953642845, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.00440074922516942, "rewards/logprob_reward/std": 0.00975917000323534, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 756.5, "completions/mean_terminated_length": 728.8275756835938, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 3.3703703703703702, "grad_norm": 2.326730656353155, "kl": 0.2303466796875, "learning_rate": 1.2731616709730428e-07, "loss": -0.2801, "num_tokens": 30962786.0, "reward": 0.03941354900598526, "reward_std": 0.04291396588087082, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0021261684596538544, "rewards/logprob_reward/std": 0.0030039141420274973, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 700.125, "completions/mean_terminated_length": 653.857177734375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 3.373456790123457, "grad_norm": 2.203955018923035, "kl": 0.241943359375, "learning_rate": 1.2688081482695577e-07, "loss": -0.1841, "num_tokens": 30991902.0, "reward": 0.03998091444373131, "reward_std": 0.04864718019962311, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0027565700002014637, "rewards/logprob_reward/std": 0.0038286775816231966, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 720.25, "completions/mean_terminated_length": 664.0, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 3.376543209876543, "grad_norm": 2.854076959202184, "kl": 0.2431640625, "learning_rate": 1.264459549054973e-07, "loss": -0.2387, "num_tokens": 31021362.0, "reward": 0.03293699026107788, "reward_std": 0.04116933047771454, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.001874431036412716, "rewards/logprob_reward/std": 0.0030146704521030188, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 712.3125, "completions/mean_terminated_length": 691.5333862304688, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 3.3796296296296298, "grad_norm": 2.180109317404519, "kl": 0.2454833984375, "learning_rate": 1.2601158907191696e-07, "loss": -0.2958, "num_tokens": 31050376.0, "reward": 0.048220813274383545, "reward_std": 0.053947970271110535, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001495343865826726, "rewards/logprob_reward/std": 0.0025552199222147465, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 685.65625, "completions/mean_terminated_length": 685.65625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 3.382716049382716, "grad_norm": 1.9683328499369184, "kl": 0.2362060546875, "learning_rate": 1.2557771906322704e-07, "loss": -0.0929, "num_tokens": 31078785.0, "reward": 0.03360357880592346, "reward_std": 0.04289977252483368, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0026150825433433056, "rewards/logprob_reward/std": 0.005456762854009867, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 717.5625, "completions/mean_terminated_length": 707.6773681640625, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 3.3858024691358026, "grad_norm": 2.15758395020318, "kl": 0.2281494140625, "learning_rate": 1.2514434661445706e-07, "loss": -0.1629, "num_tokens": 31108331.0, "reward": 0.042262203991413116, "reward_std": 0.049088455736637115, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001819114200770855, "rewards/logprob_reward/std": 0.003913812804967165, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 774.46875, "completions/mean_terminated_length": 748.6551513671875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 3.388888888888889, "grad_norm": 3.5569402046661818, "kl": 0.2369384765625, "learning_rate": 1.2471147345864672e-07, "loss": -0.5862, "num_tokens": 31139710.0, "reward": 0.041524291038513184, "reward_std": 0.05353492870926857, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0009992129635065794, "rewards/logprob_reward/std": 0.0018569540698081255, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 680.84375, "completions/mean_terminated_length": 669.774169921875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.3919753086419755, "grad_norm": 1.8845426581783664, "kl": 0.235595703125, "learning_rate": 1.2427910132683928e-07, "loss": -0.0689, "num_tokens": 31167677.0, "reward": 0.05112278461456299, "reward_std": 0.04662879928946495, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012475401163101196, "rewards/logprob_reward/std": 0.0020419848151504993, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 789.28125, "completions/mean_terminated_length": 781.7096557617188, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 3.3950617283950617, "grad_norm": 1.964521616890569, "kl": 0.2049560546875, "learning_rate": 1.2384723194807408e-07, "loss": -0.2422, "num_tokens": 31199510.0, "reward": 0.05434644967317581, "reward_std": 0.04874827712774277, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0013571643503382802, "rewards/logprob_reward/std": 0.0026882714591920376, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 647.9375, "completions/mean_terminated_length": 635.8064575195312, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 3.398148148148148, "grad_norm": 2.907604830438677, "kl": 0.2667236328125, "learning_rate": 1.234158670493803e-07, "loss": -0.226, "num_tokens": 31226296.0, "reward": 0.022591644898056984, "reward_std": 0.038996174931526184, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0007962724776007235, "rewards/logprob_reward/std": 0.0012763147242367268, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 666.357177734375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 3.4012345679012346, "grad_norm": 2.917099739070145, "kl": 0.2615966796875, "learning_rate": 1.229850083557695e-07, "loss": -0.374, "num_tokens": 31255282.0, "reward": 0.04276447743177414, "reward_std": 0.05232639238238335, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002377194818109274, "rewards/logprob_reward/std": 0.0037482907064259052, "step": 1102 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 709.46875, "completions/mean_terminated_length": 709.46875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 3.4043209876543212, "grad_norm": 2.104831814389895, "kl": NaN, "learning_rate": 1.2255465759022913e-07, "loss": -0.1829, "num_tokens": 31284433.0, "reward": 0.025428790599107742, "reward_std": 0.03470895066857338, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.00047643345897085965, "rewards/logprob_reward/std": 0.0010723703308030963, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 734.65625, "completions/mean_terminated_length": 734.65625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 3.4074074074074074, "grad_norm": 1.9420649150287443, "kl": 0.2213134765625, "learning_rate": 1.2212481647371542e-07, "loss": -0.1529, "num_tokens": 31314102.0, "reward": 0.041694849729537964, "reward_std": 0.0320030152797699, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0011887189466506243, "rewards/logprob_reward/std": 0.0022691090125590563, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 729.34375, "completions/mean_terminated_length": 698.862060546875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.4104938271604937, "grad_norm": 2.578077322286555, "kl": 0.22021484375, "learning_rate": 1.2169548672514625e-07, "loss": -0.2975, "num_tokens": 31343713.0, "reward": 0.035099804401397705, "reward_std": 0.04647810012102127, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008053354104049504, "rewards/logprob_reward/std": 0.0015714397886767983, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 753.03125, "completions/mean_terminated_length": 744.290283203125, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 3.4135802469135803, "grad_norm": 2.1314975986171705, "kl": 0.1986083984375, "learning_rate": 1.2126667006139495e-07, "loss": -0.076, "num_tokens": 31374750.0, "reward": 0.052901607006788254, "reward_std": 0.05452405661344528, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0032240066211670637, "rewards/logprob_reward/std": 0.006283770315349102, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 722.0625, "completions/mean_terminated_length": 690.8275756835938, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.4166666666666665, "grad_norm": 1.9101388093195726, "kl": 0.214111328125, "learning_rate": 1.208383681972829e-07, "loss": -0.1831, "num_tokens": 31404168.0, "reward": 0.053703077137470245, "reward_std": 0.04286354407668114, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.004114528186619282, "rewards/logprob_reward/std": 0.004552820231765509, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 697.0, "completions/mean_terminated_length": 686.4515991210938, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 3.419753086419753, "grad_norm": 2.826444952568141, "kl": 0.2506103515625, "learning_rate": 1.2041058284557277e-07, "loss": -0.1742, "num_tokens": 31432660.0, "reward": 0.03298354148864746, "reward_std": 0.04296446591615677, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0019261582056060433, "rewards/logprob_reward/std": 0.0031893160194158554, "step": 1108 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 783.125, "completions/mean_terminated_length": 758.2069091796875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 3.4228395061728394, "grad_norm": 2.1305928869403727, "kl": NaN, "learning_rate": 1.1998331571696162e-07, "loss": -0.0304, "num_tokens": 31465048.0, "reward": 0.03501129895448685, "reward_std": 0.0384102463722229, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.004179219249635935, "rewards/logprob_reward/std": 0.016072003170847893, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 676.03125, "completions/mean_terminated_length": 652.8333740234375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 3.425925925925926, "grad_norm": 2.4360467739943394, "kl": 0.2364501953125, "learning_rate": 1.1955656852007438e-07, "loss": -0.1763, "num_tokens": 31493377.0, "reward": 0.04730891436338425, "reward_std": 0.04944960027933121, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00395435094833374, "rewards/logprob_reward/std": 0.011206968687474728, "step": 1110 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 694.46875, "completions/mean_terminated_length": 694.46875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 3.4290123456790123, "grad_norm": 2.8233126079081936, "kl": NaN, "learning_rate": 1.1913034296145669e-07, "loss": -0.3677, "num_tokens": 31521904.0, "reward": 0.045856691896915436, "reward_std": 0.04630706459283829, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0023407696280628443, "rewards/logprob_reward/std": 0.0050558545626699924, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 667.53125, "completions/mean_terminated_length": 643.7667236328125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 3.432098765432099, "grad_norm": 2.028474611018986, "kl": 0.2296142578125, "learning_rate": 1.1870464074556816e-07, "loss": -0.0693, "num_tokens": 31549281.0, "reward": 0.03846452385187149, "reward_std": 0.04738791286945343, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0010716933757066727, "rewards/logprob_reward/std": 0.001394196879118681, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 721.375, "completions/mean_terminated_length": 711.6128540039062, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 3.435185185185185, "grad_norm": 2.105570556456589, "kl": 0.242919921875, "learning_rate": 1.1827946357477559e-07, "loss": -0.1779, "num_tokens": 31579001.0, "reward": 0.03864138573408127, "reward_std": 0.04990891367197037, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00126820825971663, "rewards/logprob_reward/std": 0.0032743855845183134, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 651.28125, "completions/mean_terminated_length": 639.258056640625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.4382716049382718, "grad_norm": 2.030575097798009, "kl": 0.2626953125, "learning_rate": 1.1785481314934618e-07, "loss": -0.2432, "num_tokens": 31606130.0, "reward": 0.06310325860977173, "reward_std": 0.047171108424663544, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0006702886312268674, "rewards/logprob_reward/std": 0.0013811569660902023, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 735.125, "completions/mean_terminated_length": 725.8064575195312, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.441358024691358, "grad_norm": 2.0024161143975756, "kl": 0.226806640625, "learning_rate": 1.1743069116744064e-07, "loss": -0.0735, "num_tokens": 31636686.0, "reward": 0.036736417561769485, "reward_std": 0.04932186007499695, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0026237964630126953, "rewards/logprob_reward/std": 0.003783321939408779, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 720.9375, "completions/mean_terminated_length": 700.7333984375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.4444444444444446, "grad_norm": 1.6023836770894058, "kl": 0.2440185546875, "learning_rate": 1.1700709932510656e-07, "loss": -0.0023, "num_tokens": 31666408.0, "reward": 0.026854412630200386, "reward_std": 0.03465839475393295, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0020604589954018593, "rewards/logprob_reward/std": 0.0032155991066247225, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 771.84375, "completions/mean_terminated_length": 745.7586059570312, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 3.447530864197531, "grad_norm": 2.294279853209709, "kl": 0.2257080078125, "learning_rate": 1.1658403931627125e-07, "loss": -0.2441, "num_tokens": 31697615.0, "reward": 0.032512031495571136, "reward_std": 0.03311271220445633, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014022570103406906, "rewards/logprob_reward/std": 0.0018350256141275167, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 669.78125, "completions/mean_terminated_length": 658.3547973632812, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 3.450617283950617, "grad_norm": 1.8916531530781802, "kl": 0.2376708984375, "learning_rate": 1.1616151283273565e-07, "loss": -0.0906, "num_tokens": 31725732.0, "reward": 0.03587503358721733, "reward_std": 0.039832498878240585, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0016667036106809974, "rewards/logprob_reward/std": 0.002389201894402504, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 745.28125, "completions/mean_terminated_length": 745.28125, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 3.4537037037037037, "grad_norm": 2.1417357408168645, "kl": 0.2279052734375, "learning_rate": 1.1573952156416672e-07, "loss": -0.1894, "num_tokens": 31756221.0, "reward": 0.04853258654475212, "reward_std": 0.04154781997203827, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018417645478621125, "rewards/logprob_reward/std": 0.0029962125699967146, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 724.34375, "completions/mean_terminated_length": 693.3448486328125, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 3.45679012345679, "grad_norm": 2.1271526220525567, "kl": 0.234130859375, "learning_rate": 1.1531806719809142e-07, "loss": -0.1871, "num_tokens": 31786008.0, "reward": 0.04200819879770279, "reward_std": 0.04148890823125839, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015368824824690819, "rewards/logprob_reward/std": 0.002423949772492051, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 713.0, "completions/mean_terminated_length": 692.2667236328125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 3.4598765432098766, "grad_norm": 1.8761988556721207, "kl": 0.2294921875, "learning_rate": 1.1489715141988954e-07, "loss": -0.1463, "num_tokens": 31815312.0, "reward": 0.038898445665836334, "reward_std": 0.04714834690093994, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015538274310529232, "rewards/logprob_reward/std": 0.0023342163767665625, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 730.15625, "completions/mean_terminated_length": 699.7586059570312, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 3.462962962962963, "grad_norm": 2.6330904755628635, "kl": 0.2239990234375, "learning_rate": 1.1447677591278715e-07, "loss": -0.1677, "num_tokens": 31844745.0, "reward": 0.035472720861434937, "reward_std": 0.04140324890613556, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0012196903117001057, "rewards/logprob_reward/std": 0.001673478283919394, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 678.4375, "completions/mean_terminated_length": 678.4375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 3.4660493827160495, "grad_norm": 1.8465228167080314, "kl": 0.2327880859375, "learning_rate": 1.1405694235784972e-07, "loss": -0.075, "num_tokens": 31872487.0, "reward": 0.06046842038631439, "reward_std": 0.05312386155128479, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0012149163521826267, "rewards/logprob_reward/std": 0.0021992295514792204, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 763.71875, "completions/mean_terminated_length": 715.5184936523438, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 3.4691358024691357, "grad_norm": 2.304488976219399, "kl": 0.2335205078125, "learning_rate": 1.1363765243397555e-07, "loss": -0.1157, "num_tokens": 31903734.0, "reward": 0.02951209619641304, "reward_std": 0.03832588717341423, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0015412173233926296, "rewards/logprob_reward/std": 0.002513695042580366, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 708.5, "completions/mean_terminated_length": 698.3225708007812, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.4722222222222223, "grad_norm": 2.0540712434195814, "kl": 0.2178955078125, "learning_rate": 1.1321890781788884e-07, "loss": -0.1512, "num_tokens": 31933194.0, "reward": 0.05982912331819534, "reward_std": 0.05028437077999115, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003976802341639996, "rewards/logprob_reward/std": 0.004956958349794149, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 688.40625, "completions/mean_terminated_length": 666.0333862304688, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 3.4753086419753085, "grad_norm": 2.110734419144824, "kl": 0.2159423828125, "learning_rate": 1.1280071018413326e-07, "loss": -0.2126, "num_tokens": 31961363.0, "reward": 0.044660262763500214, "reward_std": 0.047581709921360016, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0010113996686413884, "rewards/logprob_reward/std": 0.0016988972201943398, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 681.59375, "completions/mean_terminated_length": 681.59375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 3.478395061728395, "grad_norm": 1.889477444872169, "kl": 0.2265625, "learning_rate": 1.1238306120506505e-07, "loss": -0.1439, "num_tokens": 31989490.0, "reward": 0.0637340098619461, "reward_std": 0.04231216013431549, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0013711245264858007, "rewards/logprob_reward/std": 0.0019430826650932431, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 724.5625, "completions/mean_terminated_length": 714.9031982421875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 3.4814814814814814, "grad_norm": 2.9944010607198024, "kl": 0.232666015625, "learning_rate": 1.1196596255084648e-07, "loss": -0.2672, "num_tokens": 32019300.0, "reward": 0.038902007043361664, "reward_std": 0.03860536217689514, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00155778625048697, "rewards/logprob_reward/std": 0.0023658720310777426, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 710.40625, "completions/mean_terminated_length": 689.5000610351562, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.484567901234568, "grad_norm": 2.3202481339552774, "kl": 0.21337890625, "learning_rate": 1.11549415889439e-07, "loss": -0.1087, "num_tokens": 32048165.0, "reward": 0.035354673862457275, "reward_std": 0.04831884428858757, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0010885288938879967, "rewards/logprob_reward/std": 0.0017892775358632207, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 738.46875, "completions/mean_terminated_length": 719.433349609375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 3.4876543209876543, "grad_norm": 1.9789032421868742, "kl": 0.2420654296875, "learning_rate": 1.1113342288659683e-07, "loss": -0.0749, "num_tokens": 32078748.0, "reward": 0.030769433826208115, "reward_std": 0.02733692154288292, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002938260091468692, "rewards/logprob_reward/std": 0.004040616098791361, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 700.90625, "completions/mean_terminated_length": 700.90625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.490740740740741, "grad_norm": 2.187451897590174, "kl": 0.2154541015625, "learning_rate": 1.1071798520585979e-07, "loss": -0.1702, "num_tokens": 32107581.0, "reward": 0.06450729072093964, "reward_std": 0.050118058919906616, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0022303247824311256, "rewards/logprob_reward/std": 0.0029016912449151278, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 770.84375, "completions/mean_terminated_length": 753.9667358398438, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 3.493827160493827, "grad_norm": 1.7180303555451324, "kl": 0.2032470703125, "learning_rate": 1.1030310450854729e-07, "loss": -0.2091, "num_tokens": 32138784.0, "reward": 0.029544800519943237, "reward_std": 0.03377370163798332, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0015775556676089764, "rewards/logprob_reward/std": 0.0020715794526040554, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 706.4375, "completions/mean_terminated_length": 673.586181640625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.496913580246914, "grad_norm": 2.8185615779128224, "kl": 0.2470703125, "learning_rate": 1.0988878245375138e-07, "loss": -0.4412, "num_tokens": 32168162.0, "reward": 0.06334048509597778, "reward_std": 0.028165031224489212, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.004406091757118702, "rewards/logprob_reward/std": 0.004128764383494854, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 665.96875, "completions/mean_terminated_length": 654.4193115234375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 3.5, "grad_norm": 2.2747570750255672, "kl": 0.2449951171875, "learning_rate": 1.094750206983299e-07, "loss": -0.0658, "num_tokens": 32195185.0, "reward": 0.05119810998439789, "reward_std": 0.04677068069577217, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0013312329538166523, "rewards/logprob_reward/std": 0.0021630190312862396, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 787.46875, "completions/mean_terminated_length": 743.6666870117188, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 3.503086419753086, "grad_norm": 1.8363172237592025, "kl": 0.202880859375, "learning_rate": 1.0906182089690025e-07, "loss": -0.0546, "num_tokens": 32227168.0, "reward": 0.04166097193956375, "reward_std": 0.02731870487332344, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0011510797776281834, "rewards/logprob_reward/std": 0.002137968083843589, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 635.125, "completions/mean_terminated_length": 635.125, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 3.506172839506173, "grad_norm": 2.1241744368102093, "kl": 0.266845703125, "learning_rate": 1.0864918470183258e-07, "loss": -0.266, "num_tokens": 32253412.0, "reward": 0.05826055258512497, "reward_std": 0.055479831993579865, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002233944833278656, "rewards/logprob_reward/std": 0.004511518403887749, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 701.3125, "completions/mean_terminated_length": 679.800048828125, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 3.5092592592592595, "grad_norm": 2.6466497665544155, "kl": 0.233642578125, "learning_rate": 1.0823711376324313e-07, "loss": -0.1377, "num_tokens": 32282082.0, "reward": 0.05135006457567215, "reward_std": 0.04925466328859329, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0015000696294009686, "rewards/logprob_reward/std": 0.0025531058199703693, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 747.3125, "completions/mean_terminated_length": 738.3870849609375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 3.5123456790123457, "grad_norm": 3.4678811944566195, "kl": 0.2003173828125, "learning_rate": 1.0782560972898783e-07, "loss": -0.4641, "num_tokens": 32312304.0, "reward": 0.04236939549446106, "reward_std": 0.05416145175695419, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0019382155733183026, "rewards/logprob_reward/std": 0.0023308454547077417, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 785.09375, "completions/mean_terminated_length": 760.3793334960938, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 3.515432098765432, "grad_norm": 2.3914373290431157, "kl": 0.18701171875, "learning_rate": 1.0741467424465544e-07, "loss": -0.1969, "num_tokens": 32343919.0, "reward": 0.025426620617508888, "reward_std": 0.04071640968322754, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.003946245182305574, "rewards/logprob_reward/std": 0.007611100561916828, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 736.34375, "completions/mean_terminated_length": 706.586181640625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.5185185185185186, "grad_norm": 1.8549279697319916, "kl": 0.2083740234375, "learning_rate": 1.0700430895356119e-07, "loss": -0.0428, "num_tokens": 32374274.0, "reward": 0.05179382860660553, "reward_std": 0.03939950466156006, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0019931443966925144, "rewards/logprob_reward/std": 0.00250457925722003, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 682.5, "completions/mean_terminated_length": 659.7333374023438, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 3.521604938271605, "grad_norm": 2.1549472574588724, "kl": 0.2467041015625, "learning_rate": 1.0659451549674018e-07, "loss": -0.201, "num_tokens": 32402554.0, "reward": 0.04312340170145035, "reward_std": 0.04689634591341019, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002776003908365965, "rewards/logprob_reward/std": 0.004801588132977486, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 727.40625, "completions/mean_terminated_length": 696.72412109375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 3.5246913580246915, "grad_norm": 2.0764939972234004, "kl": 0.214111328125, "learning_rate": 1.0618529551294053e-07, "loss": -0.0672, "num_tokens": 32432175.0, "reward": 0.03544144332408905, "reward_std": 0.04791321977972984, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011849362635985017, "rewards/logprob_reward/std": 0.002227792516350746, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 707.90625, "completions/mean_terminated_length": 675.2069091796875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.5277777777777777, "grad_norm": 2.093093067534423, "kl": 0.223388671875, "learning_rate": 1.0577665063861735e-07, "loss": -0.154, "num_tokens": 32461232.0, "reward": 0.05621867626905441, "reward_std": 0.05492142587900162, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.01732630468904972, "rewards/logprob_reward/std": 0.04858780652284622, "step": 1143 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 753.9375, "completions/mean_terminated_length": 715.357177734375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 3.5308641975308643, "grad_norm": 1.7649380791762947, "kl": NaN, "learning_rate": 1.0536858250792582e-07, "loss": -0.1807, "num_tokens": 32492118.0, "reward": 0.03603683412075043, "reward_std": 0.03452879190444946, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001846479601226747, "rewards/logprob_reward/std": 0.0036165211349725723, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 663.96875, "completions/mean_terminated_length": 652.3547973632812, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 3.5339506172839505, "grad_norm": 1.8531141622542997, "kl": 0.2381591796875, "learning_rate": 1.0496109275271456e-07, "loss": -0.0774, "num_tokens": 32519561.0, "reward": 0.054988741874694824, "reward_std": 0.041676297783851624, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0020708239171653986, "rewards/logprob_reward/std": 0.0025657275691628456, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 759.625, "completions/mean_terminated_length": 742.0000610351562, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.537037037037037, "grad_norm": 2.59980971611076, "kl": 0.19775390625, "learning_rate": 1.0455418300251953e-07, "loss": -0.2394, "num_tokens": 32550661.0, "reward": 0.02973213419318199, "reward_std": 0.0364789180457592, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0017857032362371683, "rewards/logprob_reward/std": 0.0033551438245922327, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 765.15625, "completions/mean_terminated_length": 728.1785888671875, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 3.5401234567901234, "grad_norm": 3.37755020614535, "kl": 0.2130126953125, "learning_rate": 1.0414785488455718e-07, "loss": -0.4422, "num_tokens": 32581882.0, "reward": 0.039593394845724106, "reward_std": 0.048004984855651855, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002325995359569788, "rewards/logprob_reward/std": 0.002745207166299224, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 752.4375, "completions/mean_terminated_length": 724.3448486328125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 3.5432098765432096, "grad_norm": 2.5484335148892345, "kl": 0.201171875, "learning_rate": 1.0374211002371808e-07, "loss": -0.2391, "num_tokens": 32612820.0, "reward": 0.03568592667579651, "reward_std": 0.01441339310258627, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014565885066986084, "rewards/logprob_reward/std": 0.002635025419294834, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 700.59375, "completions/mean_terminated_length": 690.1612548828125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.5462962962962963, "grad_norm": 2.227758388455807, "kl": 0.2174072265625, "learning_rate": 1.0333695004256035e-07, "loss": -0.1032, "num_tokens": 32641467.0, "reward": 0.03294813632965088, "reward_std": 0.04731585085391998, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.001886819489300251, "rewards/logprob_reward/std": 0.003531733760610223, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 713.84375, "completions/mean_terminated_length": 703.8386840820312, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 3.549382716049383, "grad_norm": 2.1803390550057085, "kl": 0.23193359375, "learning_rate": 1.0293237656130304e-07, "loss": -0.1421, "num_tokens": 32670570.0, "reward": 0.04645146429538727, "reward_std": 0.04729815572500229, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0030016263481229544, "rewards/logprob_reward/std": 0.005217393394559622, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 773.28125, "completions/mean_terminated_length": 715.423095703125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.552469135802469, "grad_norm": 2.0905799438129216, "kl": 0.245849609375, "learning_rate": 1.0252839119782006e-07, "loss": -0.1921, "num_tokens": 32702179.0, "reward": 0.04366558417677879, "reward_std": 0.047508589923381805, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003378424560651183, "rewards/logprob_reward/std": 0.005873650312423706, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 732.90625, "completions/mean_terminated_length": 702.7930908203125, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 3.5555555555555554, "grad_norm": 1.834207846084838, "kl": 0.21826171875, "learning_rate": 1.0212499556763335e-07, "loss": -0.0765, "num_tokens": 32731848.0, "reward": 0.03870991989970207, "reward_std": 0.047259245067834854, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001344357617199421, "rewards/logprob_reward/std": 0.002039544750005007, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 739.8125, "completions/mean_terminated_length": 687.1851806640625, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.558641975308642, "grad_norm": 1.70906949737979, "kl": 0.2301025390625, "learning_rate": 1.017221912839065e-07, "loss": -0.079, "num_tokens": 32762650.0, "reward": 0.02254810370504856, "reward_std": 0.028371116146445274, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0007478923653252423, "rewards/logprob_reward/std": 0.001637790584936738, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 725.84375, "completions/mean_terminated_length": 705.9666748046875, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 3.5617283950617287, "grad_norm": 1.8843546376830838, "kl": 0.2266845703125, "learning_rate": 1.0131997995743838e-07, "loss": -0.1384, "num_tokens": 32792361.0, "reward": 0.029994942247867584, "reward_std": 0.04116266965866089, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002077713841572404, "rewards/logprob_reward/std": 0.002635175595059991, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 765.75, "completions/mean_terminated_length": 728.857177734375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 3.564814814814815, "grad_norm": 1.9939235544558975, "kl": 0.2037353515625, "learning_rate": 1.0091836319665664e-07, "loss": -0.0247, "num_tokens": 32823197.0, "reward": 0.046854279935359955, "reward_std": 0.05307752639055252, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003449202748015523, "rewards/logprob_reward/std": 0.0035610662307590246, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 763.8125, "completions/mean_terminated_length": 703.769287109375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 3.567901234567901, "grad_norm": 2.127539500228586, "kl": 0.226806640625, "learning_rate": 1.0051734260761135e-07, "loss": -0.0656, "num_tokens": 32853927.0, "reward": 0.03639114648103714, "reward_std": 0.04656834155321121, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.002240160945802927, "rewards/logprob_reward/std": 0.003256736323237419, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 711.0625, "completions/mean_terminated_length": 700.9677124023438, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 3.5709876543209877, "grad_norm": 2.2089161764476786, "kl": 0.219482421875, "learning_rate": 1.0011691979396827e-07, "loss": -0.0961, "num_tokens": 32883197.0, "reward": 0.045687802135944366, "reward_std": 0.02866285666823387, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021531146485358477, "rewards/logprob_reward/std": 0.005218836013227701, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 733.25, "completions/mean_terminated_length": 713.86669921875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 3.574074074074074, "grad_norm": 2.585901571693692, "kl": 0.221923828125, "learning_rate": 9.971709635700301e-08, "loss": -0.2335, "num_tokens": 32912889.0, "reward": 0.03194165229797363, "reward_std": 0.05235572159290314, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0007685050368309021, "rewards/logprob_reward/std": 0.0014013643376529217, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 778.5625, "completions/mean_terminated_length": 721.923095703125, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 3.5771604938271606, "grad_norm": 2.5027511395310427, "kl": 0.2294921875, "learning_rate": 9.931787389559393e-08, "loss": -0.2085, "num_tokens": 32944799.0, "reward": 0.03651416301727295, "reward_std": 0.03996749222278595, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0023768495302647352, "rewards/logprob_reward/std": 0.0031294492073357105, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 729.65625, "completions/mean_terminated_length": 710.0333862304688, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.580246913580247, "grad_norm": 1.916702283811012, "kl": 0.21142578125, "learning_rate": 9.891925400621642e-08, "loss": -0.0708, "num_tokens": 32974408.0, "reward": 0.045670777559280396, "reward_std": 0.04736469313502312, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021341973915696144, "rewards/logprob_reward/std": 0.003208830486983061, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 675.90625, "completions/mean_terminated_length": 664.6774291992188, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 3.5833333333333335, "grad_norm": 2.309206694151157, "kl": 0.2421875, "learning_rate": 9.852123828293612e-08, "loss": -0.1358, "num_tokens": 33002549.0, "reward": 0.038855329155921936, "reward_std": 0.0476827509701252, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015059204306453466, "rewards/logprob_reward/std": 0.0023971146438270807, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 714.90625, "completions/mean_terminated_length": 704.9354858398438, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 3.5864197530864197, "grad_norm": 2.4419855450292083, "kl": 0.24462890625, "learning_rate": 9.812382831740259e-08, "loss": -0.1761, "num_tokens": 33032410.0, "reward": 0.03587932139635086, "reward_std": 0.05150279775261879, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0016714700032025576, "rewards/logprob_reward/std": 0.0032537688966840506, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 688.65625, "completions/mean_terminated_length": 688.65625, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.5895061728395063, "grad_norm": 2.154061690190303, "kl": 0.220458984375, "learning_rate": 9.772702569884301e-08, "loss": -0.1281, "num_tokens": 33061115.0, "reward": 0.04232947528362274, "reward_std": 0.054103728383779526, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001893858890980482, "rewards/logprob_reward/std": 0.00251575093716383, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 744.75, "completions/mean_terminated_length": 715.862060546875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 3.5925925925925926, "grad_norm": 3.837670779890254, "kl": 0.2205810546875, "learning_rate": 9.733083201405578e-08, "loss": -0.3855, "num_tokens": 33091775.0, "reward": 0.026944581419229507, "reward_std": 0.0369250513613224, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0021606460213661194, "rewards/logprob_reward/std": 0.004713218659162521, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 740.0, "completions/mean_terminated_length": 710.6206665039062, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.5956790123456788, "grad_norm": 1.7853595308513794, "kl": 0.2216796875, "learning_rate": 9.693524884740425e-08, "loss": -0.0343, "num_tokens": 33121851.0, "reward": 0.055697403848171234, "reward_std": 0.04853183776140213, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0028582289814949036, "rewards/logprob_reward/std": 0.0037844711914658546, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 745.6875, "completions/mean_terminated_length": 705.9285888671875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 3.5987654320987654, "grad_norm": 1.9216443876331215, "kl": 0.2117919921875, "learning_rate": 9.654027778081042e-08, "loss": -0.2153, "num_tokens": 33152401.0, "reward": 0.04970502480864525, "reward_std": 0.04187548905611038, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003144473535940051, "rewards/logprob_reward/std": 0.0032250788062810898, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 687.15625, "completions/mean_terminated_length": 687.15625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 3.601851851851852, "grad_norm": 2.6335268914583527, "kl": 0.237060546875, "learning_rate": 9.614592039374817e-08, "loss": -0.1578, "num_tokens": 33181054.0, "reward": 0.04822558909654617, "reward_std": 0.05351199582219124, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0015006531029939651, "rewards/logprob_reward/std": 0.0025360500440001488, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 737.21875, "completions/mean_terminated_length": 718.1000366210938, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 3.6049382716049383, "grad_norm": 2.108476036100173, "kl": 0.2164306640625, "learning_rate": 9.575217826323761e-08, "loss": -0.1678, "num_tokens": 33210873.0, "reward": 0.03886102885007858, "reward_std": 0.05453537404537201, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015122548211365938, "rewards/logprob_reward/std": 0.002545837312936783, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 705.09375, "completions/mean_terminated_length": 683.8333740234375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.6080246913580245, "grad_norm": 2.093434761242607, "kl": 0.220458984375, "learning_rate": 9.535905296383848e-08, "loss": -0.1648, "num_tokens": 33239724.0, "reward": 0.04284374788403511, "reward_std": 0.05345625802874565, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0024652741849422455, "rewards/logprob_reward/std": 0.004442441742867231, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 761.84375, "completions/mean_terminated_length": 744.36669921875, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 3.611111111111111, "grad_norm": 2.5958651505623527, "kl": 0.2073974609375, "learning_rate": 9.496654606764373e-08, "loss": -0.4292, "num_tokens": 33270991.0, "reward": 0.03993716090917587, "reward_std": 0.040980130434036255, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002707959618419409, "rewards/logprob_reward/std": 0.0048716869205236435, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 730.34375, "completions/mean_terminated_length": 699.9655151367188, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 3.6141975308641974, "grad_norm": 2.0841017528140022, "kl": 0.2457275390625, "learning_rate": 9.457465914427326e-08, "loss": -0.1026, "num_tokens": 33301178.0, "reward": 0.03388774394989014, "reward_std": 0.034075312316417694, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002930824179202318, "rewards/logprob_reward/std": 0.006740282755345106, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 737.0625, "completions/mean_terminated_length": 717.933349609375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.617283950617284, "grad_norm": 2.087094990381655, "kl": 0.2344970703125, "learning_rate": 9.418339376086785e-08, "loss": -0.1951, "num_tokens": 33331128.0, "reward": 0.0337870828807354, "reward_std": 0.04293426126241684, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002818981185555458, "rewards/logprob_reward/std": 0.0037272456102073193, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 710.625, "completions/mean_terminated_length": 689.7333984375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.6203703703703702, "grad_norm": 2.4310665200823744, "kl": 0.2216796875, "learning_rate": 9.379275148208276e-08, "loss": -0.1287, "num_tokens": 33360504.0, "reward": 0.033377714455127716, "reward_std": 0.03862452134490013, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002364126965403557, "rewards/logprob_reward/std": 0.0032782454509288073, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 731.0625, "completions/mean_terminated_length": 676.8148193359375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 3.623456790123457, "grad_norm": 1.8869975444742286, "kl": 0.24365234375, "learning_rate": 9.340273387008152e-08, "loss": -0.0798, "num_tokens": 33390974.0, "reward": 0.02314150519669056, "reward_std": 0.028033804148435593, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0014072273625060916, "rewards/logprob_reward/std": 0.0022848050575703382, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 711.75, "completions/mean_terminated_length": 711.75, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.626543209876543, "grad_norm": 2.198024711927301, "kl": 0.2171630859375, "learning_rate": 9.30133424845294e-08, "loss": -0.1943, "num_tokens": 33420078.0, "reward": 0.04502446576952934, "reward_std": 0.05276203155517578, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0014160738792270422, "rewards/logprob_reward/std": 0.0021478289272636175, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 694.5, "completions/mean_terminated_length": 672.5333862304688, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 3.6296296296296298, "grad_norm": 2.229631533463387, "kl": 0.2266845703125, "learning_rate": 9.26245788825877e-08, "loss": -0.1133, "num_tokens": 33448962.0, "reward": 0.05338180810213089, "reward_std": 0.04487844929099083, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0037575638853013515, "rewards/logprob_reward/std": 0.005705379415303469, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 792.875, "completions/mean_terminated_length": 715.8333740234375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 3.632716049382716, "grad_norm": 1.7076910558267198, "kl": 0.228515625, "learning_rate": 9.223644461890711e-08, "loss": 0.0117, "num_tokens": 33481466.0, "reward": 0.03370305895805359, "reward_std": 0.042882040143013, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.002725622383877635, "rewards/logprob_reward/std": 0.004249474499374628, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 720.59375, "completions/mean_terminated_length": 677.25, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 3.6358024691358026, "grad_norm": 1.7917361604252584, "kl": 0.2369384765625, "learning_rate": 9.184894124562162e-08, "loss": -0.0305, "num_tokens": 33511005.0, "reward": 0.03542282432317734, "reward_std": 0.0337524339556694, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0011642478639259934, "rewards/logprob_reward/std": 0.0017650318332016468, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 707.375, "completions/mean_terminated_length": 697.1612548828125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 3.638888888888889, "grad_norm": 2.7640861439708604, "kl": 0.2716064453125, "learning_rate": 9.146207031234232e-08, "loss": -0.148, "num_tokens": 33539557.0, "reward": 0.02736113965511322, "reward_std": 0.04650455713272095, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0026234870310872793, "rewards/logprob_reward/std": 0.0050385938957333565, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 678.0, "completions/mean_terminated_length": 666.8386840820312, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 3.6419753086419755, "grad_norm": 1.9242357904456104, "kl": 0.224853515625, "learning_rate": 9.107583336615124e-08, "loss": -0.1959, "num_tokens": 33567653.0, "reward": 0.04546947404742241, "reward_std": 0.04752422869205475, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0019105253741145134, "rewards/logprob_reward/std": 0.0036665520165115595, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 787.1875, "completions/mean_terminated_length": 771.4000244140625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 3.6450617283950617, "grad_norm": 2.109700825889162, "kl": 0.201904296875, "learning_rate": 9.069023195159505e-08, "loss": -0.224, "num_tokens": 33599795.0, "reward": 0.047613054513931274, "reward_std": 0.045055896043777466, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.004292279947549105, "rewards/logprob_reward/std": 0.011248183436691761, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 742.09375, "completions/mean_terminated_length": 701.8214721679688, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 3.648148148148148, "grad_norm": 1.9162535355414676, "kl": 0.210205078125, "learning_rate": 9.030526761067911e-08, "loss": -0.2148, "num_tokens": 33630026.0, "reward": 0.039342716336250305, "reward_std": 0.0419294573366642, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002047464484348893, "rewards/logprob_reward/std": 0.0057477750815451145, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 781.875, "completions/mean_terminated_length": 737.0370483398438, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 3.6512345679012346, "grad_norm": 1.9041115803781818, "kl": 0.2135009765625, "learning_rate": 8.992094188286081e-08, "loss": -0.2172, "num_tokens": 33661490.0, "reward": 0.047771573066711426, "reward_std": 0.04141303151845932, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.000996189541183412, "rewards/logprob_reward/std": 0.0018626094097271562, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 746.3125, "completions/mean_terminated_length": 727.800048828125, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 3.6543209876543212, "grad_norm": 2.168200078517223, "kl": 0.216552734375, "learning_rate": 8.953725630504419e-08, "loss": -0.1496, "num_tokens": 33692164.0, "reward": 0.03015190362930298, "reward_std": 0.04134177416563034, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0022521147038787603, "rewards/logprob_reward/std": 0.003899810602888465, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 661.8125, "completions/mean_terminated_length": 650.1290283203125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 3.6574074074074074, "grad_norm": 1.9923295968789287, "kl": 0.2342529296875, "learning_rate": 8.915421241157292e-08, "loss": -0.1572, "num_tokens": 33719854.0, "reward": 0.06792387366294861, "reward_std": 0.049035366624593735, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.006026518531143665, "rewards/logprob_reward/std": 0.014248420484364033, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 711.90625, "completions/mean_terminated_length": 691.1000366210938, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 3.6604938271604937, "grad_norm": 2.452255438851298, "kl": 0.236328125, "learning_rate": 8.877181173422487e-08, "loss": -0.2436, "num_tokens": 33749215.0, "reward": 0.03879091516137123, "reward_std": 0.048295870423316956, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001434349687770009, "rewards/logprob_reward/std": 0.002998590236529708, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 730.0, "completions/mean_terminated_length": 675.5555419921875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 3.6635802469135803, "grad_norm": 1.8964564230924912, "kl": 0.2518310546875, "learning_rate": 8.839005580220574e-08, "loss": -0.156, "num_tokens": 33779163.0, "reward": 0.051121100783348083, "reward_std": 0.027093954384326935, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012456680415198207, "rewards/logprob_reward/std": 0.0023642387241125107, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 707.6875, "completions/mean_terminated_length": 686.6000366210938, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 3.6666666666666665, "grad_norm": 2.698063869656446, "kl": 0.2183837890625, "learning_rate": 8.800894614214274e-08, "loss": -0.1528, "num_tokens": 33808297.0, "reward": 0.03385056555271149, "reward_std": 0.03981088474392891, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0028895149007439613, "rewards/logprob_reward/std": 0.00452409265562892, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 753.9375, "completions/mean_terminated_length": 715.357177734375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 3.669753086419753, "grad_norm": 1.7943247066398242, "kl": 0.24609375, "learning_rate": 8.762848427807882e-08, "loss": -0.0357, "num_tokens": 33839319.0, "reward": 0.042798519134521484, "reward_std": 0.03934928774833679, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0024150179233402014, "rewards/logprob_reward/std": 0.005606563296169043, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 781.46875, "completions/mean_terminated_length": 765.300048828125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 3.6728395061728394, "grad_norm": 1.8116751313581374, "kl": 0.2315673828125, "learning_rate": 8.724867173146633e-08, "loss": -0.0796, "num_tokens": 33871646.0, "reward": 0.027132708579301834, "reward_std": 0.03633095324039459, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.002369676483795047, "rewards/logprob_reward/std": 0.0037503838539123535, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 768.625, "completions/mean_terminated_length": 742.2069091796875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.675925925925926, "grad_norm": 2.0130064927685023, "kl": 0.223876953125, "learning_rate": 8.686951002116111e-08, "loss": -0.2278, "num_tokens": 33902506.0, "reward": 0.045343153178691864, "reward_std": 0.0349864661693573, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017701697070151567, "rewards/logprob_reward/std": 0.0032237397972494364, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 754.375, "completions/mean_terminated_length": 715.857177734375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.6790123456790123, "grad_norm": 2.337435117630004, "kl": 0.206298828125, "learning_rate": 8.649100066341614e-08, "loss": -0.282, "num_tokens": 33932814.0, "reward": 0.05505530908703804, "reward_std": 0.0429777055978775, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0056170071475207806, "rewards/logprob_reward/std": 0.011483278125524521, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 783.0, "completions/mean_terminated_length": 758.0689697265625, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 3.682098765432099, "grad_norm": 1.8540610444487833, "kl": 0.2149658203125, "learning_rate": 8.611314517187584e-08, "loss": -0.1743, "num_tokens": 33964362.0, "reward": 0.03965158388018608, "reward_std": 0.04772069305181503, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0023906445130705833, "rewards/logprob_reward/std": 0.002835685620084405, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 724.46875, "completions/mean_terminated_length": 714.8064575195312, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 3.685185185185185, "grad_norm": 2.0576603816592525, "kl": 0.223388671875, "learning_rate": 8.573594505756982e-08, "loss": -0.1572, "num_tokens": 33993769.0, "reward": 0.03854009136557579, "reward_std": 0.04036765918135643, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0011556560639292002, "rewards/logprob_reward/std": 0.002103740582242608, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 751.46875, "completions/mean_terminated_length": 742.6773681640625, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 3.6882716049382713, "grad_norm": 2.183602194642444, "kl": 0.21484375, "learning_rate": 8.535940182890685e-08, "loss": -0.2377, "num_tokens": 34024244.0, "reward": 0.04226265847682953, "reward_std": 0.0493067130446434, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0018196202581748366, "rewards/logprob_reward/std": 0.003216702723875642, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 705.4375, "completions/mean_terminated_length": 684.2000122070312, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.691358024691358, "grad_norm": 1.9032977973811886, "kl": 0.231201171875, "learning_rate": 8.498351699166889e-08, "loss": -0.1241, "num_tokens": 34053138.0, "reward": 0.042191024869680405, "reward_std": 0.043851274996995926, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001740025938488543, "rewards/logprob_reward/std": 0.0024247795809060335, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 742.96875, "completions/mean_terminated_length": 742.96875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 3.6944444444444446, "grad_norm": 1.9042166827183096, "kl": 0.2301025390625, "learning_rate": 8.460829204900483e-08, "loss": -0.0344, "num_tokens": 34083621.0, "reward": 0.04693237692117691, "reward_std": 0.03480307385325432, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003535972908139229, "rewards/logprob_reward/std": 0.004171703942120075, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 807.34375, "completions/mean_terminated_length": 757.34619140625, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 3.697530864197531, "grad_norm": 1.9783222280287667, "kl": 0.213623046875, "learning_rate": 8.423372850142482e-08, "loss": -0.1585, "num_tokens": 34116080.0, "reward": 0.029560063034296036, "reward_std": 0.034590378403663635, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0015945147024467587, "rewards/logprob_reward/std": 0.00318322260864079, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 664.75, "completions/mean_terminated_length": 664.75, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 3.700617283950617, "grad_norm": 1.9085474594234189, "kl": 0.2432861328125, "learning_rate": 8.385982784679416e-08, "loss": -0.071, "num_tokens": 34143472.0, "reward": 0.04795948788523674, "reward_std": 0.04828812927007675, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0012049854267388582, "rewards/logprob_reward/std": 0.0014321227790787816, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 676.96875, "completions/mean_terminated_length": 653.8333740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 3.7037037037037037, "grad_norm": 2.9150387049016016, "kl": 0.2626953125, "learning_rate": 8.348659158032723e-08, "loss": -0.2794, "num_tokens": 34171403.0, "reward": 0.03862811625003815, "reward_std": 0.03850669786334038, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012534614652395248, "rewards/logprob_reward/std": 0.0016249925829470158, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 700.84375, "completions/mean_terminated_length": 667.413818359375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.7067901234567904, "grad_norm": 2.0905855134396663, "kl": 0.2216796875, "learning_rate": 8.311402119458138e-08, "loss": -0.056, "num_tokens": 34200246.0, "reward": 0.04225610941648483, "reward_std": 0.04913446307182312, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0018123441841453314, "rewards/logprob_reward/std": 0.003109812270849943, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 755.65625, "completions/mean_terminated_length": 737.7667236328125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.7098765432098766, "grad_norm": 2.453056232109461, "kl": 0.2198486328125, "learning_rate": 8.274211817945135e-08, "loss": -0.0325, "num_tokens": 34230919.0, "reward": 0.03255268186330795, "reward_std": 0.04033456742763519, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014474234776571393, "rewards/logprob_reward/std": 0.0026141770649701357, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 736.46875, "completions/mean_terminated_length": 736.46875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 3.712962962962963, "grad_norm": 2.251102088294818, "kl": 0.226806640625, "learning_rate": 8.237088402216297e-08, "loss": -0.1966, "num_tokens": 34261242.0, "reward": 0.024148931726813316, "reward_std": 0.03446749225258827, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0025265919975936413, "rewards/logprob_reward/std": 0.0057501113042235374, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 692.03125, "completions/mean_terminated_length": 681.3225708007812, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.7160493827160495, "grad_norm": 1.9580968722110697, "kl": 0.2088623046875, "learning_rate": 8.20003202072674e-08, "loss": -0.2147, "num_tokens": 34290075.0, "reward": 0.055211372673511505, "reward_std": 0.049151401966810226, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0023181959986686707, "rewards/logprob_reward/std": 0.0029256767593324184, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 669.65625, "completions/mean_terminated_length": 669.65625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 3.7191358024691357, "grad_norm": 1.8875933590230973, "kl": 0.233154296875, "learning_rate": 8.163042821663507e-08, "loss": -0.2322, "num_tokens": 34317568.0, "reward": 0.04830477386713028, "reward_std": 0.03416569530963898, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001588636077940464, "rewards/logprob_reward/std": 0.002541485708206892, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 761.53125, "completions/mean_terminated_length": 744.0333862304688, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 3.7222222222222223, "grad_norm": 1.548060413554958, "kl": 0.20703125, "learning_rate": 8.126120952944987e-08, "loss": -0.0375, "num_tokens": 34348365.0, "reward": 0.049736231565475464, "reward_std": 0.04507913067936897, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003179146908223629, "rewards/logprob_reward/std": 0.006874183192849159, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 679.59375, "completions/mean_terminated_length": 668.4838256835938, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 3.7253086419753085, "grad_norm": 1.722266109401197, "kl": 0.231689453125, "learning_rate": 8.089266562220312e-08, "loss": -0.0794, "num_tokens": 34376276.0, "reward": 0.049007855355739594, "reward_std": 0.03966151177883148, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0023698355071246624, "rewards/logprob_reward/std": 0.003103325143456459, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 725.4375, "completions/mean_terminated_length": 715.8064575195312, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 3.728395061728395, "grad_norm": 2.1382150265884814, "kl": 0.2340087890625, "learning_rate": 8.052479796868784e-08, "loss": -0.1193, "num_tokens": 34406058.0, "reward": 0.046502694487571716, "reward_std": 0.041052673012018204, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0030585499480366707, "rewards/logprob_reward/std": 0.0041487994603812695, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 744.84375, "completions/mean_terminated_length": 726.2333984375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 3.7314814814814814, "grad_norm": 2.07009566028124, "kl": 0.23486328125, "learning_rate": 8.015760803999244e-08, "loss": -0.0924, "num_tokens": 34436981.0, "reward": 0.03666501119732857, "reward_std": 0.0349053293466568, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0025444573257118464, "rewards/logprob_reward/std": 0.0035742786712944508, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 661.53125, "completions/mean_terminated_length": 649.8386840820312, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 3.734567901234568, "grad_norm": 2.7000418451746033, "kl": 0.236328125, "learning_rate": 7.979109730449552e-08, "loss": -0.2496, "num_tokens": 34464586.0, "reward": 0.04257820546627045, "reward_std": 0.046489086002111435, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0021702260710299015, "rewards/logprob_reward/std": 0.002300801919773221, "step": 1210 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 747.78125, "completions/mean_terminated_length": 696.629638671875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 3.7376543209876543, "grad_norm": 1.8317555702504096, "kl": NaN, "learning_rate": 7.942526722785927e-08, "loss": -0.066, "num_tokens": 34495327.0, "reward": 0.04541856423020363, "reward_std": 0.04059213399887085, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0018539587035775185, "rewards/logprob_reward/std": 0.002871278440579772, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 743.21875, "completions/mean_terminated_length": 714.1724243164062, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.7407407407407405, "grad_norm": 2.0344929358154697, "kl": 0.2208251953125, "learning_rate": 7.906011927302417e-08, "loss": -0.0623, "num_tokens": 34525486.0, "reward": 0.04631774500012398, "reward_std": 0.03879896551370621, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002853048499673605, "rewards/logprob_reward/std": 0.0040617105551064014, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 693.21875, "completions/mean_terminated_length": 682.54833984375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 3.743827160493827, "grad_norm": 2.400078149946603, "kl": 0.259765625, "learning_rate": 7.869565490020288e-08, "loss": -0.2001, "num_tokens": 34554457.0, "reward": 0.03221950680017471, "reward_std": 0.04514000564813614, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0010772309033200145, "rewards/logprob_reward/std": 0.0015260858926922083, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 748.8125, "completions/mean_terminated_length": 697.8518676757812, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 3.746913580246914, "grad_norm": 2.865671207904787, "kl": 0.248046875, "learning_rate": 7.833187556687443e-08, "loss": -0.1345, "num_tokens": 34585211.0, "reward": 0.03613856062293053, "reward_std": 0.033397313207387924, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001959512010216713, "rewards/logprob_reward/std": 0.004207263700664043, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 721.59375, "completions/mean_terminated_length": 721.59375, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 3.75, "grad_norm": 1.7862320520107289, "kl": 0.2359619140625, "learning_rate": 7.796878272777835e-08, "loss": -0.1372, "num_tokens": 34614774.0, "reward": 0.0359145887196064, "reward_std": 0.041065141558647156, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0017106522573158145, "rewards/logprob_reward/std": 0.002327613066881895, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 700.65625, "completions/mean_terminated_length": 690.2257690429688, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.753086419753086, "grad_norm": 1.6872003359318692, "kl": 0.2388916015625, "learning_rate": 7.760637783490906e-08, "loss": -0.0253, "num_tokens": 34643783.0, "reward": 0.0515599325299263, "reward_std": 0.04761520400643349, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0017332553397864103, "rewards/logprob_reward/std": 0.0022615049965679646, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 659.90625, "completions/mean_terminated_length": 635.6333618164062, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 3.756172839506173, "grad_norm": 1.8086629239116327, "kl": 0.2423095703125, "learning_rate": 7.724466233750961e-08, "loss": -0.1475, "num_tokens": 34670932.0, "reward": 0.05709145590662956, "reward_std": 0.04678111895918846, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.000934947922360152, "rewards/logprob_reward/std": 0.0011365532409399748, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 713.5, "completions/mean_terminated_length": 692.800048828125, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 3.7592592592592595, "grad_norm": 1.695154414779429, "kl": 0.228515625, "learning_rate": 7.688363768206651e-08, "loss": -0.0119, "num_tokens": 34700416.0, "reward": 0.03903389722108841, "reward_std": 0.03349316865205765, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0017043291591107845, "rewards/logprob_reward/std": 0.0036165001802146435, "step": 1218 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 785.09375, "completions/mean_terminated_length": 750.9642944335938, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 3.7623456790123457, "grad_norm": 1.7150936142135094, "kl": NaN, "learning_rate": 7.652330531230344e-08, "loss": -0.1117, "num_tokens": 34732815.0, "reward": 0.023483876138925552, "reward_std": 0.019030466675758362, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0017876423662528396, "rewards/logprob_reward/std": 0.0036532750818878412, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 723.71875, "completions/mean_terminated_length": 723.71875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 3.765432098765432, "grad_norm": 1.827179034852, "kl": 0.2166748046875, "learning_rate": 7.616366666917571e-08, "loss": -0.0628, "num_tokens": 34762474.0, "reward": 0.043150365352630615, "reward_std": 0.04825136065483093, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0028059615287929773, "rewards/logprob_reward/std": 0.00501100393012166, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 680.1875, "completions/mean_terminated_length": 680.1875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.7685185185185186, "grad_norm": 2.2984840229526817, "kl": 0.2330322265625, "learning_rate": 7.580472319086442e-08, "loss": -0.1623, "num_tokens": 34790476.0, "reward": 0.05447046086192131, "reward_std": 0.05344248563051224, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0014949534088373184, "rewards/logprob_reward/std": 0.001674546510912478, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 763.09375, "completions/mean_terminated_length": 725.8214721679688, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.771604938271605, "grad_norm": 1.9419510096840515, "kl": 0.348876953125, "learning_rate": 7.544647631277085e-08, "loss": -0.2638, "num_tokens": 34821555.0, "reward": 0.04223502427339554, "reward_std": 0.029497407376766205, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017889136215671897, "rewards/logprob_reward/std": 0.002690923633053899, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 767.96875, "completions/mean_terminated_length": 682.625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 3.7746913580246915, "grad_norm": 4.038824756495392, "kl": 0.26171875, "learning_rate": 7.508892746751034e-08, "loss": -0.4471, "num_tokens": 34853166.0, "reward": 0.020810972899198532, "reward_std": 0.030478069558739662, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0022899697069078684, "rewards/logprob_reward/std": 0.005298912525177002, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 724.40625, "completions/mean_terminated_length": 714.741943359375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 3.7777777777777777, "grad_norm": 2.502944826698859, "kl": 0.2000732421875, "learning_rate": 7.473207808490701e-08, "loss": -0.4054, "num_tokens": 34883583.0, "reward": 0.036741383373737335, "reward_std": 0.04447250813245773, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.002629311755299568, "rewards/logprob_reward/std": 0.0031205445993691683, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 716.78125, "completions/mean_terminated_length": 706.8709716796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 3.7808641975308643, "grad_norm": 1.8493755286529348, "kl": 0.2239990234375, "learning_rate": 7.437592959198796e-08, "loss": -0.2095, "num_tokens": 34913224.0, "reward": 0.045536018908023834, "reward_std": 0.04138926416635513, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00198446586728096, "rewards/logprob_reward/std": 0.0033418447710573673, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 732.75, "completions/mean_terminated_length": 691.1428833007812, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 3.7839506172839505, "grad_norm": 1.9340080967606625, "kl": 0.2315673828125, "learning_rate": 7.402048341297718e-08, "loss": -0.004, "num_tokens": 34942832.0, "reward": 0.03293833136558533, "reward_std": 0.04251699149608612, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0018759226659312844, "rewards/logprob_reward/std": 0.0024454020895063877, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 673.4375, "completions/mean_terminated_length": 662.1290283203125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.787037037037037, "grad_norm": 2.113541289678668, "kl": 0.2188720703125, "learning_rate": 7.36657409692903e-08, "loss": -0.0908, "num_tokens": 34970450.0, "reward": 0.03614543378353119, "reward_std": 0.0392109677195549, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0019671465270221233, "rewards/logprob_reward/std": 0.0022074554581195116, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 713.09375, "completions/mean_terminated_length": 680.9310302734375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 3.7901234567901234, "grad_norm": 2.2451632956676395, "kl": 0.2337646484375, "learning_rate": 7.331170367952874e-08, "loss": -0.1257, "num_tokens": 34999701.0, "reward": 0.03641784191131592, "reward_std": 0.041536085307598114, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.002269826363772154, "rewards/logprob_reward/std": 0.0030190160032361746, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 759.25, "completions/mean_terminated_length": 710.2222290039062, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.7932098765432096, "grad_norm": 1.790719025447086, "kl": 0.2132568359375, "learning_rate": 7.295837295947404e-08, "loss": -0.0636, "num_tokens": 35030449.0, "reward": 0.04219186305999756, "reward_std": 0.041881777346134186, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017409594729542732, "rewards/logprob_reward/std": 0.0025964633096009493, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 720.75, "completions/mean_terminated_length": 700.5333862304688, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.7962962962962963, "grad_norm": 2.109531348390788, "kl": 0.244873046875, "learning_rate": 7.260575022208218e-08, "loss": -0.2573, "num_tokens": 35059809.0, "reward": 0.0454866886138916, "reward_std": 0.04775111377239227, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0019296524114906788, "rewards/logprob_reward/std": 0.0025852415710687637, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 674.25, "completions/mean_terminated_length": 674.25, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 3.799382716049383, "grad_norm": 2.0918421209421596, "kl": 0.245361328125, "learning_rate": 7.225383687747789e-08, "loss": -0.0674, "num_tokens": 35087617.0, "reward": 0.06574690341949463, "reward_std": 0.047928690910339355, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0036076600663363934, "rewards/logprob_reward/std": 0.004685068968683481, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 723.0, "completions/mean_terminated_length": 691.862060546875, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.802469135802469, "grad_norm": 2.1232772554206014, "kl": 0.2265625, "learning_rate": 7.190263433294913e-08, "loss": -0.255, "num_tokens": 35117245.0, "reward": 0.04533160477876663, "reward_std": 0.0413593128323555, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017573356162756681, "rewards/logprob_reward/std": 0.0032630939967930317, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 743.125, "completions/mean_terminated_length": 714.0689697265625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.8055555555555554, "grad_norm": 2.0827352450174166, "kl": 0.2052001953125, "learning_rate": 7.155214399294146e-08, "loss": -0.1668, "num_tokens": 35147805.0, "reward": 0.033200979232788086, "reward_std": 0.04059072211384773, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0021677513141185045, "rewards/logprob_reward/std": 0.0030180695466697216, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 652.6875, "completions/mean_terminated_length": 627.933349609375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.808641975308642, "grad_norm": 1.965401530823781, "kl": 0.2252197265625, "learning_rate": 7.120236725905215e-08, "loss": -0.2894, "num_tokens": 35174631.0, "reward": 0.06410109996795654, "reward_std": 0.04031594097614288, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0017790001584216952, "rewards/logprob_reward/std": 0.0020099529065191746, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 713.96875, "completions/mean_terminated_length": 669.6785888671875, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 3.8117283950617287, "grad_norm": 2.2498048608172185, "kl": 0.25146484375, "learning_rate": 7.085330553002494e-08, "loss": -0.3914, "num_tokens": 35203822.0, "reward": 0.04209384322166443, "reward_std": 0.0550503246486187, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0016320511931553483, "rewards/logprob_reward/std": 0.003687640419229865, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 746.28125, "completions/mean_terminated_length": 694.8518676757812, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 3.814814814814815, "grad_norm": 2.1615279386635815, "kl": 0.2513427734375, "learning_rate": 7.05049602017444e-08, "loss": -0.2149, "num_tokens": 35234335.0, "reward": 0.05331094563007355, "reward_std": 0.0348031111061573, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0036788287106901407, "rewards/logprob_reward/std": 0.005828774534165859, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 688.15625, "completions/mean_terminated_length": 665.7667236328125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 3.817901234567901, "grad_norm": 2.809253845645748, "kl": 0.26416015625, "learning_rate": 7.015733266722993e-08, "loss": -0.2882, "num_tokens": 35263092.0, "reward": 0.03941737860441208, "reward_std": 0.03392641246318817, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00213041715323925, "rewards/logprob_reward/std": 0.003829734865576029, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 695.28125, "completions/mean_terminated_length": 673.36669921875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 3.8209876543209877, "grad_norm": 2.3128877488742745, "kl": 0.2628173828125, "learning_rate": 6.981042431663075e-08, "loss": -0.3074, "num_tokens": 35291793.0, "reward": 0.05240838974714279, "reward_std": 0.05356854945421219, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0026759887114167213, "rewards/logprob_reward/std": 0.005153441336005926, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 699.75, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 3.824074074074074, "grad_norm": 1.985619890073846, "kl": 0.2230224609375, "learning_rate": 6.946423653722006e-08, "loss": 0.0096, "num_tokens": 35320597.0, "reward": 0.046561695635318756, "reward_std": 0.055780161172151566, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0031241043470799923, "rewards/logprob_reward/std": 0.004135539289563894, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 667.46875, "completions/mean_terminated_length": 655.9677124023438, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 3.8271604938271606, "grad_norm": 1.925630907400574, "kl": 0.2232666015625, "learning_rate": 6.911877071338942e-08, "loss": -0.0786, "num_tokens": 35348544.0, "reward": 0.04542498663067818, "reward_std": 0.05450732633471489, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0018610956612974405, "rewards/logprob_reward/std": 0.0020276005379855633, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 723.125, "completions/mean_terminated_length": 703.0667114257812, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 3.830246913580247, "grad_norm": 2.203898023208827, "kl": 0.22216796875, "learning_rate": 6.877402822664352e-08, "loss": -0.2233, "num_tokens": 35377924.0, "reward": 0.048514775931835175, "reward_std": 0.05417148023843765, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018219701014459133, "rewards/logprob_reward/std": 0.002444783691316843, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 782.09375, "completions/mean_terminated_length": 726.269287109375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.8333333333333335, "grad_norm": 2.114242025985549, "kl": 0.203369140625, "learning_rate": 6.843001045559416e-08, "loss": -0.1085, "num_tokens": 35409619.0, "reward": 0.045830316841602325, "reward_std": 0.040851891040802, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0023114606738090515, "rewards/logprob_reward/std": 0.0027213245630264282, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 801.90625, "completions/mean_terminated_length": 760.7777709960938, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 3.8364197530864197, "grad_norm": 3.3803453345376386, "kl": 0.2239990234375, "learning_rate": 6.808671877595524e-08, "loss": -0.2605, "num_tokens": 35442532.0, "reward": 0.013396810740232468, "reward_std": 0.02067142352461815, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.33601075410842896, "rewards/logprob_reward/mean": 0.000996455317363143, "rewards/logprob_reward/std": 0.0018346697324886918, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 703.75, "completions/mean_terminated_length": 703.75, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.8395061728395063, "grad_norm": 2.080147097218789, "kl": 0.2276611328125, "learning_rate": 6.774415456053697e-08, "loss": -0.2862, "num_tokens": 35471864.0, "reward": 0.06176084280014038, "reward_std": 0.054994165897369385, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0026509352028369904, "rewards/logprob_reward/std": 0.004046175163239241, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 707.03125, "completions/mean_terminated_length": 696.8064575195312, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.8425925925925926, "grad_norm": 2.0884837740930595, "kl": 0.24169921875, "learning_rate": 6.740231917924053e-08, "loss": -0.0457, "num_tokens": 35500845.0, "reward": 0.04472806304693222, "reward_std": 0.047552864998579025, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0010867358651012182, "rewards/logprob_reward/std": 0.0016396671999245882, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 733.6875, "completions/mean_terminated_length": 703.6551513671875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 3.8456790123456788, "grad_norm": 2.299257104666202, "kl": 0.2156982421875, "learning_rate": 6.706121399905245e-08, "loss": -0.1831, "num_tokens": 35531067.0, "reward": 0.045227088034152985, "reward_std": 0.053179167211055756, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001641207723878324, "rewards/logprob_reward/std": 0.002661538077518344, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 707.5625, "completions/mean_terminated_length": 662.357177734375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 3.8487654320987654, "grad_norm": 2.7574531391395247, "kl": 0.249755859375, "learning_rate": 6.672084038403927e-08, "loss": -0.261, "num_tokens": 35560561.0, "reward": 0.04215136915445328, "reward_std": 0.048834048211574554, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0016959626227617264, "rewards/logprob_reward/std": 0.0026369316037744284, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 702.78125, "completions/mean_terminated_length": 669.5516967773438, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 3.851851851851852, "grad_norm": 2.3353384847628798, "kl": 0.2225341796875, "learning_rate": 6.638119969534201e-08, "loss": -0.2779, "num_tokens": 35589510.0, "reward": 0.04260419309139252, "reward_std": 0.05629081279039383, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0021991045214235783, "rewards/logprob_reward/std": 0.002575478982180357, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 730.53125, "completions/mean_terminated_length": 700.1724243164062, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 3.8549382716049383, "grad_norm": 1.8179618245074758, "kl": 0.218505859375, "learning_rate": 6.604229329117064e-08, "loss": -0.114, "num_tokens": 35619435.0, "reward": 0.048428699374198914, "reward_std": 0.04567597061395645, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001726333750411868, "rewards/logprob_reward/std": 0.0032993017230182886, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 673.625, "completions/mean_terminated_length": 637.3793334960938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 3.8580246913580245, "grad_norm": 2.1642770276808836, "kl": 0.25, "learning_rate": 6.570412252679894e-08, "loss": -0.1348, "num_tokens": 35647623.0, "reward": 0.04711909219622612, "reward_std": 0.040639057755470276, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003743434324860573, "rewards/logprob_reward/std": 0.00589523883536458, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 661.28125, "completions/mean_terminated_length": 649.5806274414062, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 3.861111111111111, "grad_norm": 1.9804081329675691, "kl": 0.23291015625, "learning_rate": 6.536668875455869e-08, "loss": -0.1854, "num_tokens": 35675420.0, "reward": 0.05114155262708664, "reward_std": 0.047681014984846115, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012683928944170475, "rewards/logprob_reward/std": 0.0017212912207469344, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 689.96875, "completions/mean_terminated_length": 679.1935424804688, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 3.8641975308641974, "grad_norm": 2.0254402888670957, "kl": 0.225830078125, "learning_rate": 6.502999332383465e-08, "loss": -0.0254, "num_tokens": 35703959.0, "reward": 0.06178828328847885, "reward_std": 0.0480516254901886, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002681422047317028, "rewards/logprob_reward/std": 0.0036414964124560356, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 700.59375, "completions/mean_terminated_length": 690.1612548828125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 3.867283950617284, "grad_norm": 2.772763898119912, "kl": 0.236328125, "learning_rate": 6.469403758105894e-08, "loss": -0.3749, "num_tokens": 35733110.0, "reward": 0.04874895513057709, "reward_std": 0.04067114740610123, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002082169521600008, "rewards/logprob_reward/std": 0.0030771277379244566, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 696.34375, "completions/mean_terminated_length": 685.774169921875, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 3.8703703703703702, "grad_norm": 2.4844023536789477, "kl": 0.23388671875, "learning_rate": 6.435882286970556e-08, "loss": -0.2869, "num_tokens": 35762161.0, "reward": 0.029596639797091484, "reward_std": 0.0414566695690155, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0016351554077118635, "rewards/logprob_reward/std": 0.0029583487194031477, "step": 1254 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 708.625, "completions/mean_terminated_length": 698.4515991210938, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 3.873456790123457, "grad_norm": 2.1787411189096044, "kl": NaN, "learning_rate": 6.402435053028538e-08, "loss": -0.1782, "num_tokens": 35791397.0, "reward": 0.04876616224646568, "reward_std": 0.049501799046993256, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0021012898068875074, "rewards/logprob_reward/std": 0.003905676770955324, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 762.4375, "completions/mean_terminated_length": 735.3793334960938, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 3.876543209876543, "grad_norm": 2.03078209328597, "kl": 0.2298583984375, "learning_rate": 6.369062190034036e-08, "loss": -0.1787, "num_tokens": 35822411.0, "reward": 0.04866570979356766, "reward_std": 0.047181855887174606, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019896789453923702, "rewards/logprob_reward/std": 0.0026977788656949997, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 735.96875, "completions/mean_terminated_length": 694.8214721679688, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.8796296296296298, "grad_norm": 2.2449930248623526, "kl": 0.2501220703125, "learning_rate": 6.335763831443847e-08, "loss": -0.1564, "num_tokens": 35852934.0, "reward": 0.03295619413256645, "reward_std": 0.04246683791279793, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0018957715947180986, "rewards/logprob_reward/std": 0.004091357346624136, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 751.1875, "completions/mean_terminated_length": 700.6666870117188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 3.882716049382716, "grad_norm": 2.0406042596363556, "kl": 0.2218017578125, "learning_rate": 6.302540110416837e-08, "loss": -0.182, "num_tokens": 35883040.0, "reward": 0.0543404221534729, "reward_std": 0.04171931743621826, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001350468141026795, "rewards/logprob_reward/std": 0.0019470700062811375, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 705.46875, "completions/mean_terminated_length": 684.2333984375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 3.8858024691358026, "grad_norm": 2.393062096676376, "kl": 0.2298583984375, "learning_rate": 6.269391159813372e-08, "loss": -0.171, "num_tokens": 35912335.0, "reward": 0.051343321800231934, "reward_std": 0.04753294587135315, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0014925784198567271, "rewards/logprob_reward/std": 0.002065872075036168, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 686.4375, "completions/mean_terminated_length": 663.933349609375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 3.888888888888889, "grad_norm": 2.876764353542928, "kl": 0.2431640625, "learning_rate": 6.236317112194844e-08, "loss": -0.2222, "num_tokens": 35940529.0, "reward": 0.04829934984445572, "reward_std": 0.04704994708299637, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0015826087910681963, "rewards/logprob_reward/std": 0.0027521736919879913, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 719.40625, "completions/mean_terminated_length": 687.8965454101562, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 3.8919753086419755, "grad_norm": 2.183774586870629, "kl": 0.23486328125, "learning_rate": 6.203318099823094e-08, "loss": -0.1403, "num_tokens": 35970066.0, "reward": 0.0399702787399292, "reward_std": 0.04147706180810928, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0027447554748505354, "rewards/logprob_reward/std": 0.004764461424201727, "step": 1261 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 752.625, "completions/mean_terminated_length": 724.5516967773438, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 3.8950617283950617, "grad_norm": 1.8409742201490922, "kl": NaN, "learning_rate": 6.17039425465991e-08, "loss": -0.098, "num_tokens": 36001174.0, "reward": 0.03989966958761215, "reward_std": 0.027711469680070877, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0026663001626729965, "rewards/logprob_reward/std": 0.0033170164097100496, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 723.84375, "completions/mean_terminated_length": 714.1612548828125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 3.898148148148148, "grad_norm": 2.294005354144012, "kl": 0.2147216796875, "learning_rate": 6.137545708366476e-08, "loss": -0.3552, "num_tokens": 36030573.0, "reward": 0.05225500836968422, "reward_std": 0.04585783928632736, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002505563199520111, "rewards/logprob_reward/std": 0.002747626742348075, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 731.25, "completions/mean_terminated_length": 700.9655151367188, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 3.9012345679012346, "grad_norm": 2.3506752127484782, "kl": 0.2198486328125, "learning_rate": 6.104772592302868e-08, "loss": -0.1969, "num_tokens": 36059917.0, "reward": 0.051935285329818726, "reward_std": 0.04809124022722244, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0021503143943846226, "rewards/logprob_reward/std": 0.002932285889983177, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 697.0625, "completions/mean_terminated_length": 675.2667236328125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 3.9043209876543212, "grad_norm": 2.818576525015284, "kl": 0.2191162109375, "learning_rate": 6.072075037527519e-08, "loss": -0.2663, "num_tokens": 36088667.0, "reward": 0.05194810777902603, "reward_std": 0.055018581449985504, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0021645622327923775, "rewards/logprob_reward/std": 0.003375524654984474, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 736.9375, "completions/mean_terminated_length": 717.800048828125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.9074074074074074, "grad_norm": 1.935271414649702, "kl": 0.2105712890625, "learning_rate": 6.039453174796699e-08, "loss": -0.0688, "num_tokens": 36118697.0, "reward": 0.05915181338787079, "reward_std": 0.055742938071489334, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0032242366578429937, "rewards/logprob_reward/std": 0.006154841743409634, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 736.34375, "completions/mean_terminated_length": 717.1666870117188, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 3.9104938271604937, "grad_norm": 2.5734633842286767, "kl": 0.2435302734375, "learning_rate": 6.006907134563973e-08, "loss": -0.3754, "num_tokens": 36148816.0, "reward": 0.03280571475625038, "reward_std": 0.04166591539978981, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0017285719513893127, "rewards/logprob_reward/std": 0.0027676033787429333, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 702.84375, "completions/mean_terminated_length": 681.433349609375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 3.9135802469135803, "grad_norm": 2.0594885812245747, "kl": 0.23095703125, "learning_rate": 5.974437046979711e-08, "loss": -0.1846, "num_tokens": 36177675.0, "reward": 0.054407186806201935, "reward_std": 0.039308175444602966, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001424653921276331, "rewards/logprob_reward/std": 0.001887391204945743, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 715.125, "completions/mean_terminated_length": 715.125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 3.9166666666666665, "grad_norm": 2.8789717930251757, "kl": 0.21484375, "learning_rate": 5.9420430418905435e-08, "loss": -0.3883, "num_tokens": 36206771.0, "reward": 0.03529440611600876, "reward_std": 0.046531401574611664, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0010215596994385123, "rewards/logprob_reward/std": 0.0017232416430488229, "step": 1269 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 750.03125, "completions/mean_terminated_length": 721.6896362304688, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.919753086419753, "grad_norm": 1.6858459763404439, "kl": NaN, "learning_rate": 5.909725248838854e-08, "loss": -0.2047, "num_tokens": 36237300.0, "reward": 0.03822425380349159, "reward_std": 0.0403776541352272, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0008047227747738361, "rewards/logprob_reward/std": 0.0015066579217091203, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 735.65625, "completions/mean_terminated_length": 694.4642944335938, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 3.9228395061728394, "grad_norm": 1.9435699110560907, "kl": 0.2420654296875, "learning_rate": 5.877483797062255e-08, "loss": -0.0681, "num_tokens": 36267285.0, "reward": 0.03239945322275162, "reward_std": 0.03997500240802765, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0012771696783602238, "rewards/logprob_reward/std": 0.0024854436051100492, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 768.9375, "completions/mean_terminated_length": 742.5516967773438, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 3.925925925925926, "grad_norm": 1.6475202535148172, "kl": 0.196044921875, "learning_rate": 5.845318815493069e-08, "loss": -0.1046, "num_tokens": 36298643.0, "reward": 0.042463794350624084, "reward_std": 0.03438322991132736, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0020431033335626125, "rewards/logprob_reward/std": 0.0030193571001291275, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 727.84375, "completions/mean_terminated_length": 708.1000366210938, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 3.9290123456790123, "grad_norm": 1.9309150674595645, "kl": 0.2166748046875, "learning_rate": 5.813230432757829e-08, "loss": -0.1629, "num_tokens": 36328302.0, "reward": 0.05194342881441116, "reward_std": 0.040441401302814484, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002159362193197012, "rewards/logprob_reward/std": 0.0026275431737303734, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 735.5, "completions/mean_terminated_length": 735.5, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 3.932098765432099, "grad_norm": 2.143376594191243, "kl": 0.2288818359375, "learning_rate": 5.781218777176744e-08, "loss": -0.0411, "num_tokens": 36358698.0, "reward": 0.04697496443986893, "reward_std": 0.028916679322719574, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003583295736461878, "rewards/logprob_reward/std": 0.005220974329859018, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 722.857177734375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 3.935185185185185, "grad_norm": 1.7644029435866255, "kl": 0.2239990234375, "learning_rate": 5.749283976763186e-08, "loss": -0.0994, "num_tokens": 36389534.0, "reward": 0.042009782046079636, "reward_std": 0.045120202004909515, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015386473387479782, "rewards/logprob_reward/std": 0.0025592546444386244, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 768.53125, "completions/mean_terminated_length": 732.0357666015625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.9382716049382713, "grad_norm": 2.140303836367389, "kl": 0.2249755859375, "learning_rate": 5.717426159223204e-08, "loss": -0.1209, "num_tokens": 36420583.0, "reward": 0.026612577959895134, "reward_std": 0.04570880904793739, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0017917529912665486, "rewards/logprob_reward/std": 0.004275859799236059, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 795.875, "completions/mean_terminated_length": 753.629638671875, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 3.941358024691358, "grad_norm": 1.719896804050668, "kl": 0.22021484375, "learning_rate": 5.685645451954976e-08, "loss": -0.062, "num_tokens": 36453515.0, "reward": 0.041013821959495544, "reward_std": 0.04107628017663956, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.003904248122125864, "rewards/logprob_reward/std": 0.006139431614428759, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 828.4375, "completions/mean_terminated_length": 783.3077392578125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 3.9444444444444446, "grad_norm": 2.233144436167193, "kl": 0.19189453125, "learning_rate": 5.653941982048333e-08, "loss": -0.1086, "num_tokens": 36486521.0, "reward": 0.048560962080955505, "reward_std": 0.04907785356044769, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001873290748335421, "rewards/logprob_reward/std": 0.003017270006239414, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 736.34375, "completions/mean_terminated_length": 706.586181640625, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 3.947530864197531, "grad_norm": 2.7394252149497937, "kl": 0.198486328125, "learning_rate": 5.6223158762842336e-08, "loss": -0.3063, "num_tokens": 36516664.0, "reward": 0.04506908357143402, "reward_std": 0.05461621284484863, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0014656463172286749, "rewards/logprob_reward/std": 0.0023357095196843147, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 741.84375, "completions/mean_terminated_length": 712.6551513671875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 3.950617283950617, "grad_norm": 2.128755839405949, "kl": 0.2039794921875, "learning_rate": 5.59076726113426e-08, "loss": -0.1533, "num_tokens": 36546963.0, "reward": 0.04205043986439705, "reward_std": 0.054709989577531815, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015838217223063111, "rewards/logprob_reward/std": 0.0026091295294463634, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 700.1875, "completions/mean_terminated_length": 653.9285888671875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 3.9537037037037037, "grad_norm": 2.0218528154180695, "kl": 0.248779296875, "learning_rate": 5.55929626276011e-08, "loss": -0.1079, "num_tokens": 36575569.0, "reward": 0.045324429869651794, "reward_std": 0.041458792984485626, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017493651248514652, "rewards/logprob_reward/std": 0.0029430529102683067, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 803.5, "completions/mean_terminated_length": 772.0000610351562, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 3.9567901234567904, "grad_norm": 1.8581704646255597, "kl": 0.1986083984375, "learning_rate": 5.527903007013099e-08, "loss": 0.0187, "num_tokens": 36607609.0, "reward": 0.03619496896862984, "reward_std": 0.041637122631073, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0020221881568431854, "rewards/logprob_reward/std": 0.0022943217772990465, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 753.125, "completions/mean_terminated_length": 725.1034545898438, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 3.9598765432098766, "grad_norm": 1.8148890962873936, "kl": 0.2056884765625, "learning_rate": 5.4965876194336567e-08, "loss": -0.0677, "num_tokens": 36638517.0, "reward": 0.05739375948905945, "reward_std": 0.04625089839100838, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0047430675476789474, "rewards/logprob_reward/std": 0.006590752396732569, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 738.65625, "completions/mean_terminated_length": 685.8148193359375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 3.962962962962963, "grad_norm": 2.1778843692935577, "kl": 0.2286376953125, "learning_rate": 5.465350225250801e-08, "loss": -0.1401, "num_tokens": 36668778.0, "reward": 0.02642800100147724, "reward_std": 0.03563851863145828, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0015866662142798305, "rewards/logprob_reward/std": 0.002376316348090768, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 684.875, "completions/mean_terminated_length": 673.9354858398438, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 3.9660493827160495, "grad_norm": 2.012474825375182, "kl": 0.2205810546875, "learning_rate": 5.4341909493816786e-08, "loss": -0.3313, "num_tokens": 36697326.0, "reward": 0.05250188708305359, "reward_std": 0.04848799109458923, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002779874252155423, "rewards/logprob_reward/std": 0.00423223152756691, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 705.71875, "completions/mean_terminated_length": 695.4515991210938, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 3.9691358024691357, "grad_norm": 3.177058697222281, "kl": 0.2593994140625, "learning_rate": 5.4031099164310314e-08, "loss": -0.4042, "num_tokens": 36725925.0, "reward": 0.025932677090168, "reward_std": 0.0403948575258255, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0010363080073148012, "rewards/logprob_reward/std": 0.0019922624342143536, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 733.09375, "completions/mean_terminated_length": 691.5357666015625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 3.9722222222222223, "grad_norm": 2.0828686651994848, "kl": 0.233642578125, "learning_rate": 5.372107250690719e-08, "loss": -0.1359, "num_tokens": 36755440.0, "reward": 0.04142270237207413, "reward_std": 0.04101049154996872, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.000886336958501488, "rewards/logprob_reward/std": 0.001125569804571569, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 669.03125, "completions/mean_terminated_length": 645.36669921875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 3.9753086419753085, "grad_norm": 2.113311396315076, "kl": 0.2552490234375, "learning_rate": 5.341183076139219e-08, "loss": -0.1674, "num_tokens": 36783025.0, "reward": 0.0463469922542572, "reward_std": 0.040201857686042786, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0028855446726083755, "rewards/logprob_reward/std": 0.007167118601500988, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 698.25, "completions/mean_terminated_length": 698.25, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 3.978395061728395, "grad_norm": 3.0365780822405837, "kl": 0.224365234375, "learning_rate": 5.310337516441102e-08, "loss": -0.1641, "num_tokens": 36811629.0, "reward": 0.06484922766685486, "reward_std": 0.04804805666208267, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0026102494448423386, "rewards/logprob_reward/std": 0.002949668560177088, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 720.125, "completions/mean_terminated_length": 699.86669921875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 3.9814814814814814, "grad_norm": 2.058199466935219, "kl": 0.22705078125, "learning_rate": 5.279570694946581e-08, "loss": -0.3072, "num_tokens": 36841117.0, "reward": 0.05823837220668793, "reward_std": 0.041144661605358124, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0022093013394623995, "rewards/logprob_reward/std": 0.0028478633612394333, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 738.28125, "completions/mean_terminated_length": 719.2333984375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 3.984567901234568, "grad_norm": 1.9958376821667123, "kl": 0.205078125, "learning_rate": 5.2488827346910015e-08, "loss": -0.0794, "num_tokens": 36871182.0, "reward": 0.05141979455947876, "reward_std": 0.04218154773116112, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0015775496140122414, "rewards/logprob_reward/std": 0.0016699270345270634, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 731.53125, "completions/mean_terminated_length": 712.0333862304688, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 3.9876543209876543, "grad_norm": 2.0667616398771655, "kl": 0.2208251953125, "learning_rate": 5.21827375839432e-08, "loss": -0.0656, "num_tokens": 36901519.0, "reward": 0.0433460995554924, "reward_std": 0.04242153465747833, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0030234456062316895, "rewards/logprob_reward/std": 0.004557882435619831, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 747.4375, "completions/mean_terminated_length": 707.9285888671875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 3.9907407407407405, "grad_norm": 2.3584565129407253, "kl": 0.2208251953125, "learning_rate": 5.187743888460669e-08, "loss": -0.2497, "num_tokens": 36932101.0, "reward": 0.04608844593167305, "reward_std": 0.04741070419549942, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0025982714723795652, "rewards/logprob_reward/std": 0.0033916078973561525, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 729.53125, "completions/mean_terminated_length": 709.9000244140625, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 3.993827160493827, "grad_norm": 1.8885638626101118, "kl": 0.205810546875, "learning_rate": 5.15729324697782e-08, "loss": -0.1506, "num_tokens": 36961610.0, "reward": 0.05801812931895256, "reward_std": 0.04801099747419357, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001964585855603218, "rewards/logprob_reward/std": 0.003837130730971694, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 682.71875, "completions/mean_terminated_length": 671.7096557617188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 3.996913580246914, "grad_norm": 2.107292429850407, "kl": 0.244873046875, "learning_rate": 5.126921955716723e-08, "loss": -0.1056, "num_tokens": 36989857.0, "reward": 0.04603494703769684, "reward_std": 0.04803794249892235, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002538828644901514, "rewards/logprob_reward/std": 0.003044616896659136, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 743.1875, "completions/mean_terminated_length": 734.1290283203125, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.0, "grad_norm": 2.142740634714608, "kl": 0.2208251953125, "learning_rate": 5.096630136131e-08, "loss": -0.1877, "num_tokens": 37019783.0, "reward": 0.03996527940034866, "reward_std": 0.04848010092973709, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0027391978073865175, "rewards/logprob_reward/std": 0.0035382481291890144, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 747.34375, "completions/mean_terminated_length": 728.9000244140625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.003086419753086, "grad_norm": 2.8270253368033162, "kl": 0.214599609375, "learning_rate": 5.0664179093564765e-08, "loss": -0.3346, "num_tokens": 37049810.0, "reward": 0.04774084687232971, "reward_std": 0.05377994477748871, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0009620493510738015, "rewards/logprob_reward/std": 0.0015795612707734108, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 755.09375, "completions/mean_terminated_length": 727.27587890625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.006172839506172, "grad_norm": 2.6042754047768164, "kl": 0.2205810546875, "learning_rate": 5.036285396210685e-08, "loss": -0.2129, "num_tokens": 37080717.0, "reward": 0.02016562968492508, "reward_std": 0.033509328961372375, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0015729216393083334, "rewards/logprob_reward/std": 0.002818315988406539, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 734.0625, "completions/mean_terminated_length": 704.0689697265625, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 4.0092592592592595, "grad_norm": 2.207656836279446, "kl": 0.2156982421875, "learning_rate": 5.0062327171923935e-08, "loss": -0.1427, "num_tokens": 37110499.0, "reward": 0.05492895096540451, "reward_std": 0.05219528079032898, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.00200439291074872, "rewards/logprob_reward/std": 0.002018099185079336, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 706.15625, "completions/mean_terminated_length": 647.2963256835938, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.012345679012346, "grad_norm": 1.8314022728111405, "kl": 0.2392578125, "learning_rate": 4.976259992481097e-08, "loss": -0.1421, "num_tokens": 37139552.0, "reward": 0.05195342376828194, "reward_std": 0.03563021868467331, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0021704700775444508, "rewards/logprob_reward/std": 0.004170523025095463, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 698.75, "completions/mean_terminated_length": 677.0667114257812, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 4.015432098765432, "grad_norm": 1.7378077558782936, "kl": 0.219970703125, "learning_rate": 4.946367341936578e-08, "loss": -0.1484, "num_tokens": 37168652.0, "reward": 0.0628993883728981, "reward_std": 0.04155369848012924, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003915988374501467, "rewards/logprob_reward/std": 0.003737315535545349, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 747.34375, "completions/mean_terminated_length": 738.4193115234375, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 4.018518518518518, "grad_norm": 2.029197340086473, "kl": 0.2149658203125, "learning_rate": 4.916554885098403e-08, "loss": -0.1367, "num_tokens": 37198923.0, "reward": 0.0451449491083622, "reward_std": 0.04612676426768303, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001549942884594202, "rewards/logprob_reward/std": 0.0025856001302599907, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 686.15625, "completions/mean_terminated_length": 651.2069091796875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 4.021604938271605, "grad_norm": 3.3882430774395242, "kl": 0.249267578125, "learning_rate": 4.8868227411854287e-08, "loss": -0.2959, "num_tokens": 37227284.0, "reward": 0.04487443342804909, "reward_std": 0.04793141037225723, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001249370165169239, "rewards/logprob_reward/std": 0.0020175762474536896, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 703.78125, "completions/mean_terminated_length": 670.6551513671875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.0246913580246915, "grad_norm": 1.9171852354819447, "kl": 0.2298583984375, "learning_rate": 4.857171029095364e-08, "loss": -0.0861, "num_tokens": 37256053.0, "reward": 0.048420801758766174, "reward_std": 0.03434731066226959, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0017175577813759446, "rewards/logprob_reward/std": 0.002520865062251687, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 749.71875, "completions/mean_terminated_length": 721.3448486328125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.027777777777778, "grad_norm": 2.2563786099900054, "kl": 0.24560546875, "learning_rate": 4.827599867404261e-08, "loss": -0.266, "num_tokens": 37286588.0, "reward": 0.04632047563791275, "reward_std": 0.05095044896006584, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0028560864739120007, "rewards/logprob_reward/std": 0.0039827702566981316, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 707.28125, "completions/mean_terminated_length": 674.5172119140625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 4.030864197530864, "grad_norm": 1.8861105968798706, "kl": 0.220458984375, "learning_rate": 4.7981093743660634e-08, "loss": -0.0226, "num_tokens": 37315921.0, "reward": 0.052245382219552994, "reward_std": 0.03538450226187706, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002494871150702238, "rewards/logprob_reward/std": 0.002427044091746211, "step": 1306 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 736.28125, "completions/mean_terminated_length": 736.28125, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 4.033950617283951, "grad_norm": 2.401898927796471, "kl": NaN, "learning_rate": 4.768699667912118e-08, "loss": -0.3143, "num_tokens": 37346710.0, "reward": 0.03275703638792038, "reward_std": 0.04243364930152893, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0016744830645620823, "rewards/logprob_reward/std": 0.0034032058902084827, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 684.3125, "completions/mean_terminated_length": 661.6666870117188, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 4.037037037037037, "grad_norm": 1.7161828447911658, "kl": 0.2215576171875, "learning_rate": 4.739370865650716e-08, "loss": -0.1491, "num_tokens": 37374792.0, "reward": 0.05248447135090828, "reward_std": 0.04930894821882248, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002760524395853281, "rewards/logprob_reward/std": 0.007413600105792284, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 756.78125, "completions/mean_terminated_length": 707.2963256835938, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 4.040123456790123, "grad_norm": 2.3968823354359907, "kl": 0.2513427734375, "learning_rate": 4.710123084866602e-08, "loss": -0.144, "num_tokens": 37406137.0, "reward": 0.026766877621412277, "reward_std": 0.03382844850420952, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.001963196322321892, "rewards/logprob_reward/std": 0.0037106643430888653, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 667.375, "completions/mean_terminated_length": 667.375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.04320987654321, "grad_norm": 1.6743847336337871, "kl": 0.2391357421875, "learning_rate": 4.6809564425205286e-08, "loss": -0.0379, "num_tokens": 37434045.0, "reward": 0.04869449511170387, "reward_std": 0.028283096849918365, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0020216605626046658, "rewards/logprob_reward/std": 0.0020715193822979927, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 741.96875, "completions/mean_terminated_length": 723.1666870117188, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 4.046296296296297, "grad_norm": 2.6716410427355513, "kl": 0.2393798828125, "learning_rate": 4.6518710552487796e-08, "loss": -0.271, "num_tokens": 37464308.0, "reward": 0.03659878298640251, "reward_std": 0.03978034108877182, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0024708695709705353, "rewards/logprob_reward/std": 0.003682315582409501, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 680.4375, "completions/mean_terminated_length": 644.8965454101562, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 4.049382716049383, "grad_norm": 2.2261017763514888, "kl": 0.2520751953125, "learning_rate": 4.6228670393627014e-08, "loss": -0.2621, "num_tokens": 37492222.0, "reward": 0.054087117314338684, "reward_std": 0.048690445721149445, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001069014542736113, "rewards/logprob_reward/std": 0.0017158350674435496, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 777.84375, "completions/mean_terminated_length": 721.0385131835938, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.052469135802469, "grad_norm": 1.990666661613414, "kl": 0.2265625, "learning_rate": 4.5939445108482466e-08, "loss": -0.0802, "num_tokens": 37523349.0, "reward": 0.020132973790168762, "reward_std": 0.02895219996571541, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.3965577781200409, "rewards/logprob_reward/mean": 0.0015366380102932453, "rewards/logprob_reward/std": 0.002819651272147894, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 737.90625, "completions/mean_terminated_length": 708.3103637695312, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 4.055555555555555, "grad_norm": 2.089280188424189, "kl": 0.21875, "learning_rate": 4.565103585365479e-08, "loss": -0.1981, "num_tokens": 37553238.0, "reward": 0.057640429586172104, "reward_std": 0.04725799337029457, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0015449193306267262, "rewards/logprob_reward/std": 0.0019049550173804164, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 666.4375, "completions/mean_terminated_length": 666.4375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.058641975308642, "grad_norm": 1.8829293078272262, "kl": 0.2283935546875, "learning_rate": 4.536344378248161e-08, "loss": -0.2137, "num_tokens": 37580592.0, "reward": 0.07006511092185974, "reward_std": 0.045542240142822266, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0014612269587814808, "rewards/logprob_reward/std": 0.001866519683972001, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 719.5625, "completions/mean_terminated_length": 699.2667236328125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.061728395061729, "grad_norm": 2.1688479938586833, "kl": 0.2685546875, "learning_rate": 4.50766700450326e-08, "loss": -0.0683, "num_tokens": 37610270.0, "reward": 0.0513153001666069, "reward_std": 0.04761355742812157, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0014614476822316647, "rewards/logprob_reward/std": 0.0018111151875928044, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 649.3125, "completions/mean_terminated_length": 649.3125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 4.064814814814815, "grad_norm": 2.2932016624952336, "kl": 0.2490234375, "learning_rate": 4.479071578810481e-08, "loss": -0.1529, "num_tokens": 37637104.0, "reward": 0.03536916524171829, "reward_std": 0.04151330143213272, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001104630995541811, "rewards/logprob_reward/std": 0.0021797718945890665, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 663.84375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 4.067901234567901, "grad_norm": 2.001613550630462, "kl": 0.236083984375, "learning_rate": 4.450558215521838e-08, "loss": -0.0023, "num_tokens": 37664911.0, "reward": 0.051741935312747955, "reward_std": 0.05239560455083847, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0019354848191142082, "rewards/logprob_reward/std": 0.0026733444537967443, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 684.90625, "completions/mean_terminated_length": 684.90625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 4.070987654320987, "grad_norm": 2.1952092388258446, "kl": 0.225341796875, "learning_rate": 4.4221270286611765e-08, "loss": -0.2355, "num_tokens": 37693312.0, "reward": 0.036763377487659454, "reward_std": 0.04703374579548836, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0026537529192864895, "rewards/logprob_reward/std": 0.0032631775829941034, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 739.28125, "completions/mean_terminated_length": 720.300048828125, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 4.074074074074074, "grad_norm": 1.8023025242789739, "kl": 0.221923828125, "learning_rate": 4.3937781319237175e-08, "loss": -0.1363, "num_tokens": 37723541.0, "reward": 0.038583770394325256, "reward_std": 0.03296907618641853, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012041892623528838, "rewards/logprob_reward/std": 0.0015725565608590841, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 825.6875, "completions/mean_terminated_length": 748.0869750976562, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 4.077160493827161, "grad_norm": 2.5226843610208944, "kl": 0.2186279296875, "learning_rate": 4.365511638675612e-08, "loss": -0.3079, "num_tokens": 37756983.0, "reward": 0.032925527542829514, "reward_std": 0.02233968675136566, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.008806141093373299, "rewards/logprob_reward/std": 0.017590748146176338, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 704.09375, "completions/mean_terminated_length": 693.774169921875, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 4.080246913580247, "grad_norm": 1.5330479855867665, "kl": 0.2310791015625, "learning_rate": 4.337327661953477e-08, "loss": -0.07, "num_tokens": 37785814.0, "reward": 0.054545577615499496, "reward_std": 0.041127897799015045, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001578418887220323, "rewards/logprob_reward/std": 0.0018107314826920629, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 730.1875, "completions/mean_terminated_length": 699.7930908203125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 4.083333333333333, "grad_norm": 3.4392079834484965, "kl": 0.2095947265625, "learning_rate": 4.3092263144639565e-08, "loss": -0.3143, "num_tokens": 37815484.0, "reward": 0.05121327191591263, "reward_std": 0.05133683979511261, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0013480777852237225, "rewards/logprob_reward/std": 0.0025840946473181248, "step": 1323 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 743.90625, "completions/mean_terminated_length": 725.2333984375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 4.08641975308642, "grad_norm": 1.9214781567583756, "kl": NaN, "learning_rate": 4.281207708583256e-08, "loss": -0.0852, "num_tokens": 37846061.0, "reward": 0.04986279085278511, "reward_std": 0.04641471430659294, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003319765906780958, "rewards/logprob_reward/std": 0.004062604624778032, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 743.5625, "completions/mean_terminated_length": 714.5516967773438, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 4.089506172839506, "grad_norm": 2.366127547715954, "kl": 0.218994140625, "learning_rate": 4.253271956356713e-08, "loss": -0.1787, "num_tokens": 37876043.0, "reward": 0.051838453859090805, "reward_std": 0.04091264307498932, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0020427240524441004, "rewards/logprob_reward/std": 0.002830656711012125, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 763.28125, "completions/mean_terminated_length": 736.3103637695312, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 4.092592592592593, "grad_norm": 2.2531084527035894, "kl": 0.2030029296875, "learning_rate": 4.2254191694983096e-08, "loss": -0.0391, "num_tokens": 37907116.0, "reward": 0.0456613227725029, "reward_std": 0.040709540247917175, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021236930042505264, "rewards/logprob_reward/std": 0.002349415561184287, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 738.78125, "completions/mean_terminated_length": 709.27587890625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 4.095679012345679, "grad_norm": 3.8007254694939427, "kl": 0.22900390625, "learning_rate": 4.197649459390287e-08, "loss": -0.3119, "num_tokens": 37937241.0, "reward": 0.02436847798526287, "reward_std": 0.04079977050423622, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.420013427734375, "rewards/logprob_reward/mean": 0.0027705305255949497, "rewards/logprob_reward/std": 0.004038138780742884, "step": 1327 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 753.0625, "completions/mean_terminated_length": 735.0000610351562, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 4.098765432098766, "grad_norm": 2.401523271494944, "kl": NaN, "learning_rate": 4.169962937082635e-08, "loss": -0.1261, "num_tokens": 37968615.0, "reward": 0.031107213348150253, "reward_std": 0.027722710743546486, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0033135702833533287, "rewards/logprob_reward/std": 0.004784339107573032, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 677.3125, "completions/mean_terminated_length": 654.2000122070312, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.101851851851852, "grad_norm": 1.9800827156216652, "kl": 0.2451171875, "learning_rate": 4.142359713292698e-08, "loss": -0.032, "num_tokens": 37996817.0, "reward": 0.05676977336406708, "reward_std": 0.03475790470838547, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.004049746319651604, "rewards/logprob_reward/std": 0.003584577701985836, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 703.9375, "completions/mean_terminated_length": 682.6000366210938, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.104938271604938, "grad_norm": 2.2064144612947807, "kl": 0.241455078125, "learning_rate": 4.11483989840471e-08, "loss": -0.2097, "num_tokens": 38025819.0, "reward": 0.04861665517091751, "reward_std": 0.04186452925205231, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001935172826051712, "rewards/logprob_reward/std": 0.0028661531396210194, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 766.90625, "completions/mean_terminated_length": 740.3103637695312, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.1080246913580245, "grad_norm": 2.255648257423402, "kl": 0.208251953125, "learning_rate": 4.087403602469347e-08, "loss": -0.132, "num_tokens": 38057124.0, "reward": 0.05841232091188431, "reward_std": 0.054690130054950714, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0024025829043239355, "rewards/logprob_reward/std": 0.003108304226770997, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 711.40625, "completions/mean_terminated_length": 701.3225708007812, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 4.111111111111111, "grad_norm": 2.5811801352317443, "kl": 0.2423095703125, "learning_rate": 4.060050935203307e-08, "loss": -0.231, "num_tokens": 38086841.0, "reward": 0.04872202128171921, "reward_std": 0.044247761368751526, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.005524467211216688, "rewards/logprob_reward/std": 0.01583181507885456, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 738.59375, "completions/mean_terminated_length": 697.8214721679688, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 4.114197530864198, "grad_norm": 2.185577263120209, "kl": 0.2388916015625, "learning_rate": 4.032782005988861e-08, "loss": -0.011, "num_tokens": 38117336.0, "reward": 0.041978344321250916, "reward_std": 0.03934880346059799, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015037164557725191, "rewards/logprob_reward/std": 0.0026061886455863714, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 758.65625, "completions/mean_terminated_length": 740.9666748046875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 4.117283950617284, "grad_norm": 1.8785321943038986, "kl": 0.218505859375, "learning_rate": 4.0055969238733945e-08, "loss": -0.0958, "num_tokens": 38147733.0, "reward": 0.03570020571351051, "reward_std": 0.04679879546165466, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014724504435434937, "rewards/logprob_reward/std": 0.0028168363496661186, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 751.84375, "completions/mean_terminated_length": 701.4444580078125, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.12037037037037, "grad_norm": 1.931916647953982, "kl": 0.2218017578125, "learning_rate": 3.978495797569012e-08, "loss": -0.0929, "num_tokens": 38178636.0, "reward": 0.043558232486248016, "reward_std": 0.042037613689899445, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0032591475173830986, "rewards/logprob_reward/std": 0.0042504193261265755, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 687.375, "completions/mean_terminated_length": 652.5516967773438, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 4.1234567901234565, "grad_norm": 2.478648178978233, "kl": 0.236083984375, "learning_rate": 3.95147873545208e-08, "loss": -0.2018, "num_tokens": 38207168.0, "reward": 0.051688190549612045, "reward_std": 0.04069924354553223, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0018757663201540709, "rewards/logprob_reward/std": 0.0025266511365771294, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 757.5, "completions/mean_terminated_length": 739.7333984375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.1265432098765435, "grad_norm": 2.024372736436707, "kl": 0.22265625, "learning_rate": 3.924545845562791e-08, "loss": -0.2783, "num_tokens": 38237936.0, "reward": 0.027438897639513016, "reward_std": 0.03564389795064926, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.00270988536067307, "rewards/logprob_reward/std": 0.0028934788424521685, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 730.125, "completions/mean_terminated_length": 688.1428833007812, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 4.12962962962963, "grad_norm": 2.0183830172881025, "kl": 0.2080078125, "learning_rate": 3.8976972356047325e-08, "loss": -0.0717, "num_tokens": 38268200.0, "reward": 0.03374570980668068, "reward_std": 0.04171259328722954, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0027730122674256563, "rewards/logprob_reward/std": 0.006873125210404396, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 709.625, "completions/mean_terminated_length": 677.1034545898438, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 4.132716049382716, "grad_norm": 1.9753625961217376, "kl": 0.2222900390625, "learning_rate": 3.870933012944472e-08, "loss": -0.1405, "num_tokens": 38297508.0, "reward": 0.049009934067726135, "reward_std": 0.04322347044944763, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002372150309383869, "rewards/logprob_reward/std": 0.0031108625698834658, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 694.15625, "completions/mean_terminated_length": 672.1666870117188, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.135802469135802, "grad_norm": 1.7466544081334954, "kl": 0.2421875, "learning_rate": 3.844253284611096e-08, "loss": -0.1519, "num_tokens": 38326157.0, "reward": 0.05518614873290062, "reward_std": 0.046291884034872055, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0022901634220033884, "rewards/logprob_reward/std": 0.0030652545392513275, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 656.625, "completions/mean_terminated_length": 644.774169921875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 4.138888888888889, "grad_norm": 1.923109938186103, "kl": 0.247802734375, "learning_rate": 3.817658157295819e-08, "loss": -0.1293, "num_tokens": 38353365.0, "reward": 0.06859253346920013, "reward_std": 0.054425276815891266, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0032972614280879498, "rewards/logprob_reward/std": 0.0040869941003620625, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 730.625, "completions/mean_terminated_length": 711.0667114257812, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.1419753086419755, "grad_norm": 2.170302985266013, "kl": 0.227294921875, "learning_rate": 3.791147737351541e-08, "loss": -0.1534, "num_tokens": 38383141.0, "reward": 0.04599742218852043, "reward_std": 0.048829492181539536, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0024971363600343466, "rewards/logprob_reward/std": 0.0028372197411954403, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 752.1875, "completions/mean_terminated_length": 689.4615478515625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 4.145061728395062, "grad_norm": 2.2790726792037233, "kl": 0.23046875, "learning_rate": 3.7647221307923946e-08, "loss": -0.2172, "num_tokens": 38413699.0, "reward": 0.050827592611312866, "reward_std": 0.045222967863082886, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0009195467573590577, "rewards/logprob_reward/std": 0.001251706387847662, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 752.34375, "completions/mean_terminated_length": 734.2333984375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 4.148148148148148, "grad_norm": 2.228274365183252, "kl": 0.2052001953125, "learning_rate": 3.738381443293376e-08, "loss": -0.2613, "num_tokens": 38444310.0, "reward": 0.026713576167821884, "reward_std": 0.040478356182575226, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0019039744511246681, "rewards/logprob_reward/std": 0.0031483571510761976, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 690.65625, "completions/mean_terminated_length": 679.9031982421875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.151234567901234, "grad_norm": 2.3378643080570183, "kl": 0.24658203125, "learning_rate": 3.7121257801898814e-08, "loss": -0.3647, "num_tokens": 38472995.0, "reward": 0.04639032483100891, "reward_std": 0.04832562804222107, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002933695912361145, "rewards/logprob_reward/std": 0.0039755916222929955, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 689.5625, "completions/mean_terminated_length": 654.9655151367188, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 4.154320987654321, "grad_norm": 1.982373303359418, "kl": 0.2449951171875, "learning_rate": 3.685955246477296e-08, "loss": -0.1405, "num_tokens": 38501369.0, "reward": 0.04185140132904053, "reward_std": 0.04646693170070648, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0013626698637381196, "rewards/logprob_reward/std": 0.0017280688043683767, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 682.75, "completions/mean_terminated_length": 671.741943359375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 4.157407407407407, "grad_norm": 1.5521349183526207, "kl": 0.2357177734375, "learning_rate": 3.659869946810581e-08, "loss": -0.1462, "num_tokens": 38529137.0, "reward": 0.06384395062923431, "reward_std": 0.03511165454983711, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0014932783087715507, "rewards/logprob_reward/std": 0.0016576785128563643, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 758.40625, "completions/mean_terminated_length": 730.9310302734375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 4.160493827160494, "grad_norm": 2.1717397888291554, "kl": 0.2120361328125, "learning_rate": 3.6338699855038486e-08, "loss": -0.1148, "num_tokens": 38560274.0, "reward": 0.045266292989254, "reward_std": 0.04792287200689316, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001684766379185021, "rewards/logprob_reward/std": 0.00244696531444788, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 726.875, "completions/mean_terminated_length": 696.137939453125, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 4.16358024691358, "grad_norm": 1.9587763234845543, "kl": 0.222412109375, "learning_rate": 3.6079554665299414e-08, "loss": -0.1509, "num_tokens": 38590426.0, "reward": 0.04997875541448593, "reward_std": 0.04206637293100357, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003448616247624159, "rewards/logprob_reward/std": 0.00576774962246418, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 741.5, "completions/mean_terminated_length": 712.27587890625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 4.166666666666667, "grad_norm": 2.354385124226021, "kl": 0.2376708984375, "learning_rate": 3.5821264935200294e-08, "loss": -0.2053, "num_tokens": 38620898.0, "reward": 0.04564407467842102, "reward_std": 0.03507474064826965, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002104530343785882, "rewards/logprob_reward/std": 0.003115322906523943, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 637.5625, "completions/mean_terminated_length": 637.5625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.169753086419753, "grad_norm": 1.6875140352824833, "kl": 0.2469482421875, "learning_rate": 3.5563831697631776e-08, "loss": -0.1708, "num_tokens": 38647516.0, "reward": 0.05164702236652374, "reward_std": 0.04170852527022362, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001830025459639728, "rewards/logprob_reward/std": 0.002140195807442069, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 719.03125, "completions/mean_terminated_length": 709.1935424804688, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.172839506172839, "grad_norm": 2.392309633220599, "kl": 0.2503662109375, "learning_rate": 3.53072559820595e-08, "loss": -0.1483, "num_tokens": 38677569.0, "reward": 0.03700459375977516, "reward_std": 0.04947003722190857, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0029217731207609177, "rewards/logprob_reward/std": 0.0046602776274085045, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 719.4375, "completions/mean_terminated_length": 687.9310302734375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.175925925925926, "grad_norm": 2.979264233519347, "kl": 0.2257080078125, "learning_rate": 3.505153881451997e-08, "loss": -0.1962, "num_tokens": 38707275.0, "reward": 0.04795299842953682, "reward_std": 0.048882901668548584, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001197776640765369, "rewards/logprob_reward/std": 0.0021472196094691753, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 719.3125, "completions/mean_terminated_length": 699.0000610351562, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 4.179012345679013, "grad_norm": 1.9787664379018888, "kl": 0.2291259765625, "learning_rate": 3.479668121761617e-08, "loss": -0.2475, "num_tokens": 38737113.0, "reward": 0.035641416907310486, "reward_std": 0.04145059734582901, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014071294572204351, "rewards/logprob_reward/std": 0.0019757109694182873, "step": 1354 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 648.0, "completions/mean_terminated_length": 635.8709716796875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 4.182098765432099, "grad_norm": 2.2545876367542177, "kl": NaN, "learning_rate": 3.45426842105139e-08, "loss": -0.1652, "num_tokens": 38764177.0, "reward": 0.052417002618312836, "reward_std": 0.04732451215386391, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0026855559553951025, "rewards/logprob_reward/std": 0.00396285206079483, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 726.21875, "completions/mean_terminated_length": 706.36669921875, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 4.185185185185185, "grad_norm": 2.0719623368070024, "kl": 0.234130859375, "learning_rate": 3.428954880893745e-08, "loss": -0.1586, "num_tokens": 38794124.0, "reward": 0.03140906244516373, "reward_std": 0.04791799187660217, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0036489595659077168, "rewards/logprob_reward/std": 0.005231354385614395, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 682.125, "completions/mean_terminated_length": 671.0967407226562, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 4.188271604938271, "grad_norm": 2.167518236895074, "kl": 0.24072265625, "learning_rate": 3.403727602516554e-08, "loss": -0.4284, "num_tokens": 38822296.0, "reward": 0.06536111980676651, "reward_std": 0.049857065081596375, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.003179023740813136, "rewards/logprob_reward/std": 0.004562661051750183, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 694.8125, "completions/mean_terminated_length": 684.1935424804688, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.191358024691358, "grad_norm": 2.097909096541004, "kl": 0.234375, "learning_rate": 3.3785866868027426e-08, "loss": -0.2141, "num_tokens": 38851246.0, "reward": 0.03531426936388016, "reward_std": 0.03942541778087616, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0010436322772875428, "rewards/logprob_reward/std": 0.0015748720616102219, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 657.5, "completions/mean_terminated_length": 645.6774291992188, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 4.194444444444445, "grad_norm": 2.4790533056536614, "kl": 0.2320556640625, "learning_rate": 3.353532234289849e-08, "loss": -0.0841, "num_tokens": 38878366.0, "reward": 0.06946359574794769, "reward_std": 0.039860717952251434, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0007928848499432206, "rewards/logprob_reward/std": 0.0010307712946087122, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 730.59375, "completions/mean_terminated_length": 688.6785888671875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 4.197530864197531, "grad_norm": 1.8830012574848274, "kl": 0.2530517578125, "learning_rate": 3.3285643451696796e-08, "loss": -0.0983, "num_tokens": 38908225.0, "reward": 0.026378247886896133, "reward_std": 0.021089550107717514, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0015313858166337013, "rewards/logprob_reward/std": 0.0027785678394138813, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 683.96875, "completions/mean_terminated_length": 683.96875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 4.200617283950617, "grad_norm": 1.966890784894002, "kl": 0.2239990234375, "learning_rate": 3.303683119287859e-08, "loss": -0.16, "num_tokens": 38936436.0, "reward": 0.0617934986948967, "reward_std": 0.04686152935028076, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0026872195303440094, "rewards/logprob_reward/std": 0.0031066283117979765, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 718.90625, "completions/mean_terminated_length": 698.5667114257812, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.203703703703703, "grad_norm": 1.8711672569110238, "kl": 0.2818603515625, "learning_rate": 3.278888656143453e-08, "loss": -0.3152, "num_tokens": 38965705.0, "reward": 0.04152519628405571, "reward_std": 0.04094218462705612, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0010002164635807276, "rewards/logprob_reward/std": 0.001981014385819435, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 697.125, "completions/mean_terminated_length": 675.3333740234375, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.20679012345679, "grad_norm": 2.076659231615485, "kl": 0.2169189453125, "learning_rate": 3.254181054888569e-08, "loss": -0.1851, "num_tokens": 38994201.0, "reward": 0.054154179990291595, "reward_std": 0.053396470844745636, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0011435305932536721, "rewards/logprob_reward/std": 0.0014338643522933125, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 749.90625, "completions/mean_terminated_length": 710.7500610351562, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 4.209876543209877, "grad_norm": 2.8981736423388247, "kl": 0.23486328125, "learning_rate": 3.2295604143279534e-08, "loss": -0.3005, "num_tokens": 39024930.0, "reward": 0.03994644433259964, "reward_std": 0.048851221799850464, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002718270756304264, "rewards/logprob_reward/std": 0.00442732498049736, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 667.96875, "completions/mean_terminated_length": 656.4838256835938, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 4.212962962962963, "grad_norm": 2.030662329879176, "kl": 0.2457275390625, "learning_rate": 3.205026832918606e-08, "loss": -0.2063, "num_tokens": 39052381.0, "reward": 0.06386682391166687, "reward_std": 0.047239288687705994, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015186977107077837, "rewards/logprob_reward/std": 0.0020372355356812477, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 734.4375, "completions/mean_terminated_length": 704.4827270507812, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 4.216049382716049, "grad_norm": 1.888684409060091, "kl": 0.242919921875, "learning_rate": 3.1805804087693676e-08, "loss": -0.1763, "num_tokens": 39082511.0, "reward": 0.03575155884027481, "reward_std": 0.03975746035575867, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0015295101329684258, "rewards/logprob_reward/std": 0.002644570544362068, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 700.25, "completions/mean_terminated_length": 700.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 4.219135802469136, "grad_norm": 2.370034597374273, "kl": 0.21728515625, "learning_rate": 3.156221239640558e-08, "loss": -0.1534, "num_tokens": 39111487.0, "reward": 0.04550845921039581, "reward_std": 0.04069419205188751, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001953842118382454, "rewards/logprob_reward/std": 0.0026862630620598793, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 701.59375, "completions/mean_terminated_length": 680.1000366210938, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 4.222222222222222, "grad_norm": 2.271384703623736, "kl": 0.261474609375, "learning_rate": 3.13194942294355e-08, "loss": -0.1969, "num_tokens": 39140682.0, "reward": 0.030107038095593452, "reward_std": 0.04138566926121712, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002202265430241823, "rewards/logprob_reward/std": 0.0035037552006542683, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 745.6875, "completions/mean_terminated_length": 727.1333618164062, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.2253086419753085, "grad_norm": 1.8019759463625786, "kl": 0.22705078125, "learning_rate": 3.1077650557404076e-08, "loss": -0.1772, "num_tokens": 39171316.0, "reward": 0.047880593687295914, "reward_std": 0.036105748265981674, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0011173278326168656, "rewards/logprob_reward/std": 0.001846394268795848, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 706.21875, "completions/mean_terminated_length": 695.9677124023438, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 4.228395061728395, "grad_norm": 1.9308349370910018, "kl": 0.3421630859375, "learning_rate": 3.083668234743489e-08, "loss": -0.2876, "num_tokens": 39200239.0, "reward": 0.03226442262530327, "reward_std": 0.02791484259068966, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0011271354742348194, "rewards/logprob_reward/std": 0.00173663510940969, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 669.8125, "completions/mean_terminated_length": 669.8125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.231481481481482, "grad_norm": 3.255575472138576, "kl": 0.2352294921875, "learning_rate": 3.059659056315053e-08, "loss": -0.189, "num_tokens": 39227765.0, "reward": 0.04805910587310791, "reward_std": 0.04100500047206879, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0013156697386875749, "rewards/logprob_reward/std": 0.0016503233928233385, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 645.96875, "completions/mean_terminated_length": 645.96875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.234567901234568, "grad_norm": 2.1378184413681165, "kl": 0.296875, "learning_rate": 3.035737616466885e-08, "loss": -0.092, "num_tokens": 39254680.0, "reward": 0.0521601140499115, "reward_std": 0.039865702390670776, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002400127239525318, "rewards/logprob_reward/std": 0.0025403748732060194, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 699.28125, "completions/mean_terminated_length": 677.6333618164062, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 4.237654320987654, "grad_norm": 1.8555821513419986, "kl": 0.2142333984375, "learning_rate": 3.0119040108598974e-08, "loss": -0.0071, "num_tokens": 39283413.0, "reward": 0.03886808454990387, "reward_std": 0.02776341140270233, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001520095276646316, "rewards/logprob_reward/std": 0.0023669328074902296, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 695.71875, "completions/mean_terminated_length": 673.8333740234375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 4.2407407407407405, "grad_norm": 1.9709080116462059, "kl": 0.2943115234375, "learning_rate": 2.98815833480377e-08, "loss": -0.0631, "num_tokens": 39312128.0, "reward": 0.05827533081173897, "reward_std": 0.0337703675031662, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002250362653285265, "rewards/logprob_reward/std": 0.0022385185584425926, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 712.5, "completions/mean_terminated_length": 702.4515991210938, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 4.243827160493828, "grad_norm": 2.245351937601914, "kl": 0.2398681640625, "learning_rate": 2.964500683256549e-08, "loss": -0.1131, "num_tokens": 39341464.0, "reward": 0.04220541566610336, "reward_std": 0.04653865844011307, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017560124397277832, "rewards/logprob_reward/std": 0.0026445803232491016, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 782.03125, "completions/mean_terminated_length": 747.4642944335938, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 4.246913580246914, "grad_norm": 2.8433615922445834, "kl": 0.2310791015625, "learning_rate": 2.9409311508242663e-08, "loss": -0.2622, "num_tokens": 39373565.0, "reward": 0.05018014460802078, "reward_std": 0.04673204571008682, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003672380466014147, "rewards/logprob_reward/std": 0.00520910881459713, "step": 1376 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 693.0, "completions/mean_terminated_length": 616.6154174804688, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 4.25, "grad_norm": 1.808520915238301, "kl": NaN, "learning_rate": 2.9174498317605794e-08, "loss": -0.1076, "num_tokens": 39402245.0, "reward": 0.051088765263557434, "reward_std": 0.03256085515022278, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001209740643389523, "rewards/logprob_reward/std": 0.0015752612380310893, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 734.09375, "completions/mean_terminated_length": 704.1034545898438, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 4.253086419753086, "grad_norm": 1.7529749814005902, "kl": 0.210205078125, "learning_rate": 2.894056819966384e-08, "loss": -0.0914, "num_tokens": 39432356.0, "reward": 0.046020276844501495, "reward_std": 0.04837331920862198, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0025225291028618813, "rewards/logprob_reward/std": 0.0031702974811196327, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 702.46875, "completions/mean_terminated_length": 702.46875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 4.256172839506172, "grad_norm": 2.1795713564780277, "kl": 0.22412109375, "learning_rate": 2.8707522089894354e-08, "loss": -0.1041, "num_tokens": 39461255.0, "reward": 0.05204486846923828, "reward_std": 0.055112261325120926, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002272073645144701, "rewards/logprob_reward/std": 0.003844283288344741, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 678.46875, "completions/mean_terminated_length": 678.46875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 4.2592592592592595, "grad_norm": 2.391937387997386, "kl": 0.2279052734375, "learning_rate": 2.8475360920239723e-08, "loss": -0.1423, "num_tokens": 39489006.0, "reward": 0.04850921779870987, "reward_std": 0.053589120507240295, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018157969461753964, "rewards/logprob_reward/std": 0.0021827947348356247, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 671.78125, "completions/mean_terminated_length": 660.4193115234375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.262345679012346, "grad_norm": 2.406046061001516, "kl": 0.2579345703125, "learning_rate": 2.8244085619103546e-08, "loss": -0.218, "num_tokens": 39516895.0, "reward": 0.056279636919498444, "reward_std": 0.0488351508975029, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0035051533486694098, "rewards/logprob_reward/std": 0.004182832781225443, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 757.75, "completions/mean_terminated_length": 696.3077392578125, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.265432098765432, "grad_norm": 1.904087467486132, "kl": 0.210205078125, "learning_rate": 2.8013697111346906e-08, "loss": -0.1864, "num_tokens": 39548031.0, "reward": 0.05306212604045868, "reward_std": 0.0427543930709362, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.006874579470604658, "rewards/logprob_reward/std": 0.011082598939538002, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 756.5625, "completions/mean_terminated_length": 707.0370483398438, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.268518518518518, "grad_norm": 1.6790001394875598, "kl": 0.2911376953125, "learning_rate": 2.778419631828463e-08, "loss": -0.2758, "num_tokens": 39578361.0, "reward": 0.03895171731710434, "reward_std": 0.03504975885152817, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001613016938790679, "rewards/logprob_reward/std": 0.0025580748915672302, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 770.625, "completions/mean_terminated_length": 753.7333984375, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 4.271604938271605, "grad_norm": 2.083072600862685, "kl": 0.22509765625, "learning_rate": 2.755558415768147e-08, "loss": -0.2419, "num_tokens": 39609649.0, "reward": 0.046697311103343964, "reward_std": 0.051193609833717346, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003274786751717329, "rewards/logprob_reward/std": 0.006480569951236248, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 706.46875, "completions/mean_terminated_length": 685.300048828125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 4.2746913580246915, "grad_norm": 1.883983869269119, "kl": 0.2314453125, "learning_rate": 2.732786154374869e-08, "loss": -0.1087, "num_tokens": 39638088.0, "reward": 0.06074865534901619, "reward_std": 0.041123319417238235, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015262835659086704, "rewards/logprob_reward/std": 0.001343188458122313, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 734.875, "completions/mean_terminated_length": 704.9655151367188, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.277777777777778, "grad_norm": 2.012086222635098, "kl": 0.21826171875, "learning_rate": 2.7101029387140318e-08, "loss": -0.064, "num_tokens": 39667924.0, "reward": 0.04400114715099335, "reward_std": 0.04902297258377075, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0037512709386646748, "rewards/logprob_reward/std": 0.004525472410023212, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 770.03125, "completions/mean_terminated_length": 743.7586059570312, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 4.280864197530864, "grad_norm": 2.3338081925878735, "kl": 0.1871337890625, "learning_rate": 2.6875088594949387e-08, "loss": -0.3447, "num_tokens": 39698941.0, "reward": 0.03246118873357773, "reward_std": 0.04250557720661163, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0013457657769322395, "rewards/logprob_reward/std": 0.0020066280849277973, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 646.9375, "completions/mean_terminated_length": 646.9375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 4.283950617283951, "grad_norm": 2.157794785448398, "kl": 0.2391357421875, "learning_rate": 2.6650040070704484e-08, "loss": -0.1648, "num_tokens": 39725727.0, "reward": 0.05416993424296379, "reward_std": 0.04828999191522598, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0011610384099185467, "rewards/logprob_reward/std": 0.0022571422159671783, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 705.625, "completions/mean_terminated_length": 684.4000244140625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.287037037037037, "grad_norm": 1.7919849200281552, "kl": 0.302001953125, "learning_rate": 2.6425884714365966e-08, "loss": -0.1799, "num_tokens": 39755059.0, "reward": 0.04657496511936188, "reward_std": 0.03497311845421791, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0031388518400490284, "rewards/logprob_reward/std": 0.007875935174524784, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 713.6000366210938, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 4.290123456790123, "grad_norm": 2.5103468795796444, "kl": 0.228515625, "learning_rate": 2.6202623422322546e-08, "loss": -0.3152, "num_tokens": 39784891.0, "reward": 0.04510669782757759, "reward_std": 0.04089469462633133, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001507441746070981, "rewards/logprob_reward/std": 0.0021598199382424355, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 633.53125, "completions/mean_terminated_length": 620.9354858398438, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.29320987654321, "grad_norm": 3.7293156285813556, "kl": 0.2447509765625, "learning_rate": 2.5980257087387546e-08, "loss": -0.2582, "num_tokens": 39811096.0, "reward": 0.06991346180438995, "reward_std": 0.03495609015226364, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0012927292846143246, "rewards/logprob_reward/std": 0.0014992320211604238, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 724.4375, "completions/mean_terminated_length": 704.4666748046875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 4.296296296296296, "grad_norm": 2.0659177773554687, "kl": 0.226318359375, "learning_rate": 2.5758786598795325e-08, "loss": -0.3243, "num_tokens": 39841130.0, "reward": 0.04882894828915596, "reward_std": 0.04868233948945999, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0021710540167987347, "rewards/logprob_reward/std": 0.002986626233905554, "step": 1392 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 679.84375, "completions/mean_terminated_length": 679.84375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 4.299382716049383, "grad_norm": 1.8252166403646315, "kl": NaN, "learning_rate": 2.5538212842197926e-08, "loss": -0.1682, "num_tokens": 39869257.0, "reward": 0.05994633585214615, "reward_std": 0.02662118338048458, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0006348142633214593, "rewards/logprob_reward/std": 0.0010241686832159758, "step": 1393 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 745.875, "completions/mean_terminated_length": 717.1034545898438, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.302469135802469, "grad_norm": 3.0771552623544185, "kl": NaN, "learning_rate": 2.5318536699661246e-08, "loss": -0.3404, "num_tokens": 39900081.0, "reward": 0.045463353395462036, "reward_std": 0.03326955810189247, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001903725671581924, "rewards/logprob_reward/std": 0.0034816248808056116, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 671.9375, "completions/mean_terminated_length": 635.5172119140625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 4.305555555555555, "grad_norm": 2.067104103791924, "kl": 0.2216796875, "learning_rate": 2.5099759049661802e-08, "loss": -0.057, "num_tokens": 39928071.0, "reward": 0.04660975933074951, "reward_std": 0.040935173630714417, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003177509643137455, "rewards/logprob_reward/std": 0.0034811077639460564, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 689.46875, "completions/mean_terminated_length": 678.6774291992188, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 4.308641975308642, "grad_norm": 1.9975501121958554, "kl": 0.2283935546875, "learning_rate": 2.4881880767083002e-08, "loss": -0.1026, "num_tokens": 39956278.0, "reward": 0.05892914906144142, "reward_std": 0.047882527112960815, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00297682941891253, "rewards/logprob_reward/std": 0.0047245929017663, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 796.3125, "completions/mean_terminated_length": 781.1333618164062, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 4.311728395061729, "grad_norm": 2.8479114820602693, "kl": 0.21240234375, "learning_rate": 2.4664902723211674e-08, "loss": -0.3645, "num_tokens": 39988672.0, "reward": 0.043073683977127075, "reward_std": 0.054729074239730835, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0027207641396671534, "rewards/logprob_reward/std": 0.002963118953630328, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 736.90625, "completions/mean_terminated_length": 707.2069091796875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 4.314814814814815, "grad_norm": 2.2835337282673587, "kl": 0.2237548828125, "learning_rate": 2.444882578573476e-08, "loss": -0.3028, "num_tokens": 40018517.0, "reward": 0.04196564108133316, "reward_std": 0.04831092804670334, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0014895980712026358, "rewards/logprob_reward/std": 0.002160093979910016, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 705.09375, "completions/mean_terminated_length": 659.5357666015625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 4.317901234567901, "grad_norm": 1.7934255303390536, "kl": 0.248046875, "learning_rate": 2.4233650818735573e-08, "loss": -0.1481, "num_tokens": 40047524.0, "reward": 0.04166494309902191, "reward_std": 0.04144533723592758, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0011554912198334932, "rewards/logprob_reward/std": 0.004127013962715864, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 693.71875, "completions/mean_terminated_length": 683.0645141601562, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 4.320987654320987, "grad_norm": 2.318450278702292, "kl": 0.2708740234375, "learning_rate": 2.401937868269058e-08, "loss": -0.1239, "num_tokens": 40076567.0, "reward": 0.03664903715252876, "reward_std": 0.04152647405862808, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0025267070159316063, "rewards/logprob_reward/std": 0.003993797581642866, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 702.96875, "completions/mean_terminated_length": 692.6128540039062, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 4.324074074074074, "grad_norm": 2.0341144644978972, "kl": 0.23876953125, "learning_rate": 2.380601023446577e-08, "loss": -0.1947, "num_tokens": 40105406.0, "reward": 0.05468962714076042, "reward_std": 0.04895017668604851, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0017384699312970042, "rewards/logprob_reward/std": 0.0022966843098402023, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 690.03125, "completions/mean_terminated_length": 667.7667236328125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 4.327160493827161, "grad_norm": 2.171140198917671, "kl": 0.221923828125, "learning_rate": 2.3593546327313364e-08, "loss": -0.2481, "num_tokens": 40133907.0, "reward": 0.04149216040968895, "reward_std": 0.046369604766368866, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0009635099559091032, "rewards/logprob_reward/std": 0.0016679001273587346, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 684.90625, "completions/mean_terminated_length": 684.90625, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 4.330246913580247, "grad_norm": 2.0959285786124457, "kl": 0.2274169921875, "learning_rate": 2.338198781086842e-08, "loss": -0.2086, "num_tokens": 40161580.0, "reward": 0.04514283686876297, "reward_std": 0.04740273579955101, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0015475992113351822, "rewards/logprob_reward/std": 0.0022797882556915283, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 731.0967407226562, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 4.333333333333333, "grad_norm": 3.032831550053061, "kl": 0.24462890625, "learning_rate": 2.317133553114525e-08, "loss": -0.2835, "num_tokens": 40191800.0, "reward": 0.029408439993858337, "reward_std": 0.03851061314344406, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.001426044967956841, "rewards/logprob_reward/std": 0.002517278306186199, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 775.6875, "completions/mean_terminated_length": 767.6773681640625, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 4.33641975308642, "grad_norm": 2.115883231576716, "kl": 0.2034912109375, "learning_rate": 2.2961590330534298e-08, "loss": -0.2766, "num_tokens": 40223006.0, "reward": 0.04527488350868225, "reward_std": 0.045665763318538666, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001694312784820795, "rewards/logprob_reward/std": 0.0019908961839973927, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 734.28125, "completions/mean_terminated_length": 714.9666748046875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 4.339506172839506, "grad_norm": 2.0970833008940555, "kl": 0.2313232421875, "learning_rate": 2.2752753047798502e-08, "loss": -0.1546, "num_tokens": 40253323.0, "reward": 0.03802924603223801, "reward_std": 0.04966511204838753, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.004060272127389908, "rewards/logprob_reward/std": 0.0048335883766412735, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 680.25, "completions/mean_terminated_length": 644.6896362304688, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 4.342592592592593, "grad_norm": 2.4367237598130904, "kl": 0.2589111328125, "learning_rate": 2.2544824518070104e-08, "loss": -0.2076, "num_tokens": 40281567.0, "reward": 0.05269753560423851, "reward_std": 0.0432589054107666, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0029972584452480078, "rewards/logprob_reward/std": 0.004715714603662491, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 703.1875, "completions/mean_terminated_length": 703.1875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 4.345679012345679, "grad_norm": 2.2303496249660135, "kl": 0.2296142578125, "learning_rate": 2.2337805572847425e-08, "loss": -0.2227, "num_tokens": 40310705.0, "reward": 0.05487576127052307, "reward_std": 0.05081789195537567, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019452865235507488, "rewards/logprob_reward/std": 0.0020653873216360807, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 744.8125, "completions/mean_terminated_length": 726.2000122070312, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.348765432098766, "grad_norm": 2.095914288307202, "kl": 0.2137451171875, "learning_rate": 2.2131697039991127e-08, "loss": -0.2036, "num_tokens": 40340931.0, "reward": 0.04556164890527725, "reward_std": 0.05527123063802719, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0020129410549998283, "rewards/logprob_reward/std": 0.0027564240153878927, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 822.25, "completions/mean_terminated_length": 784.888916015625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.351851851851852, "grad_norm": 2.64787034770904, "kl": 0.202392578125, "learning_rate": 2.1926499743721405e-08, "loss": -0.2434, "num_tokens": 40374287.0, "reward": 0.037518374621868134, "reward_std": 0.049233656376600266, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.003492637537419796, "rewards/logprob_reward/std": 0.007689587771892548, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 686.96875, "completions/mean_terminated_length": 686.96875, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 4.354938271604938, "grad_norm": 1.821016041915507, "kl": 0.2220458984375, "learning_rate": 2.1722214504614313e-08, "loss": -0.1042, "num_tokens": 40402726.0, "reward": 0.05894976854324341, "reward_std": 0.03396419435739517, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0029997399542480707, "rewards/logprob_reward/std": 0.004429431166499853, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 710.96875, "completions/mean_terminated_length": 700.8709716796875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 4.3580246913580245, "grad_norm": 3.34726012963772, "kl": 0.212646484375, "learning_rate": 2.1518842139598674e-08, "loss": -0.4076, "num_tokens": 40431489.0, "reward": 0.04314707964658737, "reward_std": 0.045407362282276154, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002802309114485979, "rewards/logprob_reward/std": 0.00475440826267004, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 731.3125, "completions/mean_terminated_length": 731.3125, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 4.361111111111111, "grad_norm": 2.2836553819565273, "kl": 0.2249755859375, "learning_rate": 2.1316383461952804e-08, "loss": -0.4415, "num_tokens": 40461763.0, "reward": 0.03941406309604645, "reward_std": 0.04996800422668457, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0021267374977469444, "rewards/logprob_reward/std": 0.003304164856672287, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 722.65625, "completions/mean_terminated_length": 691.4827270507812, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 4.364197530864198, "grad_norm": 2.596303238393781, "kl": 0.252197265625, "learning_rate": 2.1114839281301143e-08, "loss": -0.2563, "num_tokens": 40491432.0, "reward": 0.033059924840927124, "reward_std": 0.03590691089630127, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0020110271871089935, "rewards/logprob_reward/std": 0.0032212280202656984, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 743.03125, "completions/mean_terminated_length": 724.300048828125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.367283950617284, "grad_norm": 2.092102307553979, "kl": 0.222412109375, "learning_rate": 2.0914210403611132e-08, "loss": -0.1412, "num_tokens": 40521597.0, "reward": 0.04807814210653305, "reward_std": 0.04798971116542816, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0013368211220949888, "rewards/logprob_reward/std": 0.002446343656629324, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 713.5625, "completions/mean_terminated_length": 692.86669921875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.37037037037037, "grad_norm": 3.123376111092823, "kl": 0.2293701171875, "learning_rate": 2.071449763118993e-08, "loss": -0.3544, "num_tokens": 40551099.0, "reward": 0.052133187651634216, "reward_std": 0.04033924266695976, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002370205707848072, "rewards/logprob_reward/std": 0.0027446511667221785, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 749.125, "completions/mean_terminated_length": 740.258056640625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.3734567901234565, "grad_norm": 1.9764372852447862, "kl": 0.2183837890625, "learning_rate": 2.0515701762681304e-08, "loss": -0.1034, "num_tokens": 40581775.0, "reward": 0.04872114956378937, "reward_std": 0.04157783091068268, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0020512789487838745, "rewards/logprob_reward/std": 0.0028042506892234087, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 687.53125, "completions/mean_terminated_length": 652.72412109375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.3765432098765435, "grad_norm": 2.365794988097494, "kl": 0.240966796875, "learning_rate": 2.0317823593062165e-08, "loss": -0.2624, "num_tokens": 40610432.0, "reward": 0.04879935085773468, "reward_std": 0.046986665576696396, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0021381659898906946, "rewards/logprob_reward/std": 0.002953203860670328, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 675.34375, "completions/mean_terminated_length": 639.27587890625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.37962962962963, "grad_norm": 2.9093568539631067, "kl": 0.250244140625, "learning_rate": 2.0120863913639874e-08, "loss": -0.364, "num_tokens": 40638335.0, "reward": 0.045894354581832886, "reward_std": 0.04815865308046341, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0023826141841709614, "rewards/logprob_reward/std": 0.003841497004032135, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 768.65625, "completions/mean_terminated_length": 732.1785888671875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 4.382716049382716, "grad_norm": 2.0744272803030923, "kl": 0.1971435546875, "learning_rate": 1.9924823512048438e-08, "loss": -0.0882, "num_tokens": 40669696.0, "reward": 0.026432940736413002, "reward_std": 0.03985881432890892, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0015921569429337978, "rewards/logprob_reward/std": 0.0023353670258075, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 682.5625, "completions/mean_terminated_length": 682.5625, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 4.385802469135802, "grad_norm": 2.4074119485832135, "kl": 0.2091064453125, "learning_rate": 1.972970317224601e-08, "loss": -0.2167, "num_tokens": 40698014.0, "reward": 0.04284990206360817, "reward_std": 0.052097100764513016, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0024721133522689342, "rewards/logprob_reward/std": 0.0028523337095975876, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 738.46875, "completions/mean_terminated_length": 697.6785888671875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 4.388888888888889, "grad_norm": 1.7022079121164682, "kl": 0.2012939453125, "learning_rate": 1.9535503674511263e-08, "loss": -0.0228, "num_tokens": 40728421.0, "reward": 0.03993780165910721, "reward_std": 0.04407007247209549, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002708667889237404, "rewards/logprob_reward/std": 0.006404683459550142, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 756.03125, "completions/mean_terminated_length": 738.1666870117188, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 4.3919753086419755, "grad_norm": 2.98292647074308, "kl": 0.2384033203125, "learning_rate": 1.934222579544059e-08, "loss": -0.2068, "num_tokens": 40759526.0, "reward": 0.029373236000537872, "reward_std": 0.044831715524196625, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0013869295362383127, "rewards/logprob_reward/std": 0.0021238508634269238, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 742.90625, "completions/mean_terminated_length": 713.8275756835938, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 4.395061728395062, "grad_norm": 1.9792308065221802, "kl": 0.2098388671875, "learning_rate": 1.9149870307944765e-08, "loss": -0.0489, "num_tokens": 40789767.0, "reward": 0.03917968273162842, "reward_std": 0.04533226788043976, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0018663115333765745, "rewards/logprob_reward/std": 0.0018212543800473213, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 680.84375, "completions/mean_terminated_length": 680.84375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.398148148148148, "grad_norm": 1.8176212772817746, "kl": 0.2724609375, "learning_rate": 1.895843798124605e-08, "loss": -0.2014, "num_tokens": 40817602.0, "reward": 0.044841744005680084, "reward_std": 0.03986024111509323, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0012130499817430973, "rewards/logprob_reward/std": 0.0013913688017055392, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 684.6875, "completions/mean_terminated_length": 673.741943359375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 4.401234567901234, "grad_norm": 1.8734393836718208, "kl": 0.230224609375, "learning_rate": 1.8767929580874863e-08, "loss": -0.2068, "num_tokens": 40845968.0, "reward": 0.0494350790977478, "reward_std": 0.03326744586229324, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0028445315547287464, "rewards/logprob_reward/std": 0.003680290887132287, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 731.96875, "completions/mean_terminated_length": 722.54833984375, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 4.404320987654321, "grad_norm": 1.83093813038457, "kl": 0.22265625, "learning_rate": 1.8578345868666996e-08, "loss": -0.1444, "num_tokens": 40875659.0, "reward": 0.05107133463025093, "reward_std": 0.04415426403284073, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0011903708800673485, "rewards/logprob_reward/std": 0.0022207444999367, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 737.59375, "completions/mean_terminated_length": 696.6785888671875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 4.407407407407407, "grad_norm": 2.1189025175553335, "kl": 0.2158203125, "learning_rate": 1.8389687602760495e-08, "loss": -0.1464, "num_tokens": 40905902.0, "reward": 0.04381987825036049, "reward_std": 0.03640786185860634, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0035498631186783314, "rewards/logprob_reward/std": 0.0034627187997102737, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 779.03125, "completions/mean_terminated_length": 744.0357666015625, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 4.410493827160494, "grad_norm": 2.3693258851612358, "kl": 0.22314453125, "learning_rate": 1.820195553759246e-08, "loss": -0.3637, "num_tokens": 40937051.0, "reward": 0.04203595221042633, "reward_std": 0.04921147972345352, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015677246265113354, "rewards/logprob_reward/std": 0.0030524819158017635, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 662.8125, "completions/mean_terminated_length": 638.7333374023438, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 4.41358024691358, "grad_norm": 2.4623191831819566, "kl": 0.257080078125, "learning_rate": 1.8015150423896203e-08, "loss": -0.2101, "num_tokens": 40965205.0, "reward": 0.05275503545999527, "reward_std": 0.044201165437698364, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0030611527618020773, "rewards/logprob_reward/std": 0.005246687680482864, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 661.53125, "completions/mean_terminated_length": 661.53125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 4.416666666666667, "grad_norm": 1.8984126600052098, "kl": 0.2613525390625, "learning_rate": 1.782927300869827e-08, "loss": -0.0581, "num_tokens": 40993094.0, "reward": 0.041716672480106354, "reward_std": 0.048243045806884766, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.004685189574956894, "rewards/logprob_reward/std": 0.005050354637205601, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 706.0, "completions/mean_terminated_length": 695.741943359375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.419753086419753, "grad_norm": 2.2240420842439685, "kl": 0.2259521484375, "learning_rate": 1.7644324035315212e-08, "loss": -0.212, "num_tokens": 41022302.0, "reward": 0.04929380118846893, "reward_std": 0.04793080687522888, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0026875524781644344, "rewards/logprob_reward/std": 0.0032262559980154037, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 742.46875, "completions/mean_terminated_length": 723.7000122070312, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 4.422839506172839, "grad_norm": 2.094867202988155, "kl": 0.23046875, "learning_rate": 1.746030424335093e-08, "loss": -0.1275, "num_tokens": 41052717.0, "reward": 0.05048564821481705, "reward_std": 0.04967077821493149, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.004011832177639008, "rewards/logprob_reward/std": 0.004363081883639097, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 740.5, "completions/mean_terminated_length": 721.6000366210938, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 4.425925925925926, "grad_norm": 1.8057482736396617, "kl": 0.221435546875, "learning_rate": 1.7277214368693423e-08, "loss": -0.1854, "num_tokens": 41082757.0, "reward": 0.04497251659631729, "reward_std": 0.047094471752643585, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0013583521358668804, "rewards/logprob_reward/std": 0.0019999288488179445, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 742.90625, "completions/mean_terminated_length": 733.8386840820312, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 4.429012345679013, "grad_norm": 2.063458027364136, "kl": 0.21728515625, "learning_rate": 1.7095055143512117e-08, "loss": -0.0462, "num_tokens": 41113226.0, "reward": 0.053193461149930954, "reward_std": 0.047015219926834106, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0035482917446643114, "rewards/logprob_reward/std": 0.005106258671730757, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 721.125, "completions/mean_terminated_length": 677.857177734375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 4.432098765432099, "grad_norm": 2.3736942178064284, "kl": 0.229248046875, "learning_rate": 1.6913827296254736e-08, "loss": -0.3049, "num_tokens": 41142422.0, "reward": 0.04816795140504837, "reward_std": 0.05544901639223099, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0014366109389811754, "rewards/logprob_reward/std": 0.0024734637700021267, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 727.9375, "completions/mean_terminated_length": 685.6428833007812, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 4.435185185185185, "grad_norm": 2.1933093433517485, "kl": 0.2091064453125, "learning_rate": 1.6733531551644503e-08, "loss": -0.1723, "num_tokens": 41172224.0, "reward": 0.042622070759534836, "reward_std": 0.04842061549425125, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002218965208157897, "rewards/logprob_reward/std": 0.0035111114848405123, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 702.21875, "completions/mean_terminated_length": 691.8386840820312, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.438271604938271, "grad_norm": 2.173028707071614, "kl": 0.2318115234375, "learning_rate": 1.655416863067713e-08, "loss": -0.1134, "num_tokens": 41201363.0, "reward": 0.0569436177611351, "reward_std": 0.056063245981931686, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0042429035529494286, "rewards/logprob_reward/std": 0.003971173893660307, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 691.375, "completions/mean_terminated_length": 669.2000122070312, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.441358024691358, "grad_norm": 1.9605198610090238, "kl": 0.2335205078125, "learning_rate": 1.637573925061808e-08, "loss": -0.1832, "num_tokens": 41229963.0, "reward": 0.030255049467086792, "reward_std": 0.0344797745347023, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.002366722794249654, "rewards/logprob_reward/std": 0.002940770238637924, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 746.0625, "completions/mean_terminated_length": 737.0967407226562, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 4.444444444444445, "grad_norm": 3.097304803998429, "kl": 0.2078857421875, "learning_rate": 1.6198244124999592e-08, "loss": -0.4188, "num_tokens": 41260737.0, "reward": 0.04235662519931793, "reward_std": 0.04883340001106262, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0019240305991843343, "rewards/logprob_reward/std": 0.002048332476988435, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 722.03125, "completions/mean_terminated_length": 678.8928833007812, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 4.447530864197531, "grad_norm": 2.339085377303552, "kl": 0.244140625, "learning_rate": 1.6021683963617805e-08, "loss": -0.3539, "num_tokens": 41290606.0, "reward": 0.05370953679084778, "reward_std": 0.05755479633808136, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0041217077523469925, "rewards/logprob_reward/std": 0.006467892322689295, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 769.6875, "completions/mean_terminated_length": 733.357177734375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 4.450617283950617, "grad_norm": 2.992079515886702, "kl": 0.22802734375, "learning_rate": 1.5846059472530122e-08, "loss": -0.3015, "num_tokens": 41322048.0, "reward": 0.036230962723493576, "reward_std": 0.04625852778553963, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0020621800795197487, "rewards/logprob_reward/std": 0.0030165647622197866, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 726.78125, "completions/mean_terminated_length": 706.9666748046875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.453703703703704, "grad_norm": 1.86928224847978, "kl": 0.2103271484375, "learning_rate": 1.5671371354051997e-08, "loss": -0.056, "num_tokens": 41351797.0, "reward": 0.06443355232477188, "reward_std": 0.04753619059920311, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0021483921445906162, "rewards/logprob_reward/std": 0.0018268699059262872, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 714.4375, "completions/mean_terminated_length": 714.4375, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 4.45679012345679, "grad_norm": 1.9845127790703219, "kl": 0.228515625, "learning_rate": 1.5497620306754582e-08, "loss": -0.0882, "num_tokens": 41381399.0, "reward": 0.0543513149023056, "reward_std": 0.04788962006568909, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0048347944393754005, "rewards/logprob_reward/std": 0.005204932298511267, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 704.5625, "completions/mean_terminated_length": 694.258056640625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.459876543209877, "grad_norm": 1.830456933116766, "kl": 0.2293701171875, "learning_rate": 1.5324807025461656e-08, "loss": -0.0102, "num_tokens": 41410661.0, "reward": 0.06272735446691513, "reward_std": 0.041232675313949585, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003724841633811593, "rewards/logprob_reward/std": 0.003868513973429799, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 741.375, "completions/mean_terminated_length": 647.1666870117188, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.462962962962963, "grad_norm": 1.6299599810837322, "kl": 0.221923828125, "learning_rate": 1.515293220124683e-08, "loss": -0.0934, "num_tokens": 41440813.0, "reward": 0.058781467378139496, "reward_std": 0.040851958096027374, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0028127399273216724, "rewards/logprob_reward/std": 0.003479891689494252, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 657.75, "completions/mean_terminated_length": 633.3333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 4.466049382716049, "grad_norm": 1.952834187605714, "kl": 0.25390625, "learning_rate": 1.498199652143092e-08, "loss": -0.0952, "num_tokens": 41468229.0, "reward": 0.03513897582888603, "reward_std": 0.035858407616615295, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008488652529194951, "rewards/logprob_reward/std": 0.0015317859360948205, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 713.78125, "completions/mean_terminated_length": 693.1000366210938, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.469135802469136, "grad_norm": 2.1990872625180407, "kl": 0.22412109375, "learning_rate": 1.4812000669579188e-08, "loss": -0.0744, "num_tokens": 41497506.0, "reward": 0.04628187045454979, "reward_std": 0.04848606139421463, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0028131892904639244, "rewards/logprob_reward/std": 0.003807139117270708, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 719.3125, "completions/mean_terminated_length": 687.7930908203125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.472222222222222, "grad_norm": 2.709511431835891, "kl": 0.2344970703125, "learning_rate": 1.4642945325498507e-08, "loss": -0.3521, "num_tokens": 41526672.0, "reward": 0.05511452257633209, "reward_std": 0.05433478578925133, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0022105767857283354, "rewards/logprob_reward/std": 0.0026712631806731224, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 775.1875, "completions/mean_terminated_length": 767.1612548828125, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 4.4753086419753085, "grad_norm": 1.846995978285614, "kl": 0.2335205078125, "learning_rate": 1.4474831165234707e-08, "loss": -0.0631, "num_tokens": 41558186.0, "reward": 0.03356396034359932, "reward_std": 0.043709829449653625, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0025710673071444035, "rewards/logprob_reward/std": 0.005059478338807821, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 716.34375, "completions/mean_terminated_length": 695.8333740234375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 4.478395061728395, "grad_norm": 2.5699828993214893, "kl": 0.22216796875, "learning_rate": 1.4307658861069799e-08, "loss": -0.2157, "num_tokens": 41587285.0, "reward": 0.06077142804861069, "reward_std": 0.0416644886136055, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0015515873674303293, "rewards/logprob_reward/std": 0.0024837180972099304, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 734.71875, "completions/mean_terminated_length": 681.1481323242188, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 4.481481481481482, "grad_norm": 1.9556211887061505, "kl": 0.212890625, "learning_rate": 1.414142908151944e-08, "loss": -0.0813, "num_tokens": 41617472.0, "reward": 0.03968391567468643, "reward_std": 0.041357140988111496, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0024265716783702374, "rewards/logprob_reward/std": 0.003852495225146413, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 675.0, "completions/mean_terminated_length": 663.741943359375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.484567901234568, "grad_norm": 1.8457607288324611, "kl": 0.251708984375, "learning_rate": 1.3976142491330111e-08, "loss": -0.0615, "num_tokens": 41645440.0, "reward": 0.042487286031246185, "reward_std": 0.04263980686664581, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0020692090038210154, "rewards/logprob_reward/std": 0.0031202638056129217, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 706.28125, "completions/mean_terminated_length": 696.0322265625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.487654320987654, "grad_norm": 1.8081229291005072, "kl": 0.2325439453125, "learning_rate": 1.3811799751476588e-08, "loss": -0.1682, "num_tokens": 41674581.0, "reward": 0.04569516330957413, "reward_std": 0.046075932681560516, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002161290729418397, "rewards/logprob_reward/std": 0.0034060017205774784, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 745.71875, "completions/mean_terminated_length": 716.9310302734375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.4907407407407405, "grad_norm": 2.086739826205387, "kl": 0.2305908203125, "learning_rate": 1.3648401519159109e-08, "loss": -0.1724, "num_tokens": 41705036.0, "reward": 0.05555104464292526, "reward_std": 0.05617810785770416, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0026956028304994106, "rewards/logprob_reward/std": 0.0028205085545778275, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 709.15625, "completions/mean_terminated_length": 699.0, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 4.493827160493828, "grad_norm": 1.8750074933357899, "kl": 0.2535400390625, "learning_rate": 1.348594844780096e-08, "loss": -0.1478, "num_tokens": 41734553.0, "reward": 0.04616174101829529, "reward_std": 0.027845583856105804, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0026797144673764706, "rewards/logprob_reward/std": 0.0039057082030922174, "step": 1456 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 710.84375, "completions/mean_terminated_length": 700.741943359375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 4.496913580246914, "grad_norm": 2.1763105296110963, "kl": NaN, "learning_rate": 1.332444118704576e-08, "loss": -0.0882, "num_tokens": 41763700.0, "reward": 0.03868158906698227, "reward_std": 0.039767876267433167, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0013128730934113264, "rewards/logprob_reward/std": 0.002047280315309763, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 722.875, "completions/mean_terminated_length": 667.1111450195312, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.5, "grad_norm": 2.353725973027106, "kl": 0.238525390625, "learning_rate": 1.3163880382754761e-08, "loss": -0.2077, "num_tokens": 41793028.0, "reward": 0.05095834285020828, "reward_std": 0.05258800834417343, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0010648233583196998, "rewards/logprob_reward/std": 0.001401307643391192, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 661.59375, "completions/mean_terminated_length": 649.9031982421875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 4.503086419753086, "grad_norm": 2.6339210231801538, "kl": 0.266357421875, "learning_rate": 1.3004266677004522e-08, "loss": -0.1457, "num_tokens": 41820539.0, "reward": 0.03324113041162491, "reward_std": 0.0461549237370491, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0022123625967651606, "rewards/logprob_reward/std": 0.0030606554355472326, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 694.9375, "completions/mean_terminated_length": 684.3225708007812, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 4.506172839506172, "grad_norm": 3.6026827986687056, "kl": 0.2335205078125, "learning_rate": 1.2845600708084076e-08, "loss": -0.2701, "num_tokens": 41849217.0, "reward": 0.04239705204963684, "reward_std": 0.04868008941411972, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001968946773558855, "rewards/logprob_reward/std": 0.002376778516918421, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 745.96875, "completions/mean_terminated_length": 706.2500610351562, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 4.5092592592592595, "grad_norm": 2.2401348755430135, "kl": 0.2501220703125, "learning_rate": 1.2687883110492515e-08, "loss": -0.0612, "num_tokens": 41879856.0, "reward": 0.035798899829387665, "reward_std": 0.03567592054605484, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0015821070410311222, "rewards/logprob_reward/std": 0.003936965949833393, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 674.34375, "completions/mean_terminated_length": 674.34375, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.512345679012346, "grad_norm": 1.7618294060687367, "kl": 0.249267578125, "learning_rate": 1.2531114514936491e-08, "loss": -0.2088, "num_tokens": 41908243.0, "reward": 0.049269404262304306, "reward_std": 0.042050741612911224, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002660449594259262, "rewards/logprob_reward/std": 0.00429100776091218, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 657.0, "completions/mean_terminated_length": 645.1612548828125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 4.515432098765432, "grad_norm": 2.331651801288181, "kl": 0.2408447265625, "learning_rate": 1.2375295548327557e-08, "loss": -0.0584, "num_tokens": 41935819.0, "reward": 0.07170861214399338, "reward_std": 0.04488102346658707, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.003287344705313444, "rewards/logprob_reward/std": 0.004149049986153841, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 768.28125, "completions/mean_terminated_length": 741.8275756835938, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 4.518518518518518, "grad_norm": 3.652022522373431, "kl": 0.218017578125, "learning_rate": 1.222042683377983e-08, "loss": -0.285, "num_tokens": 41966936.0, "reward": 0.03882598876953125, "reward_std": 0.04753109812736511, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0014733191346749663, "rewards/logprob_reward/std": 0.0019834123086184263, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 677.59375, "completions/mean_terminated_length": 666.4193115234375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 4.521604938271605, "grad_norm": 2.261481769322289, "kl": 0.23388671875, "learning_rate": 1.2066508990607293e-08, "loss": -0.117, "num_tokens": 41994939.0, "reward": 0.054833658039569855, "reward_std": 0.034459188580513, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018985085189342499, "rewards/logprob_reward/std": 0.0021323813125491142, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 702.9375, "completions/mean_terminated_length": 681.5333862304688, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 4.5246913580246915, "grad_norm": 1.9807012356553644, "kl": 0.22998046875, "learning_rate": 1.1913542634321538e-08, "loss": -0.1366, "num_tokens": 42024409.0, "reward": 0.04338686540722847, "reward_std": 0.049047164618968964, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0030687383841723204, "rewards/logprob_reward/std": 0.004367289133369923, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 700.59375, "completions/mean_terminated_length": 690.1612548828125, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.527777777777778, "grad_norm": 1.927326373765374, "kl": 0.2569580078125, "learning_rate": 1.1761528376629137e-08, "loss": -0.1766, "num_tokens": 42053420.0, "reward": 0.046754371374845505, "reward_std": 0.04168115183711052, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003338192356750369, "rewards/logprob_reward/std": 0.005756665021181107, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 706.65625, "completions/mean_terminated_length": 673.8275756835938, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 4.530864197530864, "grad_norm": 5.224802349532697, "kl": 0.2313232421875, "learning_rate": 1.1610466825429182e-08, "loss": -0.4865, "num_tokens": 42082273.0, "reward": 0.045427750796079636, "reward_std": 0.05316973477602005, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0018641665810719132, "rewards/logprob_reward/std": 0.002744182012975216, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 723.84375, "completions/mean_terminated_length": 680.9642944335938, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 4.533950617283951, "grad_norm": 2.165261482173354, "kl": 0.228759765625, "learning_rate": 1.1460358584811091e-08, "loss": -0.2355, "num_tokens": 42111716.0, "reward": 0.05476970225572586, "reward_std": 0.04580404981970787, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018274460453540087, "rewards/logprob_reward/std": 0.003082863288000226, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 711.34375, "completions/mean_terminated_length": 690.5000610351562, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 4.537037037037037, "grad_norm": 2.2478714434765923, "kl": 0.3079833984375, "learning_rate": 1.1311204255051942e-08, "loss": -0.3927, "num_tokens": 42140659.0, "reward": 0.05164527893066406, "reward_std": 0.05694463104009628, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001828084234148264, "rewards/logprob_reward/std": 0.0025435511488467455, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 660.09375, "completions/mean_terminated_length": 660.09375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.540123456790123, "grad_norm": 2.1010252200970534, "kl": 0.2474365234375, "learning_rate": 1.116300443261417e-08, "loss": -0.0856, "num_tokens": 42167706.0, "reward": 0.05780253931879997, "reward_std": 0.047235894948244095, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017250396776944399, "rewards/logprob_reward/std": 0.0027445878367871046, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 705.125, "completions/mean_terminated_length": 694.8386840820312, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 4.54320987654321, "grad_norm": 2.129030236758863, "kl": 0.2327880859375, "learning_rate": 1.1015759710143124e-08, "loss": -0.118, "num_tokens": 42196554.0, "reward": 0.05206025391817093, "reward_std": 0.03269369155168533, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002289172261953354, "rewards/logprob_reward/std": 0.0026913422625511885, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 702.625, "completions/mean_terminated_length": 692.258056640625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.546296296296296, "grad_norm": 2.0399069526424154, "kl": 0.2205810546875, "learning_rate": 1.0869470676464848e-08, "loss": -0.1414, "num_tokens": 42225342.0, "reward": 0.04880621284246445, "reward_std": 0.0432891882956028, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0021457888651639223, "rewards/logprob_reward/std": 0.0022100957576185465, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 736.59375, "completions/mean_terminated_length": 706.862060546875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 4.549382716049383, "grad_norm": 2.563073558337134, "kl": 0.24822998046875, "learning_rate": 1.0724137916583525e-08, "loss": -0.2259, "num_tokens": 42255797.0, "reward": 0.027438707649707794, "reward_std": 0.03702589124441147, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4399413466453552, "rewards/logprob_reward/mean": 0.0027096762787550688, "rewards/logprob_reward/std": 0.0035238105338066816, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 672.78125, "completions/mean_terminated_length": 649.36669921875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 4.552469135802469, "grad_norm": 2.1662622318861273, "kl": 0.2301025390625, "learning_rate": 1.0579762011679317e-08, "loss": -0.2272, "num_tokens": 42283374.0, "reward": 0.0521787628531456, "reward_std": 0.05530615895986557, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002420843578875065, "rewards/logprob_reward/std": 0.0027172542177140713, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 647.9375, "completions/mean_terminated_length": 647.9375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 4.555555555555555, "grad_norm": 2.5338564161310284, "kl": 0.2235107421875, "learning_rate": 1.0436343539105857e-08, "loss": -0.4645, "num_tokens": 42310576.0, "reward": 0.05766887590289116, "reward_std": 0.047901451587677, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0015765284188091755, "rewards/logprob_reward/std": 0.0021711974404752254, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 732.125, "completions/mean_terminated_length": 701.9310302734375, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 4.5586419753086425, "grad_norm": 1.7270498962110772, "kl": 0.1954345703125, "learning_rate": 1.0293883072388154e-08, "loss": -0.0339, "num_tokens": 42340660.0, "reward": 0.05205375701189041, "reward_std": 0.04099983721971512, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0022819519508630037, "rewards/logprob_reward/std": 0.0033365946728736162, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 731.0, "completions/mean_terminated_length": 711.4666748046875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 4.561728395061729, "grad_norm": 2.6254814053042845, "kl": 0.3048095703125, "learning_rate": 1.015238118122011e-08, "loss": -0.1352, "num_tokens": 42370484.0, "reward": 0.04639824107289314, "reward_std": 0.05790696293115616, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0029424885287880898, "rewards/logprob_reward/std": 0.010873885825276375, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 729.90625, "completions/mean_terminated_length": 710.300048828125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.564814814814815, "grad_norm": 2.143129440770457, "kl": 0.2186279296875, "learning_rate": 1.0011838431462389e-08, "loss": -0.2439, "num_tokens": 42400293.0, "reward": 0.049573834985494614, "reward_std": 0.05485403537750244, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0029987043235450983, "rewards/logprob_reward/std": 0.0052993991412222385, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 769.3125, "completions/mean_terminated_length": 722.1481323242188, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 4.567901234567901, "grad_norm": 2.194235872064849, "kl": 0.190185546875, "learning_rate": 9.872255385140027e-09, "loss": -0.2519, "num_tokens": 42431567.0, "reward": 0.03871108219027519, "reward_std": 0.04252662882208824, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0013456502929329872, "rewards/logprob_reward/std": 0.001864749239757657, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 705.5, "completions/mean_terminated_length": 695.2257690429688, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.570987654320987, "grad_norm": 1.7435056472001802, "kl": 0.247314453125, "learning_rate": 9.733632600440245e-09, "loss": -0.1419, "num_tokens": 42460563.0, "reward": 0.0358835905790329, "reward_std": 0.02217365801334381, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0016762100858613849, "rewards/logprob_reward/std": 0.002464568242430687, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 711.78125, "completions/mean_terminated_length": 667.1785888671875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 4.574074074074074, "grad_norm": 2.990968568617281, "kl": 0.2244873046875, "learning_rate": 9.595970631710248e-09, "loss": -0.3499, "num_tokens": 42489456.0, "reward": 0.028968021273612976, "reward_std": 0.029035020619630814, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0009366908925585449, "rewards/logprob_reward/std": 0.0016238440293818712, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 659.5625, "completions/mean_terminated_length": 659.5625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.577160493827161, "grad_norm": 2.202255369039433, "kl": 0.24755859375, "learning_rate": 9.459270029454986e-09, "loss": -0.0597, "num_tokens": 42517030.0, "reward": 0.05343109369277954, "reward_std": 0.03393975645303726, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.003812328912317753, "rewards/logprob_reward/std": 0.004090317524969578, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 693.59375, "completions/mean_terminated_length": 682.9354858398438, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 4.580246913580247, "grad_norm": 2.18173859494406, "kl": 0.2265625, "learning_rate": 9.323531340334868e-09, "loss": -0.1217, "num_tokens": 42546053.0, "reward": 0.049290746450424194, "reward_std": 0.04943094775080681, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002684164559468627, "rewards/logprob_reward/std": 0.00423125596717, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 666.21875, "completions/mean_terminated_length": 654.6774291992188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 4.583333333333333, "grad_norm": 2.067985335920416, "kl": 0.2274169921875, "learning_rate": 9.188755107163743e-09, "loss": -0.1437, "num_tokens": 42573588.0, "reward": 0.04903118312358856, "reward_std": 0.04188349097967148, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002395759802311659, "rewards/logprob_reward/std": 0.00443643145263195, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 688.34375, "completions/mean_terminated_length": 626.1851806640625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.58641975308642, "grad_norm": 1.5822348621800693, "kl": 0.231689453125, "learning_rate": 9.054941868906513e-09, "loss": -0.0769, "num_tokens": 42601931.0, "reward": 0.0462818518280983, "reward_std": 0.03339197486639023, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002813167404383421, "rewards/logprob_reward/std": 0.007091212552040815, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 746.40625, "completions/mean_terminated_length": 727.9000244140625, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 4.589506172839506, "grad_norm": 2.276546124105462, "kl": 0.2197265625, "learning_rate": 8.922092160677242e-09, "loss": -0.2447, "num_tokens": 42632344.0, "reward": 0.055323585867881775, "reward_std": 0.051866527646780014, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002442875411361456, "rewards/logprob_reward/std": 0.0038433405570685863, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 673.15625, "completions/mean_terminated_length": 661.8386840820312, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 4.592592592592593, "grad_norm": 3.684538766166176, "kl": 0.2410888671875, "learning_rate": 8.79020651373677e-09, "loss": -0.2779, "num_tokens": 42660513.0, "reward": 0.06448155641555786, "reward_std": 0.04148316755890846, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002201724797487259, "rewards/logprob_reward/std": 0.00433498527854681, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 702.4375, "completions/mean_terminated_length": 669.1724243164062, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.595679012345679, "grad_norm": 2.1059833651414617, "kl": 0.235107421875, "learning_rate": 8.659285455490745e-09, "loss": 0.0114, "num_tokens": 42689363.0, "reward": 0.05539703741669655, "reward_std": 0.04169042780995369, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0025244837161153555, "rewards/logprob_reward/std": 0.0028246596921235323, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 666.46875, "completions/mean_terminated_length": 666.46875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 4.598765432098766, "grad_norm": 3.542601292283675, "kl": 0.277587890625, "learning_rate": 8.529329509487455e-09, "loss": -0.2771, "num_tokens": 42717690.0, "reward": 0.04708962142467499, "reward_std": 0.041435856372117996, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0037106885574758053, "rewards/logprob_reward/std": 0.005584856029599905, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 762.96875, "completions/mean_terminated_length": 725.6785888671875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 4.601851851851852, "grad_norm": 2.1466412930937917, "kl": 0.2491455078125, "learning_rate": 8.400339195415718e-09, "loss": -0.0973, "num_tokens": 42748813.0, "reward": 0.02977364882826805, "reward_std": 0.041639409959316254, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0018318291986361146, "rewards/logprob_reward/std": 0.0028566711116582155, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 662.3125, "completions/mean_terminated_length": 662.3125, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.604938271604938, "grad_norm": 1.937843807352332, "kl": 0.257568359375, "learning_rate": 8.272315029102888e-09, "loss": -0.2336, "num_tokens": 42776471.0, "reward": 0.07350407540798187, "reward_std": 0.04663775861263275, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0018100799061357975, "rewards/logprob_reward/std": 0.0023184253368526697, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 725.375, "completions/mean_terminated_length": 705.4666748046875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 4.6080246913580245, "grad_norm": 2.548582697170482, "kl": 0.2386474609375, "learning_rate": 8.145257522512606e-09, "loss": -0.2308, "num_tokens": 42806551.0, "reward": 0.03310152143239975, "reward_std": 0.048257678747177124, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0020572440698742867, "rewards/logprob_reward/std": 0.003052032319828868, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 727.65625, "completions/mean_terminated_length": 685.3214721679688, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.611111111111111, "grad_norm": 1.8255106970483308, "kl": 0.214599609375, "learning_rate": 8.019167183743041e-09, "loss": -0.1165, "num_tokens": 42836004.0, "reward": 0.04193819314241409, "reward_std": 0.05020206421613693, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0014591040089726448, "rewards/logprob_reward/std": 0.0024056523106992245, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 697.1875, "completions/mean_terminated_length": 675.4000244140625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.614197530864198, "grad_norm": 1.8193479924266922, "kl": 0.2515869140625, "learning_rate": 7.89404451702455e-09, "loss": -0.0577, "num_tokens": 42864538.0, "reward": 0.04537242278456688, "reward_std": 0.034791670739650726, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0018026924226433039, "rewards/logprob_reward/std": 0.002865416230633855, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 710.9375, "completions/mean_terminated_length": 678.5516967773438, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 4.617283950617284, "grad_norm": 2.132767058374767, "kl": 0.2283935546875, "learning_rate": 7.769890022717884e-09, "loss": -0.2006, "num_tokens": 42893748.0, "reward": 0.04399431496858597, "reward_std": 0.03478696942329407, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003743681125342846, "rewards/logprob_reward/std": 0.0048117870464921, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 714.6875, "completions/mean_terminated_length": 694.0667114257812, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 4.62037037037037, "grad_norm": 2.2418023665666853, "kl": 0.2222900390625, "learning_rate": 7.646704197312143e-09, "loss": -0.1643, "num_tokens": 42923290.0, "reward": 0.04454777017235756, "reward_std": 0.043588101863861084, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.004358632955700159, "rewards/logprob_reward/std": 0.006084616295993328, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 711.125, "completions/mean_terminated_length": 690.2667236328125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 4.6234567901234565, "grad_norm": 3.553604400037332, "kl": 0.2537841796875, "learning_rate": 7.524487533422635e-09, "loss": -0.3947, "num_tokens": 42952566.0, "reward": 0.03583184629678726, "reward_std": 0.0420500822365284, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0016187188448384404, "rewards/logprob_reward/std": 0.0026539175305515528, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 755.0, "completions/mean_terminated_length": 746.3225708007812, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 4.6265432098765435, "grad_norm": 2.144779694803398, "kl": 0.2220458984375, "learning_rate": 7.403240519789161e-09, "loss": -0.1333, "num_tokens": 42983842.0, "reward": 0.04972454160451889, "reward_std": 0.035087473690509796, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0031661540269851685, "rewards/logprob_reward/std": 0.003860093653202057, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 732.59375, "completions/mean_terminated_length": 713.1666870117188, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 4.62962962962963, "grad_norm": 1.9280861009947392, "kl": 0.2061767578125, "learning_rate": 7.282963641273842e-09, "loss": -0.1129, "num_tokens": 43013929.0, "reward": 0.05114641785621643, "reward_std": 0.053203750401735306, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0012737988727167249, "rewards/logprob_reward/std": 0.002420877804979682, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 666.625, "completions/mean_terminated_length": 642.800048828125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 4.632716049382716, "grad_norm": 3.0161430734232613, "kl": 0.231201171875, "learning_rate": 7.163657378859267e-09, "loss": -0.4704, "num_tokens": 43042165.0, "reward": 0.03871387615799904, "reward_std": 0.047514379024505615, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.001348750782199204, "rewards/logprob_reward/std": 0.002594175050035119, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 693.1875, "completions/mean_terminated_length": 682.51611328125, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.635802469135802, "grad_norm": 1.938330864772763, "kl": 0.2303466796875, "learning_rate": 7.045322209646654e-09, "loss": -0.2025, "num_tokens": 43070563.0, "reward": 0.06738285720348358, "reward_std": 0.027124961838126183, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.001953174127265811, "rewards/logprob_reward/std": 0.0018129246309399605, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 651.78125, "completions/mean_terminated_length": 639.774169921875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 4.638888888888889, "grad_norm": 2.135924664049642, "kl": 0.2335205078125, "learning_rate": 6.927958606853746e-09, "loss": -0.0142, "num_tokens": 43097824.0, "reward": 0.04533563554286957, "reward_std": 0.05339189991354942, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017618188867345452, "rewards/logprob_reward/std": 0.0021908129565417767, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 646.9375, "completions/mean_terminated_length": 634.774169921875, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 4.6419753086419755, "grad_norm": 2.320433802131267, "kl": 0.2471923828125, "learning_rate": 6.811567039813087e-09, "loss": -0.1788, "num_tokens": 43124434.0, "reward": 0.05148729309439659, "reward_std": 0.04719836264848709, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0016525487881153822, "rewards/logprob_reward/std": 0.0019936866592615843, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 744.78125, "completions/mean_terminated_length": 715.8965454101562, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.645061728395062, "grad_norm": 2.1228448380201153, "kl": 0.2081298828125, "learning_rate": 6.696147973970112e-09, "loss": -0.0033, "num_tokens": 43154599.0, "reward": 0.05481375381350517, "reward_std": 0.05460665374994278, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0018763924017548561, "rewards/logprob_reward/std": 0.0029246618505567312, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 719.65625, "completions/mean_terminated_length": 709.8386840820312, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.648148148148148, "grad_norm": 2.436848909203649, "kl": 0.2147216796875, "learning_rate": 6.581701870881196e-09, "loss": -0.3078, "num_tokens": 43183764.0, "reward": 0.051449697464704514, "reward_std": 0.046063363552093506, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0016107733827084303, "rewards/logprob_reward/std": 0.002504688920453191, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 650.3225708007812, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 4.651234567901234, "grad_norm": 2.014260519970987, "kl": 0.229248046875, "learning_rate": 6.4682291882119375e-09, "loss": -0.1049, "num_tokens": 43211264.0, "reward": 0.07040219008922577, "reward_std": 0.0403471514582634, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0018357643857598305, "rewards/logprob_reward/std": 0.0019348099594935775, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 676.71875, "completions/mean_terminated_length": 640.7930908203125, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.654320987654321, "grad_norm": 2.460559046510438, "kl": 0.26416015625, "learning_rate": 6.355730379735219e-09, "loss": -0.0752, "num_tokens": 43239091.0, "reward": 0.04869456589221954, "reward_std": 0.04262574017047882, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0020217373967170715, "rewards/logprob_reward/std": 0.005422488786280155, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 711.09375, "completions/mean_terminated_length": 666.3928833007812, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 4.657407407407407, "grad_norm": 2.293936676742903, "kl": 0.2314453125, "learning_rate": 6.244205895329452e-09, "loss": -0.0937, "num_tokens": 43268330.0, "reward": 0.06514693796634674, "reward_std": 0.04283153638243675, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002941043581813574, "rewards/logprob_reward/std": 0.002941833809018135, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.660493827160494, "grad_norm": 2.3794356247810753, "kl": 0.2310791015625, "learning_rate": 6.133656180976776e-09, "loss": -0.2736, "num_tokens": 43296014.0, "reward": 0.04233257472515106, "reward_std": 0.027813980355858803, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.001897301059216261, "rewards/logprob_reward/std": 0.0024011824280023575, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 631.78125, "completions/mean_terminated_length": 631.78125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 4.66358024691358, "grad_norm": 2.207114514188508, "kl": 0.2685546875, "learning_rate": 6.024081678761228e-09, "loss": -0.1798, "num_tokens": 43322323.0, "reward": 0.05467233061790466, "reward_std": 0.04698755964636803, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0017192568629980087, "rewards/logprob_reward/std": 0.002006774302572012, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 756.71875, "completions/mean_terminated_length": 681.8800048828125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.666666666666667, "grad_norm": 4.018916638189679, "kl": 0.250244140625, "learning_rate": 5.915482826867047e-09, "loss": -0.4026, "num_tokens": 43353330.0, "reward": 0.048289455473423004, "reward_std": 0.04794459789991379, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001571614877320826, "rewards/logprob_reward/std": 0.004118291661143303, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 713.71875, "completions/mean_terminated_length": 693.0333862304688, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.669753086419753, "grad_norm": 2.683404102549931, "kl": 0.220703125, "learning_rate": 5.807860059576841e-09, "loss": -0.3136, "num_tokens": 43382557.0, "reward": 0.05862727761268616, "reward_std": 0.04001375660300255, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0026414182502776384, "rewards/logprob_reward/std": 0.005117638502269983, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 781.03125, "completions/mean_terminated_length": 755.8965454101562, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 4.672839506172839, "grad_norm": 2.159997404539417, "kl": 0.199462890625, "learning_rate": 5.701213807269956e-09, "loss": -0.2434, "num_tokens": 43413994.0, "reward": 0.055472537875175476, "reward_std": 0.04255915433168411, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0026083746924996376, "rewards/logprob_reward/std": 0.004415425937622786, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 707.9375, "completions/mean_terminated_length": 697.741943359375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 4.675925925925926, "grad_norm": 2.621603568875648, "kl": 0.2293701171875, "learning_rate": 5.5955444964206345e-09, "loss": -0.2414, "num_tokens": 43442832.0, "reward": 0.05145654082298279, "reward_std": 0.03780600428581238, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.001618378795683384, "rewards/logprob_reward/std": 0.0022882563062012196, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 689.8125, "completions/mean_terminated_length": 689.8125, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 4.679012345679013, "grad_norm": 1.8923983034097638, "kl": 0.225830078125, "learning_rate": 5.490852549596387e-09, "loss": -0.1754, "num_tokens": 43471962.0, "reward": 0.050171319395303726, "reward_std": 0.04255622625350952, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0036625780630856752, "rewards/logprob_reward/std": 0.004384476691484451, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 738.8125, "completions/mean_terminated_length": 686.0, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.682098765432099, "grad_norm": 1.8941827081878828, "kl": 0.2362060546875, "learning_rate": 5.387138385456319e-09, "loss": -0.0436, "num_tokens": 43501872.0, "reward": 0.05162039399147034, "reward_std": 0.02073848992586136, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.00180044153239578, "rewards/logprob_reward/std": 0.002051715040579438, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 758.46875, "completions/mean_terminated_length": 731.0, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 4.685185185185185, "grad_norm": 2.2133668662481414, "kl": 0.2122802734375, "learning_rate": 5.284402418749362e-09, "loss": -0.1702, "num_tokens": 43532927.0, "reward": 0.050726257264614105, "reward_std": 0.04197445139288902, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.004279176238924265, "rewards/logprob_reward/std": 0.007665267214179039, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 723.5, "completions/mean_terminated_length": 667.8518676757812, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 4.688271604938271, "grad_norm": 2.005056288520359, "kl": 0.23681640625, "learning_rate": 5.182645060312685e-09, "loss": -0.1067, "num_tokens": 43562927.0, "reward": 0.033513665199279785, "reward_std": 0.027011813595891, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0025151814334094524, "rewards/logprob_reward/std": 0.004920200444757938, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 690.34375, "completions/mean_terminated_length": 690.34375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.6913580246913575, "grad_norm": 2.57913775796617, "kl": 0.2568359375, "learning_rate": 5.081866717070088e-09, "loss": -0.4261, "num_tokens": 43591182.0, "reward": 0.043917812407016754, "reward_std": 0.034200187772512436, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003658680710941553, "rewards/logprob_reward/std": 0.004496648907661438, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 735.8125, "completions/mean_terminated_length": 694.6428833007812, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 4.694444444444445, "grad_norm": 2.029400397554303, "kl": 0.214111328125, "learning_rate": 4.9820677920302534e-09, "loss": -0.1665, "num_tokens": 43621208.0, "reward": 0.06271371245384216, "reward_std": 0.03440491110086441, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.003709673648700118, "rewards/logprob_reward/std": 0.003139941254630685, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 671.53125, "completions/mean_terminated_length": 660.1612548828125, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 4.697530864197531, "grad_norm": 2.308131113431546, "kl": 0.2225341796875, "learning_rate": 4.883248684285302e-09, "loss": -0.2318, "num_tokens": 43649165.0, "reward": 0.05717732012271881, "reward_std": 0.05596880614757538, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.004502573981881142, "rewards/logprob_reward/std": 0.005215025972574949, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 754.375, "completions/mean_terminated_length": 736.4000244140625, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 4.700617283950617, "grad_norm": 1.9382971045463089, "kl": 0.2247314453125, "learning_rate": 4.785409789008988e-09, "loss": -0.3379, "num_tokens": 43679437.0, "reward": 0.03940431401133537, "reward_std": 0.04100026935338974, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.00211590644903481, "rewards/logprob_reward/std": 0.0024238715413957834, "step": 1523 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 754.84375, "completions/mean_terminated_length": 649.521728515625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 4.703703703703704, "grad_norm": 3.7134581259618886, "kl": NaN, "learning_rate": 4.68855149745534e-09, "loss": -0.228, "num_tokens": 43710480.0, "reward": 0.03510964661836624, "reward_std": 0.02885759249329567, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0008162733865901828, "rewards/logprob_reward/std": 0.0019914295990020037, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 705.0625, "completions/mean_terminated_length": 672.0689697265625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.70679012345679, "grad_norm": 3.5558333106059634, "kl": 0.234375, "learning_rate": 4.592674196956914e-09, "loss": -0.3241, "num_tokens": 43739658.0, "reward": 0.04232990741729736, "reward_std": 0.050770103931427, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00189433922059834, "rewards/logprob_reward/std": 0.0025692505296319723, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 703.75, "completions/mean_terminated_length": 670.6206665039062, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 4.709876543209877, "grad_norm": 2.982549396593798, "kl": 0.2117919921875, "learning_rate": 4.497778270923374e-09, "loss": -0.3901, "num_tokens": 43768566.0, "reward": 0.05457794666290283, "reward_std": 0.04171760752797127, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0016143880784511566, "rewards/logprob_reward/std": 0.002547757001593709, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 691.34375, "completions/mean_terminated_length": 680.6128540039062, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 4.712962962962963, "grad_norm": 1.8374788501829256, "kl": 0.230712890625, "learning_rate": 4.403864098839833e-09, "loss": -0.0062, "num_tokens": 43796969.0, "reward": 0.05811849236488342, "reward_std": 0.03246118128299713, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.002076097298413515, "rewards/logprob_reward/std": 0.002303823595866561, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 689.71875, "completions/mean_terminated_length": 678.9354858398438, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 4.716049382716049, "grad_norm": 2.0907773916798154, "kl": 0.227783203125, "learning_rate": 4.31093205626551e-09, "loss": -0.1669, "num_tokens": 43825252.0, "reward": 0.061343710869550705, "reward_std": 0.05555129051208496, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0021874543745070696, "rewards/logprob_reward/std": 0.0020785327069461346, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 653.6875, "completions/mean_terminated_length": 653.6875, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 4.719135802469136, "grad_norm": 2.5871332535777176, "kl": 0.2325439453125, "learning_rate": 4.218982514832048e-09, "loss": -0.401, "num_tokens": 43852822.0, "reward": 0.04688587039709091, "reward_std": 0.0505446121096611, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003484301269054413, "rewards/logprob_reward/std": 0.004275031853467226, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 702.71875, "completions/mean_terminated_length": 692.3547973632812, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 4.722222222222222, "grad_norm": 2.9221650012489384, "kl": 0.2108154296875, "learning_rate": 4.128015842242122e-09, "loss": -0.2255, "num_tokens": 43881733.0, "reward": 0.04261261597275734, "reward_std": 0.04130998253822327, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002208464778959751, "rewards/logprob_reward/std": 0.00345027563162148, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 749.09375, "completions/mean_terminated_length": 720.6551513671875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 4.7253086419753085, "grad_norm": 2.404028400713213, "kl": 0.2393798828125, "learning_rate": 4.0380324022679935e-09, "loss": -0.281, "num_tokens": 43913168.0, "reward": 0.049170732498168945, "reward_std": 0.051856983453035355, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.019911926239728928, "rewards/logprob_reward/std": 0.05080845579504967, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 737.53125, "completions/mean_terminated_length": 728.290283203125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 4.728395061728395, "grad_norm": 2.0370504351113516, "kl": 0.20068359375, "learning_rate": 3.9490325547499316e-09, "loss": -0.0427, "num_tokens": 43943845.0, "reward": 0.04996287077665329, "reward_std": 0.04933923855423927, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0034309634938836098, "rewards/logprob_reward/std": 0.004405571613460779, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 744.25, "completions/mean_terminated_length": 715.3103637695312, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.731481481481482, "grad_norm": 2.2135796312301093, "kl": 0.2001953125, "learning_rate": 3.861016655594962e-09, "loss": -0.2235, "num_tokens": 43974121.0, "reward": 0.055599234998226166, "reward_std": 0.047450773417949677, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0027491520158946514, "rewards/logprob_reward/std": 0.00328139984048903, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 703.6875, "completions/mean_terminated_length": 693.3547973632812, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.734567901234568, "grad_norm": 1.9148919795249735, "kl": 0.2255859375, "learning_rate": 3.773985056775258e-09, "loss": -0.0338, "num_tokens": 44002847.0, "reward": 0.06723778694868088, "reward_std": 0.04859152436256409, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0017919890815392137, "rewards/logprob_reward/std": 0.002055313903838396, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 686.34375, "completions/mean_terminated_length": 675.4515991210938, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 4.737654320987654, "grad_norm": 2.016759515633251, "kl": 0.2088623046875, "learning_rate": 3.68793810632681e-09, "loss": -0.2412, "num_tokens": 44031374.0, "reward": 0.054454416036605835, "reward_std": 0.050731875002384186, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0014771256828680634, "rewards/logprob_reward/std": 0.002718152478337288, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 724.0625, "completions/mean_terminated_length": 724.0625, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 4.7407407407407405, "grad_norm": 1.9265106398716043, "kl": 0.2135009765625, "learning_rate": 3.602876148348116e-09, "loss": -0.1655, "num_tokens": 44060856.0, "reward": 0.042583052068948746, "reward_std": 0.03934343159198761, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002175615169107914, "rewards/logprob_reward/std": 0.002967662876471877, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 657.6875, "completions/mean_terminated_length": 645.8709716796875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 4.743827160493828, "grad_norm": 3.299285365438242, "kl": 0.226318359375, "learning_rate": 3.518799522998661e-09, "loss": -0.3395, "num_tokens": 44088018.0, "reward": 0.041676074266433716, "reward_std": 0.053768157958984375, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0011678591836243868, "rewards/logprob_reward/std": 0.0019146768609061837, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 776.40625, "completions/mean_terminated_length": 719.269287109375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 4.746913580246914, "grad_norm": 1.6565600344802227, "kl": 0.20068359375, "learning_rate": 3.435708566497608e-09, "loss": -0.0962, "num_tokens": 44119195.0, "reward": 0.040542714297771454, "reward_std": 0.04417931288480759, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.003380796406418085, "rewards/logprob_reward/std": 0.006241412367671728, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 665.3125, "completions/mean_terminated_length": 641.4000244140625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 4.75, "grad_norm": 1.7865661626285811, "kl": 0.258544921875, "learning_rate": 3.353603611122524e-09, "loss": -0.2299, "num_tokens": 44146489.0, "reward": 0.06413263827562332, "reward_std": 0.04776488244533539, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0018140419851988554, "rewards/logprob_reward/std": 0.0035549812018871307, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 665.34375, "completions/mean_terminated_length": 641.433349609375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 4.753086419753086, "grad_norm": 2.079446226907431, "kl": 0.2318115234375, "learning_rate": 3.2724849852079628e-09, "loss": -0.1236, "num_tokens": 44173964.0, "reward": 0.044496528804302216, "reward_std": 0.04535555839538574, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0008294780272990465, "rewards/logprob_reward/std": 0.001505252905189991, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 725.4375, "completions/mean_terminated_length": 715.8064575195312, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 4.756172839506172, "grad_norm": 3.1618937626713195, "kl": 0.2412109375, "learning_rate": 3.192353013144189e-09, "loss": -0.2779, "num_tokens": 44203642.0, "reward": 0.038170669227838516, "reward_std": 0.03268398344516754, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0007451868150383234, "rewards/logprob_reward/std": 0.001686312723904848, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 721.03125, "completions/mean_terminated_length": 677.75, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 4.7592592592592595, "grad_norm": 2.050281713419281, "kl": 0.2061767578125, "learning_rate": 3.113208015375901e-09, "loss": -0.1335, "num_tokens": 44233011.0, "reward": 0.05213935300707817, "reward_std": 0.045283056795597076, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002377056982368231, "rewards/logprob_reward/std": 0.0033256742171943188, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 717.53125, "completions/mean_terminated_length": 697.1000366210938, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 4.762345679012346, "grad_norm": 1.7723029762016207, "kl": 0.2257080078125, "learning_rate": 3.0350503084008995e-09, "loss": -0.1376, "num_tokens": 44262740.0, "reward": 0.05031438544392586, "reward_std": 0.03975418582558632, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0038215392269194126, "rewards/logprob_reward/std": 0.006404296960681677, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 656.1875, "completions/mean_terminated_length": 644.3225708007812, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.765432098765432, "grad_norm": 1.778152137398992, "kl": 0.2470703125, "learning_rate": 2.957880204768809e-09, "loss": -0.0711, "num_tokens": 44289642.0, "reward": 0.05754018574953079, "reward_std": 0.03257685899734497, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001433537108823657, "rewards/logprob_reward/std": 0.0015094984555616975, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 757.375, "completions/mean_terminated_length": 719.2857666015625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 4.768518518518518, "grad_norm": 2.3460314398604862, "kl": 0.220703125, "learning_rate": 2.8816980130799418e-09, "loss": -0.1334, "num_tokens": 44320898.0, "reward": 0.04554835706949234, "reward_std": 0.04604505002498627, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0019981749355793, "rewards/logprob_reward/std": 0.002213164698332548, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 635.8125, "completions/mean_terminated_length": 623.290283203125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 4.771604938271605, "grad_norm": 1.8312174372739913, "kl": 0.2366943359375, "learning_rate": 2.806504037983992e-09, "loss": -0.0846, "num_tokens": 44347812.0, "reward": 0.05764756724238396, "reward_std": 0.04722445458173752, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0015528519870713353, "rewards/logprob_reward/std": 0.0020970383193343878, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 758.84375, "completions/mean_terminated_length": 731.413818359375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 4.7746913580246915, "grad_norm": 2.053868627912756, "kl": 0.21142578125, "learning_rate": 2.7322985801787046e-09, "loss": -0.121, "num_tokens": 44378323.0, "reward": 0.04052641987800598, "reward_std": 0.041819311678409576, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.003362688235938549, "rewards/logprob_reward/std": 0.004958003293722868, "step": 1547 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 733.40625, "completions/mean_terminated_length": 714.0333862304688, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 4.777777777777778, "grad_norm": 1.865677782906322, "kl": NaN, "learning_rate": 2.6590819364088746e-09, "loss": -0.0866, "num_tokens": 44408708.0, "reward": 0.04511159658432007, "reward_std": 0.027660556137561798, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001512888353317976, "rewards/logprob_reward/std": 0.002074025571346283, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 715.5625, "completions/mean_terminated_length": 695.0000610351562, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 4.780864197530864, "grad_norm": 2.2663529037789614, "kl": 0.2218017578125, "learning_rate": 2.5868543994650993e-09, "loss": -0.1056, "num_tokens": 44437970.0, "reward": 0.05412521958351135, "reward_std": 0.0419880636036396, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.004583575762808323, "rewards/logprob_reward/std": 0.009514378383755684, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 705.40625, "completions/mean_terminated_length": 684.1666870117188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 4.783950617283951, "grad_norm": 2.170932131963563, "kl": 0.2294921875, "learning_rate": 2.5156162581824736e-09, "loss": -0.1829, "num_tokens": 44467083.0, "reward": 0.05616780370473862, "reward_std": 0.04913990572094917, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0033808897715061903, "rewards/logprob_reward/std": 0.004154331050813198, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 761.8125, "completions/mean_terminated_length": 713.25927734375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 4.787037037037037, "grad_norm": 1.9210242421969543, "kl": 0.200439453125, "learning_rate": 2.44536779743959e-09, "loss": -0.2208, "num_tokens": 44497877.0, "reward": 0.048709139227867126, "reward_std": 0.032009467482566833, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002037932863458991, "rewards/logprob_reward/std": 0.0029220767319202423, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 737.21875, "completions/mean_terminated_length": 696.2500610351562, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 4.790123456790123, "grad_norm": 1.993314091973738, "kl": 0.2431640625, "learning_rate": 2.376109298157347e-09, "loss": -0.1321, "num_tokens": 44528140.0, "reward": 0.059702493250370026, "reward_std": 0.03522733226418495, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0038360999897122383, "rewards/logprob_reward/std": 0.0049207513220608234, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 670.75, "completions/mean_terminated_length": 659.3547973632812, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 4.79320987654321, "grad_norm": 2.3662139425006297, "kl": 0.244873046875, "learning_rate": 2.3078410372978084e-09, "loss": -0.1522, "num_tokens": 44555908.0, "reward": 0.05789976567029953, "reward_std": 0.04513705521821976, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0018330684397369623, "rewards/logprob_reward/std": 0.002843984169885516, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 710.8125, "completions/mean_terminated_length": 678.413818359375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 4.796296296296296, "grad_norm": 2.1918860901621886, "kl": 0.245361328125, "learning_rate": 2.240563287863151e-09, "loss": -0.2624, "num_tokens": 44585338.0, "reward": 0.052353691309690475, "reward_std": 0.05389145761728287, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0026152110658586025, "rewards/logprob_reward/std": 0.0035162579733878374, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 709.0625, "completions/mean_terminated_length": 698.9031982421875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.799382716049383, "grad_norm": 1.8584598367804426, "kl": 0.2265625, "learning_rate": 2.174276318894497e-09, "loss": 0.0024, "num_tokens": 44614944.0, "reward": 0.048598550260066986, "reward_std": 0.04688963294029236, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.001915052765980363, "rewards/logprob_reward/std": 0.002664490370079875, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 704.53125, "completions/mean_terminated_length": 704.53125, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 4.802469135802469, "grad_norm": 2.0321359065426527, "kl": 0.2298583984375, "learning_rate": 2.1089803954708884e-09, "loss": -0.0483, "num_tokens": 44644045.0, "reward": 0.0562812015414238, "reward_std": 0.04687017574906349, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003506890032440424, "rewards/logprob_reward/std": 0.0037236586213111877, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 704.3125, "completions/mean_terminated_length": 694.0, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 4.805555555555555, "grad_norm": 2.496444769411076, "kl": 0.2139892578125, "learning_rate": 2.0446757787082324e-09, "loss": -0.3015, "num_tokens": 44672883.0, "reward": 0.03860919177532196, "reward_std": 0.045600421726703644, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0012324335984885693, "rewards/logprob_reward/std": 0.0019448957173153758, "step": 1557 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 671.78125, "completions/mean_terminated_length": 635.3448486328125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 4.8086419753086425, "grad_norm": 1.736623584195244, "kl": NaN, "learning_rate": 1.98136272575819e-09, "loss": -0.1128, "num_tokens": 44700868.0, "reward": 0.061923883855342865, "reward_std": 0.03648325800895691, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.002832094905897975, "rewards/logprob_reward/std": 0.00374964764341712, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 701.28125, "completions/mean_terminated_length": 679.7667236328125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 4.811728395061729, "grad_norm": 1.956746107073524, "kl": 0.227294921875, "learning_rate": 1.919041489807233e-09, "loss": -0.0294, "num_tokens": 44729313.0, "reward": 0.06404221057891846, "reward_std": 0.04741805046796799, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0017135670641437173, "rewards/logprob_reward/std": 0.002254784805700183, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 703.375, "completions/mean_terminated_length": 703.375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 4.814814814814815, "grad_norm": 2.025904015910776, "kl": 0.22314453125, "learning_rate": 1.857712320075616e-09, "loss": -0.1686, "num_tokens": 44758361.0, "reward": 0.05339666083455086, "reward_std": 0.04650755599141121, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0037740650586783886, "rewards/logprob_reward/std": 0.005097648594528437, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 726.125, "completions/mean_terminated_length": 706.2667236328125, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 4.817901234567901, "grad_norm": 2.485263575089205, "kl": 0.2247314453125, "learning_rate": 1.7973754618162972e-09, "loss": -0.2606, "num_tokens": 44788377.0, "reward": 0.04539719969034195, "reward_std": 0.04811866953969002, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001830223249271512, "rewards/logprob_reward/std": 0.0031537904869765043, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 664.09375, "completions/mean_terminated_length": 626.862060546875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.820987654320987, "grad_norm": 2.3784729224611487, "kl": 0.2540283203125, "learning_rate": 1.7380311563140737e-09, "loss": -0.1922, "num_tokens": 44815652.0, "reward": 0.04746240749955177, "reward_std": 0.04821339249610901, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0006526728975586593, "rewards/logprob_reward/std": 0.0009571689297445118, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 742.8125, "completions/mean_terminated_length": 702.6428833007812, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 4.824074074074074, "grad_norm": 2.075657716941629, "kl": 0.2567138671875, "learning_rate": 1.6796796408845292e-09, "loss": -0.121, "num_tokens": 44846126.0, "reward": 0.04571472108364105, "reward_std": 0.04788792133331299, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021830215118825436, "rewards/logprob_reward/std": 0.0033464357256889343, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 725.96875, "completions/mean_terminated_length": 706.1000366210938, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 4.827160493827161, "grad_norm": 2.018749742959247, "kl": 0.24560546875, "learning_rate": 1.622321148873146e-09, "loss": -0.0439, "num_tokens": 44875653.0, "reward": 0.04587321728467941, "reward_std": 0.05631254240870476, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0023591280914843082, "rewards/logprob_reward/std": 0.003474861616268754, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 709.53125, "completions/mean_terminated_length": 688.5667114257812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 4.830246913580247, "grad_norm": 2.0734281347079837, "kl": 0.2442626953125, "learning_rate": 1.5659559096543318e-09, "loss": -0.1977, "num_tokens": 44905174.0, "reward": 0.050982531160116196, "reward_std": 0.04027026891708374, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.004563921596854925, "rewards/logprob_reward/std": 0.006054244004189968, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 672.625, "completions/mean_terminated_length": 661.290283203125, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 4.833333333333333, "grad_norm": 1.822317558433637, "kl": 0.21923828125, "learning_rate": 1.5105841486304783e-09, "loss": -0.0985, "num_tokens": 44932874.0, "reward": 0.057907603681087494, "reward_std": 0.035353753715753555, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.001841781078837812, "rewards/logprob_reward/std": 0.0017611915245652199, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 681.09375, "completions/mean_terminated_length": 658.2333374023438, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 4.83641975308642, "grad_norm": 2.123560453204114, "kl": 0.2528076171875, "learning_rate": 1.456206087231182e-09, "loss": -0.2365, "num_tokens": 44960993.0, "reward": 0.05193224549293518, "reward_std": 0.045639969408512115, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002146940678358078, "rewards/logprob_reward/std": 0.0024786905851215124, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 735.25, "completions/mean_terminated_length": 716.0000610351562, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 4.839506172839506, "grad_norm": 2.050238233273911, "kl": 0.22021484375, "learning_rate": 1.4028219429121912e-09, "loss": -0.135, "num_tokens": 44990989.0, "reward": 0.048916611820459366, "reward_std": 0.05314203351736069, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002268455922603607, "rewards/logprob_reward/std": 0.0028817281126976013, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 709.375, "completions/mean_terminated_length": 709.375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 4.842592592592593, "grad_norm": 2.1466709104865003, "kl": 0.2158203125, "learning_rate": 1.350431929154655e-09, "loss": -0.0916, "num_tokens": 45021045.0, "reward": 0.05392808839678764, "reward_std": 0.04955313727259636, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.004364541731774807, "rewards/logprob_reward/std": 0.005988817662000656, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 701.34375, "completions/mean_terminated_length": 701.34375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 4.845679012345679, "grad_norm": 2.3927513095530353, "kl": 0.4453125, "learning_rate": 1.2990362554642087e-09, "loss": -0.1533, "num_tokens": 45050400.0, "reward": 0.05046911537647247, "reward_std": 0.05384611338376999, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.003993459045886993, "rewards/logprob_reward/std": 0.004564306698739529, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 710.90625, "completions/mean_terminated_length": 678.5172119140625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 4.848765432098766, "grad_norm": 2.1825906101881607, "kl": 0.2135009765625, "learning_rate": 1.2486351273701678e-09, "loss": -0.0554, "num_tokens": 45079781.0, "reward": 0.0676162913441658, "reward_std": 0.047531113028526306, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.005684769246727228, "rewards/logprob_reward/std": 0.009872187860310078, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 737.5, "completions/mean_terminated_length": 671.3846435546875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 4.851851851851852, "grad_norm": 1.7570850946419518, "kl": 0.23876953125, "learning_rate": 1.199228746424752e-09, "loss": 0.0072, "num_tokens": 45109933.0, "reward": 0.0463409498333931, "reward_std": 0.028527850285172462, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0028788335621356964, "rewards/logprob_reward/std": 0.00331366085447371, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 734.09375, "completions/mean_terminated_length": 734.09375, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 4.854938271604938, "grad_norm": 1.9395569993524355, "kl": 0.208740234375, "learning_rate": 1.1508173102021402e-09, "loss": -0.1806, "num_tokens": 45140212.0, "reward": 0.04876071214675903, "reward_std": 0.04676129296422005, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0020952331833541393, "rewards/logprob_reward/std": 0.002500335918739438, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 658.71875, "completions/mean_terminated_length": 658.71875, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 4.8580246913580245, "grad_norm": 1.7223103094456633, "kl": 0.2281494140625, "learning_rate": 1.1034010122978332e-09, "loss": -0.1822, "num_tokens": 45167359.0, "reward": 0.06386316567659378, "reward_std": 0.03483687713742256, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0015146301593631506, "rewards/logprob_reward/std": 0.00169938278850168, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 731.0625, "completions/mean_terminated_length": 676.8148193359375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 4.861111111111111, "grad_norm": 1.8032321069261932, "kl": 0.2025146484375, "learning_rate": 1.0569800423277652e-09, "loss": -0.0998, "num_tokens": 45197257.0, "reward": 0.04143955558538437, "reward_std": 0.02968217059969902, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.007849502377212048, "rewards/logprob_reward/std": 0.017946681007742882, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 677.65625, "completions/mean_terminated_length": 666.4838256835938, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.864197530864198, "grad_norm": 2.022822446915842, "kl": 0.3104248046875, "learning_rate": 1.0115545859276098e-09, "loss": -0.2381, "num_tokens": 45225094.0, "reward": 0.0643848329782486, "reward_std": 0.05549953877925873, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0020942618139088154, "rewards/logprob_reward/std": 0.002886283677071333, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 748.34375, "completions/mean_terminated_length": 708.9642944335938, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 4.867283950617284, "grad_norm": 2.4189487363698916, "kl": 0.24072265625, "learning_rate": 9.67124824752058e-10, "loss": -0.1831, "num_tokens": 45255381.0, "reward": 0.030485866591334343, "reward_std": 0.04298137128353119, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0026231841184198856, "rewards/logprob_reward/std": 0.005127353128045797, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 679.5, "completions/mean_terminated_length": 643.862060546875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 4.87037037037037, "grad_norm": 2.132278591164677, "kl": 0.2337646484375, "learning_rate": 9.236909364739587e-10, "loss": -0.1948, "num_tokens": 45283397.0, "reward": 0.05180853605270386, "reward_std": 0.04013790562748909, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0020094849169254303, "rewards/logprob_reward/std": 0.0025368938222527504, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 671.15625, "completions/mean_terminated_length": 659.774169921875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 4.8734567901234565, "grad_norm": 2.207935082593846, "kl": 0.2347412109375, "learning_rate": 8.812530947837904e-10, "loss": -0.1477, "num_tokens": 45311434.0, "reward": 0.05283327028155327, "reward_std": 0.0455731637775898, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.003148077055811882, "rewards/logprob_reward/std": 0.004595972131937742, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 666.0, "completions/mean_terminated_length": 666.0, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 4.8765432098765435, "grad_norm": 1.9307497057717353, "kl": 0.25244140625, "learning_rate": 8.39811469388857e-10, "loss": -0.123, "num_tokens": 45339170.0, "reward": 0.06806036829948425, "reward_std": 0.053993478417396545, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0027059619314968586, "rewards/logprob_reward/std": 0.003276693867519498, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 729.8125, "completions/mean_terminated_length": 710.2000122070312, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.87962962962963, "grad_norm": 1.920348156533404, "kl": 0.221435546875, "learning_rate": 7.99366226012621e-10, "loss": -0.0808, "num_tokens": 45369232.0, "reward": 0.0456579253077507, "reward_std": 0.04085057973861694, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0021199120674282312, "rewards/logprob_reward/std": 0.0024745571427047253, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 620.65625, "completions/mean_terminated_length": 620.65625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 4.882716049382716, "grad_norm": 2.190429693186302, "kl": 0.2633056640625, "learning_rate": 7.59917526394066e-10, "loss": -0.089, "num_tokens": 45395173.0, "reward": 0.05531671270728111, "reward_std": 0.04867561534047127, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0024352334439754486, "rewards/logprob_reward/std": 0.002892302582040429, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 762.34375, "completions/mean_terminated_length": 713.888916015625, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 4.885802469135802, "grad_norm": 3.064749837510519, "kl": 0.3494873046875, "learning_rate": 7.214655282870019e-10, "loss": -0.4241, "num_tokens": 45426068.0, "reward": 0.03897803649306297, "reward_std": 0.04806748032569885, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0016422626795247197, "rewards/logprob_reward/std": 0.003233575262129307, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 698.25, "completions/mean_terminated_length": 664.5516967773438, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 4.888888888888889, "grad_norm": 2.5170887270911484, "kl": 0.2440185546875, "learning_rate": 6.840103854595103e-10, "loss": -0.373, "num_tokens": 45455060.0, "reward": 0.04967810958623886, "reward_std": 0.0429002121090889, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0031145629473030567, "rewards/logprob_reward/std": 0.003188327420502901, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 670.28125, "completions/mean_terminated_length": 658.8709716796875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 4.8919753086419755, "grad_norm": 2.473694955379459, "kl": 0.2425537109375, "learning_rate": 6.475522476932504e-10, "loss": -0.159, "num_tokens": 45482785.0, "reward": 0.02962297573685646, "reward_std": 0.05181487649679184, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.0016644163988530636, "rewards/logprob_reward/std": 0.0021899265702813864, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 726.4375, "completions/mean_terminated_length": 695.6551513671875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 4.895061728395062, "grad_norm": 1.7364290674636216, "kl": 0.25, "learning_rate": 6.120912607829598e-10, "loss": -0.0823, "num_tokens": 45512679.0, "reward": 0.05661670118570328, "reward_std": 0.04919694364070892, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0038796698208898306, "rewards/logprob_reward/std": 0.006622091867029667, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 726.59375, "completions/mean_terminated_length": 684.107177734375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 4.898148148148148, "grad_norm": 2.2950158286306257, "kl": 0.250244140625, "learning_rate": 5.776275665357045e-10, "loss": -0.2739, "num_tokens": 45542862.0, "reward": 0.0551067590713501, "reward_std": 0.054269008338451385, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0022019576281309128, "rewards/logprob_reward/std": 0.0028854061383754015, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 681.65625, "completions/mean_terminated_length": 670.6128540039062, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 4.901234567901234, "grad_norm": 2.6068100350830052, "kl": 0.22119140625, "learning_rate": 5.441613027704905e-10, "loss": -0.2717, "num_tokens": 45570955.0, "reward": 0.0395234078168869, "reward_std": 0.05497060716152191, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002248229691758752, "rewards/logprob_reward/std": 0.002661328064277768, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 727.8125, "completions/mean_terminated_length": 708.0667114257812, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 4.904320987654321, "grad_norm": 1.9920277043367673, "kl": 0.2139892578125, "learning_rate": 5.116926033176261e-10, "loss": -0.0148, "num_tokens": 45600705.0, "reward": 0.06153532490134239, "reward_std": 0.046502865850925446, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0024003551807254553, "rewards/logprob_reward/std": 0.002641119994223118, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 761.09375, "completions/mean_terminated_length": 752.6128540039062, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 4.907407407407407, "grad_norm": 3.0123446210350835, "kl": 0.2215576171875, "learning_rate": 4.802215980182212e-10, "loss": -0.3666, "num_tokens": 45631872.0, "reward": 0.05237887427210808, "reward_std": 0.04788391292095184, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.002643193816766143, "rewards/logprob_reward/std": 0.0025566760450601578, "step": 1590 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 712.5, "completions/mean_terminated_length": 702.4515991210938, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 4.910493827160494, "grad_norm": 1.7712953033443068, "kl": NaN, "learning_rate": 4.4974841272357734e-10, "loss": -0.1552, "num_tokens": 45661572.0, "reward": 0.05617368221282959, "reward_std": 0.044310979545116425, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0033874278888106346, "rewards/logprob_reward/std": 0.005647736601531506, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 686.78125, "completions/mean_terminated_length": 675.9031982421875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 4.91358024691358, "grad_norm": 2.0125235515929623, "kl": 0.2332763671875, "learning_rate": 4.2027316929479916e-10, "loss": -0.1436, "num_tokens": 45689749.0, "reward": 0.07134392112493515, "reward_std": 0.038457900285720825, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0028821297455579042, "rewards/logprob_reward/std": 0.003696457017213106, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 750.6875, "completions/mean_terminated_length": 732.4666748046875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 4.916666666666667, "grad_norm": 1.983940238140831, "kl": 0.206787109375, "learning_rate": 3.917959856022668e-10, "loss": -0.2454, "num_tokens": 45720159.0, "reward": 0.043411463499069214, "reward_std": 0.05083286762237549, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.00309606920927763, "rewards/logprob_reward/std": 0.004363714717328548, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 670.4375, "completions/mean_terminated_length": 670.4375, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 4.919753086419753, "grad_norm": 2.366956041561088, "kl": 0.2435302734375, "learning_rate": 3.6431697552510853e-10, "loss": -0.3542, "num_tokens": 45747777.0, "reward": 0.05452614277601242, "reward_std": 0.04330441728234291, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0015568279195576906, "rewards/logprob_reward/std": 0.0021392430644482374, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 628.28125, "completions/mean_terminated_length": 601.9000244140625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 4.922839506172839, "grad_norm": 3.0493038641598247, "kl": 0.25439453125, "learning_rate": 3.3783624895086795e-10, "loss": -0.252, "num_tokens": 45774074.0, "reward": 0.0488990917801857, "reward_std": 0.039302758872509, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0022489873226732016, "rewards/logprob_reward/std": 0.0034289683680981398, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 732.84375, "completions/mean_terminated_length": 691.2500610351562, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 4.925925925925926, "grad_norm": 1.9956180922077569, "kl": 0.226318359375, "learning_rate": 3.123539117749485e-10, "loss": -0.3358, "num_tokens": 45803733.0, "reward": 0.045016758143901825, "reward_std": 0.04055551066994667, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.00140750827267766, "rewards/logprob_reward/std": 0.0021567540243268013, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 656.28125, "completions/mean_terminated_length": 656.28125, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 4.929012345679013, "grad_norm": 1.9060611947842503, "kl": 0.2230224609375, "learning_rate": 2.8787006590022535e-10, "loss": -0.0451, "num_tokens": 45831046.0, "reward": 0.062060076743364334, "reward_std": 0.033758118748664856, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0029834159649908543, "rewards/logprob_reward/std": 0.0034631541930139065, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 770.21875, "completions/mean_terminated_length": 753.300048828125, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 4.932098765432099, "grad_norm": 2.74209572330302, "kl": 0.2210693359375, "learning_rate": 2.6438480923665627e-10, "loss": -0.3143, "num_tokens": 45862649.0, "reward": 0.037834376096725464, "reward_std": 0.03432668745517731, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0038437512703239918, "rewards/logprob_reward/std": 0.005583377555012703, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 729.9375, "completions/mean_terminated_length": 729.9375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 4.935185185185185, "grad_norm": 2.2021310832433216, "kl": 0.203857421875, "learning_rate": 2.418982357008936e-10, "loss": -0.2982, "num_tokens": 45892687.0, "reward": 0.059219736605882645, "reward_std": 0.04891116917133331, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.003299708478152752, "rewards/logprob_reward/std": 0.00543432030826807, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 676.90625, "completions/mean_terminated_length": 653.7667236328125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 4.938271604938271, "grad_norm": 2.350324710579544, "kl": 0.2523193359375, "learning_rate": 2.2041043521586756e-10, "loss": -0.3098, "num_tokens": 45920724.0, "reward": 0.057065851986408234, "reward_std": 0.0470709502696991, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0009064996265806258, "rewards/logprob_reward/std": 0.0013932627625763416, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 731.6875, "completions/mean_terminated_length": 701.4483032226562, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 4.9413580246913575, "grad_norm": 2.3862283050818807, "kl": 0.215576171875, "learning_rate": 1.999214937104532e-10, "loss": -0.0814, "num_tokens": 45950466.0, "reward": 0.04002302512526512, "reward_std": 0.053182486444711685, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0028033629059791565, "rewards/logprob_reward/std": 0.004257589112967253, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 747.53125, "completions/mean_terminated_length": 670.1199951171875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 4.944444444444445, "grad_norm": 1.89502426343952, "kl": 0.2392578125, "learning_rate": 1.8043149311916529e-10, "loss": -0.1025, "num_tokens": 45981219.0, "reward": 0.04924574866890907, "reward_std": 0.047872144728899, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002634164411574602, "rewards/logprob_reward/std": 0.004099159501492977, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 704.90625, "completions/mean_terminated_length": 694.6128540039062, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 4.947530864197531, "grad_norm": 1.967147765334899, "kl": 0.242919921875, "learning_rate": 1.6194051138176955e-10, "loss": -0.103, "num_tokens": 46010308.0, "reward": 0.05199815705418587, "reward_std": 0.034758321940898895, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5080004930496216, "rewards/logprob_reward/mean": 0.0022201724350452423, "rewards/logprob_reward/std": 0.0031493811402469873, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 773.9375, "completions/mean_terminated_length": 716.2307739257812, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 4.950617283950617, "grad_norm": 2.121344620306122, "kl": 0.203857421875, "learning_rate": 1.444486224429775e-10, "loss": -0.0935, "num_tokens": 46041862.0, "reward": 0.033243902027606964, "reward_std": 0.0474444180727005, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4709290862083435, "rewards/logprob_reward/mean": 0.0022154483012855053, "rewards/logprob_reward/std": 0.0028852999676018953, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 688.09375, "completions/mean_terminated_length": 665.7000122070312, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 4.953703703703704, "grad_norm": 1.8824868639397032, "kl": 0.243408203125, "learning_rate": 1.2795589625216875e-10, "loss": -0.0987, "num_tokens": 46070293.0, "reward": 0.03957396745681763, "reward_std": 0.04261380434036255, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.002304406836628914, "rewards/logprob_reward/std": 0.003390522440895438, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 736.75, "completions/mean_terminated_length": 695.7142944335938, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 4.95679012345679, "grad_norm": 2.083293342426221, "kl": 0.2130126953125, "learning_rate": 1.1246239876316899e-10, "loss": -0.1156, "num_tokens": 46100737.0, "reward": 0.037211790680885315, "reward_std": 0.02014215663075447, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.45680341124534607, "rewards/logprob_reward/mean": 0.010096433572471142, "rewards/logprob_reward/std": 0.027432050555944443, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 707.59375, "completions/mean_terminated_length": 697.3870849609375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 4.959876543209877, "grad_norm": 2.4448589061778687, "kl": 0.24462890625, "learning_rate": 9.796819193383376e-11, "loss": -0.2613, "num_tokens": 46129972.0, "reward": 0.04535143822431564, "reward_std": 0.04745528846979141, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017793738516047597, "rewards/logprob_reward/std": 0.0021039326675236225, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 721.21875, "completions/mean_terminated_length": 677.9642944335938, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 4.962962962962963, "grad_norm": 2.1682792903217996, "kl": 0.228515625, "learning_rate": 8.447333372593735e-11, "loss": -0.2039, "num_tokens": 46159891.0, "reward": 0.04062517732381821, "reward_std": 0.04160744324326515, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0034724189899861813, "rewards/logprob_reward/std": 0.006191283464431763, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 709.21875, "completions/mean_terminated_length": 709.21875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 4.966049382716049, "grad_norm": 1.7180152269203575, "kl": 0.2156982421875, "learning_rate": 7.197787810492295e-11, "loss": -0.095, "num_tokens": 46189178.0, "reward": 0.04858778417110443, "reward_std": 0.04863159358501434, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0019030930707231164, "rewards/logprob_reward/std": 0.0023480455856770277, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 696.65625, "completions/mean_terminated_length": 686.0967407226562, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 4.969135802469136, "grad_norm": 2.0715313841857736, "kl": 0.23046875, "learning_rate": 6.04818750396252e-11, "loss": -0.0434, "num_tokens": 46218183.0, "reward": 0.051609545946121216, "reward_std": 0.05690581351518631, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.005260605365037918, "rewards/logprob_reward/std": 0.0065270280465483665, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 725.46875, "completions/mean_terminated_length": 682.8214721679688, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 4.972222222222222, "grad_norm": 2.1515640106787317, "kl": 0.2322998046875, "learning_rate": 4.9985370502131366e-11, "loss": -0.2735, "num_tokens": 46247914.0, "reward": 0.042196281254291534, "reward_std": 0.04903978109359741, "rewards/format_reward_func/mean": 0.40625, "rewards/format_reward_func/std": 0.49899089336395264, "rewards/logprob_reward/mean": 0.0017458668444305658, "rewards/logprob_reward/std": 0.003071760293096304, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 770.8125, "completions/mean_terminated_length": 753.933349609375, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 4.9753086419753085, "grad_norm": 2.0670932493705667, "kl": 0.1849365234375, "learning_rate": 4.0488406467559245e-11, "loss": -0.0363, "num_tokens": 46279428.0, "reward": 0.046390168368816376, "reward_std": 0.04829657822847366, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0029335212893784046, "rewards/logprob_reward/std": 0.0032161264680325985, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 710.3125, "completions/mean_terminated_length": 710.3125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 4.978395061728395, "grad_norm": 2.3356190397023013, "kl": 0.2431640625, "learning_rate": 3.1991020913890723e-11, "loss": -0.2064, "num_tokens": 46308822.0, "reward": 0.037897706031799316, "reward_std": 0.030762355774641037, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.003914116881787777, "rewards/logprob_reward/std": 0.004848901182413101, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 770.5625, "completions/mean_terminated_length": 723.629638671875, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 4.981481481481482, "grad_norm": 2.1778873910598273, "kl": 0.2266845703125, "learning_rate": 2.449324782183293e-11, "loss": -0.2537, "num_tokens": 46340384.0, "reward": 0.039623111486434937, "reward_std": 0.04137878119945526, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0023590121418237686, "rewards/logprob_reward/std": 0.003141367109492421, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 773.46875, "completions/mean_terminated_length": 765.3870849609375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 4.984567901234568, "grad_norm": 1.7731235111279475, "kl": 0.2000732421875, "learning_rate": 1.799511717470725e-11, "loss": -0.1573, "num_tokens": 46371883.0, "reward": 0.055310651659965515, "reward_std": 0.053711898624897, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.002428502542898059, "rewards/logprob_reward/std": 0.00242333160713315, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 761.0625, "completions/mean_terminated_length": 723.5000610351562, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 4.987654320987654, "grad_norm": 3.5376521395514438, "kl": 0.240234375, "learning_rate": 1.2496654958310537e-11, "loss": -0.4812, "num_tokens": 46402961.0, "reward": 0.035960301756858826, "reward_std": 0.04699845612049103, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0017614441458135843, "rewards/logprob_reward/std": 0.002412389265373349, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 642.65625, "completions/mean_terminated_length": 642.65625, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 4.9907407407407405, "grad_norm": 2.83895635142214, "kl": 0.2552490234375, "learning_rate": 7.997883160748563e-12, "loss": -0.1941, "num_tokens": 46429974.0, "reward": 0.06556230783462524, "reward_std": 0.046890728175640106, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49186936020851135, "rewards/logprob_reward/mean": 0.0034025656059384346, "rewards/logprob_reward/std": 0.004946530796587467, "step": 1617 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 705.8125, "completions/mean_terminated_length": 672.8965454101562, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 4.993827160493828, "grad_norm": 1.7022382281741213, "kl": NaN, "learning_rate": 4.4988197724360465e-12, "loss": 0.0113, "num_tokens": 46459332.0, "reward": 0.04789476841688156, "reward_std": 0.03340337052941322, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.507007360458374, "rewards/logprob_reward/mean": 0.0011330802226439118, "rewards/logprob_reward/std": 0.0016689967596903443, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 733.5, "completions/mean_terminated_length": 714.1333618164062, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 4.996913580246914, "grad_norm": 2.1619800951258923, "kl": 0.2430419921875, "learning_rate": 1.9994787860133646e-12, "loss": -0.1459, "num_tokens": 46489456.0, "reward": 0.045339275151491165, "reward_std": 0.03515031188726425, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.504016101360321, "rewards/logprob_reward/mean": 0.0017658602446317673, "rewards/logprob_reward/std": 0.002554364735260606, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 644.125, "completions/mean_terminated_length": 644.125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 5.0, "grad_norm": 2.1215296053549455, "kl": 0.2723388671875, "learning_rate": 4.998701962355412e-13, "loss": -0.0872, "num_tokens": 46516164.0, "reward": 0.06689511239528656, "reward_std": 0.026477597653865814, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.4825586974620819, "rewards/logprob_reward/mean": 0.0014112326316535473, "rewards/logprob_reward/std": 0.0015478808199986815, "step": 1620 }, { "epoch": 5.0, "step": 1620, "total_flos": 0.0, "train_loss": -0.14862212164189392, "train_runtime": 20021.4112, "train_samples_per_second": 0.647, "train_steps_per_second": 0.081 } ], "logging_steps": 1, "max_steps": 1620, "num_input_tokens_seen": 46516164, "num_train_epochs": 5, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }