{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982507288629737, "eval_steps": 500, "global_step": 428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 585.0535888671875, "completions/mean_terminated_length": 553.4234619140625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0023323615160349854, "grad_norm": 0.27251678705215454, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 154148.0, "reward": 0.455357164144516, "reward_std": 0.26333752274513245, "rewards/verify_math_reward/mean": 0.4553571343421936, "rewards/verify_math_reward/std": 0.4991183280944824, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 522.1116333007812, "completions/mean_terminated_length": 506.0852355957031, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.004664723032069971, "grad_norm": 0.38745930790901184, "kl": 0.0004086494445800781, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 288997.0, "reward": 0.6428571939468384, "reward_std": 0.33575403690338135, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 611.8125, "completions/mean_terminated_length": 548.463623046875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.006997084548104956, "grad_norm": 0.2856990098953247, "kl": 0.000415802001953125, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 450323.0, "reward": 0.4732142984867096, "reward_std": 0.2688922584056854, "rewards/verify_math_reward/mean": 0.4732142984867096, "rewards/verify_math_reward/std": 0.5004002451896667, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 570.8125, "completions/mean_terminated_length": 522.9592895507812, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.009329446064139942, "grad_norm": 0.27500009536743164, "kl": 0.0004596710205078125, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 599417.0, "reward": 0.504464328289032, "reward_std": 0.24814742803573608, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 654.7589721679688, "completions/mean_terminated_length": 592.19091796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.011661807580174927, "grad_norm": 0.2343212068080902, "kl": 0.0005488395690917969, "learning_rate": 1e-06, "loss": 0.0526, "num_tokens": 766739.0, "reward": 0.4508928656578064, "reward_std": 0.21899083256721497, "rewards/verify_math_reward/mean": 0.4508928656578064, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 557.53125, "completions/mean_terminated_length": 541.6636962890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.013994169096209912, "grad_norm": 0.2720922529697418, "kl": 0.0006399154663085938, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 912194.0, "reward": 0.535714328289032, "reward_std": 0.23205281794071198, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 552.513427734375, "completions/mean_terminated_length": 536.6233520507812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0163265306122449, "grad_norm": 0.25227245688438416, "kl": 0.000701904296875, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 1056189.0, "reward": 0.4107142984867096, "reward_std": 0.262125164270401, "rewards/verify_math_reward/mean": 0.4107142984867096, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 593.9955444335938, "completions/mean_terminated_length": 546.45703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.018658892128279883, "grad_norm": 0.21280020475387573, "kl": 0.0007581710815429688, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 1215044.0, "reward": 0.486607164144516, "reward_std": 0.18367049098014832, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 607.5491333007812, "completions/mean_terminated_length": 576.1216430664062, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.02099125364431487, "grad_norm": 0.2193998098373413, "kl": 0.0007028579711914062, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 1375951.0, "reward": 0.4732142984867096, "reward_std": 0.20501871407032013, "rewards/verify_math_reward/mean": 0.4732142984867096, "rewards/verify_math_reward/std": 0.5004002451896667, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 672.5848388671875, "completions/mean_terminated_length": 610.3408813476562, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.023323615160349854, "grad_norm": 0.24384577572345734, "kl": 0.0007524490356445312, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 1551898.0, "reward": 0.4330357313156128, "reward_std": 0.25236257910728455, "rewards/verify_math_reward/mean": 0.4330357015132904, "rewards/verify_math_reward/std": 0.49660524725914, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 464.33929443359375, "completions/mean_terminated_length": 464.33929443359375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.02565597667638484, "grad_norm": 0.27325257658958435, "kl": 0.0012264251708984375, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 1674478.0, "reward": 0.7276785969734192, "reward_std": 0.20471924543380737, "rewards/verify_math_reward/mean": 0.7276785969734192, "rewards/verify_math_reward/std": 0.44615140557289124, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 607.9285888671875, "completions/mean_terminated_length": 511.92657470703125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.027988338192419825, "grad_norm": 0.2339106947183609, "kl": 0.001262664794921875, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 1840526.0, "reward": 0.5089285969734192, "reward_std": 0.1704346090555191, "rewards/verify_math_reward/mean": 0.5089285969734192, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 591.8080444335938, "completions/mean_terminated_length": 544.2398681640625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.030320699708454812, "grad_norm": 0.2372410148382187, "kl": 0.0013179779052734375, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 1994643.0, "reward": 0.566964328289032, "reward_std": 0.17013515532016754, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 559.125, "completions/mean_terminated_length": 543.2645874023438, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.0326530612244898, "grad_norm": 0.23604567348957062, "kl": 0.001369476318359375, "learning_rate": 1e-06, "loss": 0.0494, "num_tokens": 2138215.0, "reward": 0.535714328289032, "reward_std": 0.22064557671546936, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 506.263427734375, "completions/mean_terminated_length": 506.263427734375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.03498542274052478, "grad_norm": 0.24074804782867432, "kl": 0.00183868408203125, "learning_rate": 1e-06, "loss": 0.0354, "num_tokens": 2273434.0, "reward": 0.598214328289032, "reward_std": 0.18922802805900574, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3522.0, "completions/mean_length": 573.4152221679688, "completions/mean_terminated_length": 509.3681640625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.037317784256559766, "grad_norm": 0.27736690640449524, "kl": 0.0018978118896484375, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 2419615.0, "reward": 0.6071428656578064, "reward_std": 0.22229303419589996, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947933316230774, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3601.0, "completions/mean_length": 633.9732666015625, "completions/mean_terminated_length": 586.9774169921875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.03965014577259475, "grad_norm": 0.2832850515842438, "kl": 0.0014324188232421875, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 2581097.0, "reward": 0.6205357313156128, "reward_std": 0.32946476340293884, "rewards/verify_math_reward/mean": 0.6205357313156128, "rewards/verify_math_reward/std": 0.4863404929637909, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 625.482177734375, "completions/mean_terminated_length": 529.9632568359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.04198250728862974, "grad_norm": 0.2542360723018646, "kl": 0.00144195556640625, "learning_rate": 1e-06, "loss": 0.0673, "num_tokens": 2743269.0, "reward": 0.5223214626312256, "reward_std": 0.2027650624513626, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 609.84375, "completions/mean_terminated_length": 530.2510986328125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.044314868804664724, "grad_norm": 0.2551509737968445, "kl": 0.0028018951416015625, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 2904186.0, "reward": 0.5223214626312256, "reward_std": 0.1973431557416916, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 555.638427734375, "completions/mean_terminated_length": 555.638427734375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.04664723032069971, "grad_norm": 0.25999799370765686, "kl": 0.0018978118896484375, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 3046369.0, "reward": 0.5401785969734192, "reward_std": 0.19990073144435883, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 522.3973388671875, "completions/mean_terminated_length": 522.3973388671875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.04897959183673469, "grad_norm": 0.2707967460155487, "kl": 0.0012912750244140625, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 3186754.0, "reward": 0.6026785969734192, "reward_std": 0.2604704201221466, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 579.9553833007812, "completions/mean_terminated_length": 532.2262573242188, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.05131195335276968, "grad_norm": 0.2654401361942291, "kl": 0.0014190673828125, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 3339912.0, "reward": 0.59375, "reward_std": 0.2362651526927948, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 632.4910888671875, "completions/mean_terminated_length": 616.9596557617188, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.053644314868804666, "grad_norm": 0.2511657178401947, "kl": 0.0018367767333984375, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 3503742.0, "reward": 0.5401785969734192, "reward_std": 0.24949824810028076, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3793.0, "completions/mean_length": 630.2142944335938, "completions/mean_terminated_length": 567.2000122070312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.05597667638483965, "grad_norm": 0.25249868631362915, "kl": 0.0013904571533203125, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 3666566.0, "reward": 0.4062500298023224, "reward_std": 0.23686854541301727, "rewards/verify_math_reward/mean": 0.40625, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 533.138427734375, "completions/mean_terminated_length": 517.1614379882812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.05830903790087463, "grad_norm": 0.2844082713127136, "kl": 0.00157928466796875, "learning_rate": 1e-06, "loss": -0.0311, "num_tokens": 3808557.0, "reward": 0.566964328289032, "reward_std": 0.19538895785808563, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 502.5223388671875, "completions/mean_terminated_length": 470.1486511230469, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.060641399416909623, "grad_norm": 0.2154368907213211, "kl": 0.0026454925537109375, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 3940570.0, "reward": 0.6428571939468384, "reward_std": 0.12791654467582703, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 532.3928833007812, "completions/mean_terminated_length": 516.41259765625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.0629737609329446, "grad_norm": 0.2610211968421936, "kl": 0.00170135498046875, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 4087314.0, "reward": 0.5892857313156128, "reward_std": 0.21612931787967682, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 607.638427734375, "completions/mean_terminated_length": 576.2117309570312, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.0653061224489796, "grad_norm": 0.2755652964115143, "kl": 0.0019664764404296875, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 4252377.0, "reward": 0.5267857313156128, "reward_std": 0.2442418336868286, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 573.5535888671875, "completions/mean_terminated_length": 541.81982421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.06763848396501458, "grad_norm": 0.27797746658325195, "kl": 0.001743316650390625, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 4398069.0, "reward": 0.5401785969734192, "reward_std": 0.22935959696769714, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 622.6116333007812, "completions/mean_terminated_length": 607.035888671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.06997084548104957, "grad_norm": 0.2739410996437073, "kl": 0.002223968505859375, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 4559310.0, "reward": 0.5133928656578064, "reward_std": 0.25821956992149353, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 696.5982666015625, "completions/mean_terminated_length": 665.9729614257812, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.07230320699708455, "grad_norm": 0.23068802058696747, "kl": 0.0012111663818359375, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 4733676.0, "reward": 0.486607164144516, "reward_std": 0.23656180500984192, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 565.3392944335938, "completions/mean_terminated_length": 533.5315551757812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.07463556851311953, "grad_norm": 0.22200630605220795, "kl": 0.0016536712646484375, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 4877384.0, "reward": 0.65625, "reward_std": 0.18862184882164001, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 504.8660888671875, "completions/mean_terminated_length": 488.7623596191406, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07696793002915452, "grad_norm": 0.25177422165870667, "kl": 0.0014896392822265625, "learning_rate": 1e-06, "loss": -0.0148, "num_tokens": 5011674.0, "reward": 0.7321428656578064, "reward_std": 0.18922802805900574, "rewards/verify_math_reward/mean": 0.7321428656578064, "rewards/verify_math_reward/std": 0.4438345432281494, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 642.0402221679688, "completions/mean_terminated_length": 546.97705078125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.0793002915451895, "grad_norm": 0.2264726758003235, "kl": 0.0017852783203125, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 5174515.0, "reward": 0.4687500298023224, "reward_std": 0.20771192014217377, "rewards/verify_math_reward/mean": 0.46875, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 732.1428833007812, "completions/mean_terminated_length": 670.9818115234375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.08163265306122448, "grad_norm": 0.25507766008377075, "kl": 0.001220703125, "learning_rate": 1e-06, "loss": 0.0533, "num_tokens": 5362451.0, "reward": 0.4910714626312256, "reward_std": 0.26181840896606445, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 766.3928833007812, "completions/mean_terminated_length": 658.9861450195312, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.08396501457725948, "grad_norm": 0.17161031067371368, "kl": 0.00157928466796875, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 5558939.0, "reward": 0.4419642984867096, "reward_std": 0.15751104056835175, "rewards/verify_math_reward/mean": 0.4419642984867096, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 483.55804443359375, "completions/mean_terminated_length": 451.0135192871094, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.08629737609329446, "grad_norm": 0.2417500913143158, "kl": 0.0022220611572265625, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 5688400.0, "reward": 0.629464328289032, "reward_std": 0.20320014655590057, "rewards/verify_math_reward/mean": 0.6294642686843872, "rewards/verify_math_reward/std": 0.4840298891067505, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 733.450927734375, "completions/mean_terminated_length": 656.6803588867188, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08862973760932945, "grad_norm": 0.1672551929950714, "kl": 0.0013370513916015625, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 5878997.0, "reward": 0.3794642984867096, "reward_std": 0.15346980094909668, "rewards/verify_math_reward/mean": 0.3794642984867096, "rewards/verify_math_reward/std": 0.4863404929637909, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3617.0, "completions/mean_length": 714.4375610351562, "completions/mean_terminated_length": 572.8837280273438, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.09096209912536443, "grad_norm": 0.2452102154493332, "kl": 0.001861572265625, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 6066143.0, "reward": 0.504464328289032, "reward_std": 0.18757328391075134, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 650.5267944335938, "completions/mean_terminated_length": 571.8629760742188, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.09329446064139942, "grad_norm": 0.27518537640571594, "kl": 0.0014820098876953125, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 6232541.0, "reward": 0.5089285969734192, "reward_std": 0.34387195110321045, "rewards/verify_math_reward/mean": 0.5089285969734192, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 697.4955444335938, "completions/mean_terminated_length": 587.8663330078125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0956268221574344, "grad_norm": 0.2095831036567688, "kl": 0.0014362335205078125, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 6410404.0, "reward": 0.5401785969734192, "reward_std": 0.1976453959941864, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 610.857177734375, "completions/mean_terminated_length": 579.45947265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.09795918367346938, "grad_norm": 0.24536412954330444, "kl": 0.00151824951171875, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 6564876.0, "reward": 0.5848214626312256, "reward_std": 0.2439379096031189, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 600.1205444335938, "completions/mean_terminated_length": 520.305908203125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.10029154518950437, "grad_norm": 0.21419697999954224, "kl": 0.002147674560546875, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 6722031.0, "reward": 0.4464285969734192, "reward_std": 0.14249484241008759, "rewards/verify_math_reward/mean": 0.4464285671710968, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 698.8616333007812, "completions/mean_terminated_length": 637.095458984375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.10262390670553936, "grad_norm": 0.18829944729804993, "kl": 0.0019207000732421875, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 6900928.0, "reward": 0.4151785969734192, "reward_std": 0.14097853004932404, "rewards/verify_math_reward/mean": 0.4151785671710968, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 546.6517944335938, "completions/mean_terminated_length": 514.6756591796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.10495626822157435, "grad_norm": 0.27684906125068665, "kl": 0.0021114349365234375, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 7052698.0, "reward": 0.504464328289032, "reward_std": 0.2086220532655716, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 635.3795166015625, "completions/mean_terminated_length": 540.1329956054688, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.10728862973760933, "grad_norm": 0.22097277641296387, "kl": 0.0013580322265625, "learning_rate": 1e-06, "loss": 0.079, "num_tokens": 7218647.0, "reward": 0.5, "reward_std": 0.19898781180381775, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 565.9285888671875, "completions/mean_terminated_length": 518.0090942382812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.10962099125364431, "grad_norm": 0.26807278394699097, "kl": 0.003177642822265625, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 7369015.0, "reward": 0.5178571939468384, "reward_std": 0.1253589689731598, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 601.125, "completions/mean_terminated_length": 537.581787109375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1119533527696793, "grad_norm": 0.26417699456214905, "kl": 0.0020771026611328125, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 7529603.0, "reward": 0.6339285969734192, "reward_std": 0.22289641201496124, "rewards/verify_math_reward/mean": 0.6339285969734192, "rewards/verify_math_reward/std": 0.4828082025051117, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 613.513427734375, "completions/mean_terminated_length": 613.513427734375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.11428571428571428, "grad_norm": 0.2860516607761383, "kl": 0.0013675689697265625, "learning_rate": 1e-06, "loss": 0.0341, "num_tokens": 7691030.0, "reward": 0.5535714626312256, "reward_std": 0.3531966507434845, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 611.0223388671875, "completions/mean_terminated_length": 579.6261596679688, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.11661807580174927, "grad_norm": 0.2596966624259949, "kl": 0.001728057861328125, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 7848939.0, "reward": 0.5, "reward_std": 0.20636393129825592, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 641.84375, "completions/mean_terminated_length": 626.3543090820312, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.11895043731778426, "grad_norm": 0.2556436061859131, "kl": 0.0018062591552734375, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 8013008.0, "reward": 0.4910714626312256, "reward_std": 0.231617733836174, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 558.8348388671875, "completions/mean_terminated_length": 510.81903076171875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.12128279883381925, "grad_norm": 0.2848607003688812, "kl": 0.001552581787109375, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 8160843.0, "reward": 0.7098214626312256, "reward_std": 0.21447904407978058, "rewards/verify_math_reward/mean": 0.7098214030265808, "rewards/verify_math_reward/std": 0.4548610746860504, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 552.5670166015625, "completions/mean_terminated_length": 536.6771850585938, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.12361516034985423, "grad_norm": 0.27648499608039856, "kl": 0.00194549560546875, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 8310106.0, "reward": 0.5714285969734192, "reward_std": 0.21703942120075226, "rewards/verify_math_reward/mean": 0.5714285969734192, "rewards/verify_math_reward/std": 0.49597999453544617, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 583.40625, "completions/mean_terminated_length": 551.7612915039062, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1259475218658892, "grad_norm": 0.1887330263853073, "kl": 0.00136566162109375, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 8466965.0, "reward": 0.535714328289032, "reward_std": 0.1561630368232727, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 552.1964721679688, "completions/mean_terminated_length": 487.76361083984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1282798833819242, "grad_norm": 0.2779422998428345, "kl": 0.0018138885498046875, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 8613369.0, "reward": 0.535714328289032, "reward_std": 0.2553524374961853, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 571.8616333007812, "completions/mean_terminated_length": 540.1126098632812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1306122448979592, "grad_norm": 0.28782251477241516, "kl": 0.002044677734375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 8763202.0, "reward": 0.5223214626312256, "reward_std": 0.2879851758480072, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 741.4063110351562, "completions/mean_terminated_length": 695.8688354492188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.13294460641399417, "grad_norm": 0.19218860566616058, "kl": 0.001308441162109375, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 8953853.0, "reward": 0.5267857313156128, "reward_std": 0.1827620565891266, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 535.7410888671875, "completions/mean_terminated_length": 503.66668701171875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.13527696793002916, "grad_norm": 0.25593048334121704, "kl": 0.00180816650390625, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 9096979.0, "reward": 0.625, "reward_std": 0.17885924875736237, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 543.03125, "completions/mean_terminated_length": 527.0986938476562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.13760932944606413, "grad_norm": 0.24300231039524078, "kl": 0.0019474029541015625, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 9240882.0, "reward": 0.4821428656578064, "reward_std": 0.23009862005710602, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 603.7410888671875, "completions/mean_terminated_length": 572.279296875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.13994169096209913, "grad_norm": 0.17551957070827484, "kl": 0.0014858245849609375, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 9399624.0, "reward": 0.5580357313156128, "reward_std": 0.11498290300369263, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 553.6741333007812, "completions/mean_terminated_length": 521.7612915039062, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1422740524781341, "grad_norm": 0.25766316056251526, "kl": 0.0018138885498046875, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 9548591.0, "reward": 0.6875000596046448, "reward_std": 0.2413831353187561, "rewards/verify_math_reward/mean": 0.6875, "rewards/verify_math_reward/std": 0.4645504951477051, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 737.0267944335938, "completions/mean_terminated_length": 660.337890625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1446064139941691, "grad_norm": 0.20428849756717682, "kl": 0.0013713836669921875, "learning_rate": 1e-06, "loss": 0.1037, "num_tokens": 9733637.0, "reward": 0.4062500298023224, "reward_std": 0.21643324196338654, "rewards/verify_math_reward/mean": 0.40625, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 612.6785888671875, "completions/mean_terminated_length": 565.3936767578125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1469387755102041, "grad_norm": 0.22185128927230835, "kl": 0.0013828277587890625, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 9895173.0, "reward": 0.5714285969734192, "reward_std": 0.17659832537174225, "rewards/verify_math_reward/mean": 0.5714285969734192, "rewards/verify_math_reward/std": 0.49597999453544617, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 535.9330444335938, "completions/mean_terminated_length": 503.8603820800781, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.14927113702623906, "grad_norm": 0.22858451306819916, "kl": 0.0022106170654296875, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 10037494.0, "reward": 0.598214328289032, "reward_std": 0.17269554734230042, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 576.7545166015625, "completions/mean_terminated_length": 545.049560546875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.15160349854227406, "grad_norm": 0.24405136704444885, "kl": 0.00164031982421875, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 10190239.0, "reward": 0.5491071939468384, "reward_std": 0.21057626605033875, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 574.325927734375, "completions/mean_terminated_length": 558.5336303710938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.15393586005830903, "grad_norm": 0.2724969685077667, "kl": 0.0016498565673828125, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 10341360.0, "reward": 0.5580357313156128, "reward_std": 0.3142729699611664, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 570.6473388671875, "completions/mean_terminated_length": 538.8873901367188, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.15626822157434403, "grad_norm": 0.24138103425502777, "kl": 0.001644134521484375, "learning_rate": 1e-06, "loss": 0.0472, "num_tokens": 10488001.0, "reward": 0.5758928656578064, "reward_std": 0.24649550020694733, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135848045349, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 631.0982666015625, "completions/mean_terminated_length": 584.0633544921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.158600583090379, "grad_norm": 0.2019968181848526, "kl": 0.0019588470458984375, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 10650151.0, "reward": 0.4955357313156128, "reward_std": 0.19208507239818573, "rewards/verify_math_reward/mean": 0.4955357015132904, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 691.7500610351562, "completions/mean_terminated_length": 614.02734375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.160932944606414, "grad_norm": 0.26620808243751526, "kl": 0.0018100738525390625, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 10824815.0, "reward": 0.486607164144516, "reward_std": 0.3650873303413391, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 673.90625, "completions/mean_terminated_length": 595.7762451171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.16326530612244897, "grad_norm": 0.22031183540821075, "kl": 0.001644134521484375, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 10998762.0, "reward": 0.5491071939468384, "reward_std": 0.16878993809223175, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869704246520996, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 592.5803833007812, "completions/mean_terminated_length": 528.8817749023438, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.16559766763848396, "grad_norm": 0.22658729553222656, "kl": 0.0015964508056640625, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 11152276.0, "reward": 0.5535714626312256, "reward_std": 0.17525310814380646, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3690.0, "completions/mean_length": 709.1473388671875, "completions/mean_terminated_length": 599.8939819335938, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.16793002915451896, "grad_norm": 0.21466176211833954, "kl": 0.001659393310546875, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 11328877.0, "reward": 0.5491071939468384, "reward_std": 0.19929735362529755, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 714.0267944335938, "completions/mean_terminated_length": 636.812744140625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.17026239067055393, "grad_norm": 0.2443362921476364, "kl": 0.0016460418701171875, "learning_rate": 1e-06, "loss": 0.0826, "num_tokens": 11510291.0, "reward": 0.6160714626312256, "reward_std": 0.29188069701194763, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.48743006587028503, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 580.294677734375, "completions/mean_terminated_length": 548.6216430664062, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.17259475218658893, "grad_norm": 0.2611260414123535, "kl": 0.001880645751953125, "learning_rate": 1e-06, "loss": 0.0733, "num_tokens": 11669589.0, "reward": 0.6428571939468384, "reward_std": 0.23101434111595154, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 557.6830444335938, "completions/mean_terminated_length": 509.651611328125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1749271137026239, "grad_norm": 0.2751494348049164, "kl": 0.0023212432861328125, "learning_rate": 1e-06, "loss": 0.0422, "num_tokens": 11811990.0, "reward": 0.6517857313156128, "reward_std": 0.23101432621479034, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3887.0, "completions/mean_length": 698.4152221679688, "completions/mean_terminated_length": 572.5787353515625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.1772594752186589, "grad_norm": 0.29036015272140503, "kl": 0.001857757568359375, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 11994627.0, "reward": 0.5625, "reward_std": 0.32599422335624695, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 604.28125, "completions/mean_terminated_length": 556.8823852539062, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17959183673469387, "grad_norm": 0.2591899335384369, "kl": 0.0017299652099609375, "learning_rate": 1e-06, "loss": 0.0814, "num_tokens": 12149490.0, "reward": 0.7187500596046448, "reward_std": 0.25084346532821655, "rewards/verify_math_reward/mean": 0.71875, "rewards/verify_math_reward/std": 0.45061618089675903, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 528.4375, "completions/mean_terminated_length": 512.4395141601562, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.18192419825072886, "grad_norm": 0.20739056169986725, "kl": 0.0016880035400390625, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 12291684.0, "reward": 0.6651785969734192, "reward_std": 0.1684831976890564, "rewards/verify_math_reward/mean": 0.6651785969734192, "rewards/verify_math_reward/std": 0.4729849100112915, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 626.4152221679688, "completions/mean_terminated_length": 610.8565063476562, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.18425655976676386, "grad_norm": 0.24469514191150665, "kl": 0.003215789794921875, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 12459601.0, "reward": 0.6026785969734192, "reward_std": 0.24467973411083221, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 572.2277221679688, "completions/mean_terminated_length": 540.4819946289062, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.18658892128279883, "grad_norm": 0.24989625811576843, "kl": 0.002498626708984375, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 12608020.0, "reward": 0.5535714626312256, "reward_std": 0.18098174035549164, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 579.3839721679688, "completions/mean_terminated_length": 547.7026977539062, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.18892128279883383, "grad_norm": 0.3420480489730835, "kl": 0.00213623046875, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 12757106.0, "reward": 0.6473214626312256, "reward_std": 0.24980498850345612, "rewards/verify_math_reward/mean": 0.6473214030265808, "rewards/verify_math_reward/std": 0.4788738489151001, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 696.6250610351562, "completions/mean_terminated_length": 650.4796752929688, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.1912536443148688, "grad_norm": 0.23296111822128296, "kl": 0.0013065338134765625, "learning_rate": 1e-06, "loss": 0.036, "num_tokens": 12934654.0, "reward": 0.424107164144516, "reward_std": 0.2513140141963959, "rewards/verify_math_reward/mean": 0.4241071343421936, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 478.73663330078125, "completions/mean_terminated_length": 462.5157165527344, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1935860058309038, "grad_norm": 0.23368965089321136, "kl": 0.001613616943359375, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 13062099.0, "reward": 0.723214328289032, "reward_std": 0.15872061252593994, "rewards/verify_math_reward/mean": 0.7232142686843872, "rewards/verify_math_reward/std": 0.4484116733074188, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3744.0, "completions/mean_length": 590.7678833007812, "completions/mean_terminated_length": 543.185546875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.19591836734693877, "grad_norm": 0.2560170590877533, "kl": 0.001621246337890625, "learning_rate": 1e-06, "loss": 0.07, "num_tokens": 13215607.0, "reward": 0.6026785969734192, "reward_std": 0.2296663522720337, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 664.6830444335938, "completions/mean_terminated_length": 602.29541015625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.19825072886297376, "grad_norm": 0.18460825085639954, "kl": 0.0015735626220703125, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 13385184.0, "reward": 0.5625, "reward_std": 0.139630526304245, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3385.0, "completions/mean_length": 741.6875610351562, "completions/mean_terminated_length": 696.1538696289062, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.20058309037900873, "grad_norm": 0.2350245714187622, "kl": 0.00148773193359375, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 13578010.0, "reward": 0.4151785969734192, "reward_std": 0.17599214613437653, "rewards/verify_math_reward/mean": 0.4151785671710968, "rewards/verify_math_reward/std": 0.49385643005371094, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 644.90625, "completions/mean_terminated_length": 629.4305419921875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.20291545189504373, "grad_norm": 0.27613499760627747, "kl": 0.00165557861328125, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 13744445.0, "reward": 0.5625, "reward_std": 0.2765706479549408, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 580.6875, "completions/mean_terminated_length": 549.0180053710938, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.20524781341107873, "grad_norm": 0.19442224502563477, "kl": 0.0015697479248046875, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 13900599.0, "reward": 0.5535714626312256, "reward_std": 0.16457758843898773, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 640.9420166015625, "completions/mean_terminated_length": 594.040771484375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2075801749271137, "grad_norm": 0.22410327196121216, "kl": 0.0014171600341796875, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 14068834.0, "reward": 0.629464328289032, "reward_std": 0.16878992319107056, "rewards/verify_math_reward/mean": 0.6294642686843872, "rewards/verify_math_reward/std": 0.4840298593044281, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 769.950927734375, "completions/mean_terminated_length": 662.6589965820312, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.2099125364431487, "grad_norm": 0.21688145399093628, "kl": 0.0013904571533203125, "learning_rate": 1e-06, "loss": 0.059, "num_tokens": 14264527.0, "reward": 0.5401785969734192, "reward_std": 0.18532243371009827, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 495.21429443359375, "completions/mean_terminated_length": 495.21429443359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.21224489795918366, "grad_norm": 0.25790855288505554, "kl": 0.0018939971923828125, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 14393703.0, "reward": 0.754464328289032, "reward_std": 0.21222370862960815, "rewards/verify_math_reward/mean": 0.7544642686843872, "rewards/verify_math_reward/std": 0.43136832118034363, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 623.0178833007812, "completions/mean_terminated_length": 543.7260131835938, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.21457725947521866, "grad_norm": 0.27001768350601196, "kl": 0.0022411346435546875, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 14551459.0, "reward": 0.65625, "reward_std": 0.252055823802948, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 676.3348388671875, "completions/mean_terminated_length": 645.5270385742188, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.21690962099125363, "grad_norm": 0.21039845049381256, "kl": 0.001399993896484375, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 14722078.0, "reward": 0.535714328289032, "reward_std": 0.1814168244600296, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 643.388427734375, "completions/mean_terminated_length": 580.6136474609375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.21924198250728863, "grad_norm": 0.1937643587589264, "kl": 0.0016651153564453125, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 14890149.0, "reward": 0.5133928656578064, "reward_std": 0.1668357253074646, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 605.794677734375, "completions/mean_terminated_length": 558.4163208007812, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.22157434402332363, "grad_norm": 0.2784527838230133, "kl": 0.0018215179443359375, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 15045703.0, "reward": 0.598214328289032, "reward_std": 0.2530971169471741, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 695.3035888671875, "completions/mean_terminated_length": 585.6036987304688, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2239067055393586, "grad_norm": 0.18985792994499207, "kl": 0.0014095306396484375, "learning_rate": 1e-06, "loss": 0.0325, "num_tokens": 15222819.0, "reward": 0.4687500298023224, "reward_std": 0.20905713737010956, "rewards/verify_math_reward/mean": 0.46875, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 663.0982666015625, "completions/mean_terminated_length": 616.4977416992188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2262390670553936, "grad_norm": 0.19274626672267914, "kl": 0.0014247894287109375, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 15393673.0, "reward": 0.5446428656578064, "reward_std": 0.16818374395370483, "rewards/verify_math_reward/mean": 0.5446428656578064, "rewards/verify_math_reward/std": 0.4991183876991272, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 812.1160888671875, "completions/mean_terminated_length": 690.49072265625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.22857142857142856, "grad_norm": 0.21691066026687622, "kl": 0.00139617919921875, "learning_rate": 1e-06, "loss": -0.0172, "num_tokens": 15598387.0, "reward": 0.4419642984867096, "reward_std": 0.18953195214271545, "rewards/verify_math_reward/mean": 0.4419642984867096, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 550.8705444335938, "completions/mean_terminated_length": 518.9324340820312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.23090379008746356, "grad_norm": 0.2327047884464264, "kl": 0.00212860107421875, "learning_rate": 1e-06, "loss": 0.0458, "num_tokens": 15741534.0, "reward": 0.5848214626312256, "reward_std": 0.1973431557416916, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 659.9152221679688, "completions/mean_terminated_length": 613.2715454101562, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.23323615160349853, "grad_norm": 0.18836602568626404, "kl": 0.0020809173583984375, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 15909715.0, "reward": 0.504464328289032, "reward_std": 0.204110249876976, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 629.607177734375, "completions/mean_terminated_length": 614.0628051757812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.23556851311953353, "grad_norm": 0.23415729403495789, "kl": 0.001430511474609375, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 16071699.0, "reward": 0.424107164144516, "reward_std": 0.21672989428043365, "rewards/verify_math_reward/mean": 0.4241071343421936, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 678.6428833007812, "completions/mean_terminated_length": 600.6209716796875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.23790087463556853, "grad_norm": 0.2581215500831604, "kl": 0.0017566680908203125, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 16247475.0, "reward": 0.535714328289032, "reward_std": 0.22650256752967834, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 532.5714721679688, "completions/mean_terminated_length": 516.5919799804688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2402332361516035, "grad_norm": 0.23102478682994843, "kl": 0.002101898193359375, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 16386339.0, "reward": 0.566964328289032, "reward_std": 0.163971409201622, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 620.6875, "completions/mean_terminated_length": 589.37841796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2425655976676385, "grad_norm": 0.25336551666259766, "kl": 0.0013637542724609375, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 16548565.0, "reward": 0.4776785969734192, "reward_std": 0.27609726786613464, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 544.1435546875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.24489795918367346, "grad_norm": 0.25496363639831543, "kl": 0.0017795562744140625, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 16693973.0, "reward": 0.6026785969734192, "reward_std": 0.24198931455612183, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395341873169, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3834.0, "completions/mean_length": 580.0982666015625, "completions/mean_terminated_length": 532.37109375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.24723032069970846, "grad_norm": 0.2990919053554535, "kl": 0.0017261505126953125, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 16847707.0, "reward": 0.625, "reward_std": 0.27383914589881897, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 589.3527221679688, "completions/mean_terminated_length": 557.7612915039062, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.24956268221574343, "grad_norm": 0.28804782032966614, "kl": 0.00211334228515625, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 17003810.0, "reward": 0.4776785969734192, "reward_std": 0.20905713737010956, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 706.8928833007812, "completions/mean_terminated_length": 581.370361328125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2518950437317784, "grad_norm": 0.1757209748029709, "kl": 0.0017375946044921875, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 17182426.0, "reward": 0.5892857313156128, "reward_std": 0.1202336996793747, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 682.0848388671875, "completions/mean_terminated_length": 635.7421264648438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2542274052478134, "grad_norm": 0.29106491804122925, "kl": 0.0016193389892578125, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 17363301.0, "reward": 0.4330357313156128, "reward_std": 0.24424462020397186, "rewards/verify_math_reward/mean": 0.4330357015132904, "rewards/verify_math_reward/std": 0.49660524725914, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 519.4285888671875, "completions/mean_terminated_length": 503.3901672363281, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2565597667638484, "grad_norm": 0.26929226517677307, "kl": 0.0020198822021484375, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 17500813.0, "reward": 0.5892857313156128, "reward_std": 0.22229304909706116, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 566.1517944335938, "completions/mean_terminated_length": 518.2352905273438, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2588921282798834, "grad_norm": 0.19264018535614014, "kl": 0.0015964508056640625, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 17646687.0, "reward": 0.5803571939468384, "reward_std": 0.16397422552108765, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460577964782715, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 661.1205444335938, "completions/mean_terminated_length": 598.6681518554688, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2612244897959184, "grad_norm": 0.24941198527812958, "kl": 0.0016078948974609375, "learning_rate": 1e-06, "loss": 0.062, "num_tokens": 17816466.0, "reward": 0.5133928656578064, "reward_std": 0.25040388107299805, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 640.3348388671875, "completions/mean_terminated_length": 561.4383544921875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.26355685131195333, "grad_norm": 0.2555276155471802, "kl": 0.0020389556884765625, "learning_rate": 1e-06, "loss": 0.0535, "num_tokens": 17979205.0, "reward": 0.535714328289032, "reward_std": 0.2540072202682495, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 600.3214721679688, "completions/mean_terminated_length": 568.828857421875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.26588921282798833, "grad_norm": 0.21167892217636108, "kl": 0.0017452239990234375, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 18137877.0, "reward": 0.4732142984867096, "reward_std": 0.19086988270282745, "rewards/verify_math_reward/mean": 0.4732142984867096, "rewards/verify_math_reward/std": 0.5004002451896667, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 576.1473388671875, "completions/mean_terminated_length": 576.1473388671875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.26822157434402333, "grad_norm": 0.2546173632144928, "kl": 0.0017337799072265625, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 18293206.0, "reward": 0.4330357313156128, "reward_std": 0.2455853670835495, "rewards/verify_math_reward/mean": 0.4330357015132904, "rewards/verify_math_reward/std": 0.49660518765449524, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 715.8795166015625, "completions/mean_terminated_length": 622.8485717773438, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2705539358600583, "grad_norm": 0.2411630004644394, "kl": 0.0017833709716796875, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 18472771.0, "reward": 0.5892857313156128, "reward_std": 0.19629742205142975, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 624.0491333007812, "completions/mean_terminated_length": 560.9227294921875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.27288629737609327, "grad_norm": 0.2885005474090576, "kl": 0.0020599365234375, "learning_rate": 1e-06, "loss": 0.0867, "num_tokens": 18644870.0, "reward": 0.4375000298023224, "reward_std": 0.2346104085445404, "rewards/verify_math_reward/mean": 0.4375, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 527.6473388671875, "completions/mean_terminated_length": 462.7681579589844, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.27521865889212827, "grad_norm": 0.24338369071483612, "kl": 0.0029697418212890625, "learning_rate": 1e-06, "loss": 0.0836, "num_tokens": 18784303.0, "reward": 0.7053571939468384, "reward_std": 0.17946264147758484, "rewards/verify_math_reward/mean": 0.7053571343421936, "rewards/verify_math_reward/std": 0.45690304040908813, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 579.0223388671875, "completions/mean_terminated_length": 547.3378295898438, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.27755102040816326, "grad_norm": 0.27034732699394226, "kl": 0.0016078948974609375, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 18935492.0, "reward": 0.566964328289032, "reward_std": 0.25010165572166443, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 606.669677734375, "completions/mean_terminated_length": 575.2342529296875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.27988338192419826, "grad_norm": 0.18903683125972748, "kl": 0.0017871856689453125, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 19091674.0, "reward": 0.5491071939468384, "reward_std": 0.14640043675899506, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 726.3438110351562, "completions/mean_terminated_length": 633.6008911132812, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.28221574344023326, "grad_norm": 0.2031654566526413, "kl": 0.001880645751953125, "learning_rate": 1e-06, "loss": 0.0537, "num_tokens": 19273375.0, "reward": 0.566964328289032, "reward_std": 0.18862183392047882, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.4966052174568176, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3289.0, "completions/mean_length": 618.388427734375, "completions/mean_terminated_length": 571.1810302734375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2845481049562682, "grad_norm": 0.2487492859363556, "kl": 0.001918792724609375, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 19432982.0, "reward": 0.5758928656578064, "reward_std": 0.21282710134983063, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 732.3482666015625, "completions/mean_terminated_length": 655.552490234375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2868804664723032, "grad_norm": 0.22322975099086761, "kl": 0.00141143798828125, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 19621012.0, "reward": 0.4821428656578064, "reward_std": 0.23417532444000244, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 578.107177734375, "completions/mean_terminated_length": 562.3318481445312, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2892128279883382, "grad_norm": 0.21703268587589264, "kl": 0.0018138885498046875, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 19771316.0, "reward": 0.5892857313156128, "reward_std": 0.2346104085445404, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 568.9330444335938, "completions/mean_terminated_length": 537.1576538085938, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2915451895043732, "grad_norm": 0.26182636618614197, "kl": 0.002227783203125, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 19921253.0, "reward": 0.625, "reward_std": 0.23357194662094116, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 543.5625, "completions/mean_terminated_length": 527.63232421875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2938775510204082, "grad_norm": 0.3207721412181854, "kl": 0.00211334228515625, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 20062099.0, "reward": 0.6071428656578064, "reward_std": 0.30073481798171997, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 616.7767944335938, "completions/mean_terminated_length": 601.1749267578125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.29620991253644313, "grad_norm": 0.21629698574543, "kl": 0.002124786376953125, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 20223609.0, "reward": 0.4642857313156128, "reward_std": 0.16683855652809143, "rewards/verify_math_reward/mean": 0.4642857015132904, "rewards/verify_math_reward/std": 0.49983978271484375, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 605.419677734375, "completions/mean_terminated_length": 605.419677734375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.29854227405247813, "grad_norm": 0.2856070101261139, "kl": 0.0027828216552734375, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 20385127.0, "reward": 0.4821428656578064, "reward_std": 0.3123260736465454, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 468.0625305175781, "completions/mean_terminated_length": 468.0625305175781, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3008746355685131, "grad_norm": 0.2847056984901428, "kl": 0.00231170654296875, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 20510829.0, "reward": 0.5580357313156128, "reward_std": 0.2102695107460022, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 590.3125, "completions/mean_terminated_length": 542.7239990234375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3032069970845481, "grad_norm": 0.22741061449050903, "kl": 0.0025844573974609375, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 20662627.0, "reward": 0.4598214626312256, "reward_std": 0.2215484380722046, "rewards/verify_math_reward/mean": 0.4598214328289032, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 511.99554443359375, "completions/mean_terminated_length": 479.70721435546875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.30553935860058307, "grad_norm": 0.24569222331047058, "kl": 0.002933502197265625, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 20795994.0, "reward": 0.6651785969734192, "reward_std": 0.16818653047084808, "rewards/verify_math_reward/mean": 0.6651785969734192, "rewards/verify_math_reward/std": 0.4729849100112915, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 615.7366333007812, "completions/mean_terminated_length": 536.2785034179688, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.30787172011661806, "grad_norm": 0.30736494064331055, "kl": 0.0018939971923828125, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 20961351.0, "reward": 0.4732142984867096, "reward_std": 0.24589939415454865, "rewards/verify_math_reward/mean": 0.4732142984867096, "rewards/verify_math_reward/std": 0.5004002451896667, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 518.5089721679688, "completions/mean_terminated_length": 486.279296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.31020408163265306, "grad_norm": 0.27623245120048523, "kl": 0.002376556396484375, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 21097609.0, "reward": 0.6428571939468384, "reward_std": 0.19013814628124237, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 599.4777221679688, "completions/mean_terminated_length": 552.0136108398438, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.31253644314868806, "grad_norm": 0.26248371601104736, "kl": 0.0019016265869140625, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 21252732.0, "reward": 0.5535714626312256, "reward_std": 0.2783453166484833, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 625.3839721679688, "completions/mean_terminated_length": 609.8206787109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.31486880466472306, "grad_norm": 0.24094338715076447, "kl": 0.0017375946044921875, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 21413170.0, "reward": 0.5133928656578064, "reward_std": 0.2057577222585678, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 616.8527221679688, "completions/mean_terminated_length": 601.2511596679688, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.317201166180758, "grad_norm": 0.2017364650964737, "kl": 0.00182342529296875, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 21573433.0, "reward": 0.4955357313156128, "reward_std": 0.16592560708522797, "rewards/verify_math_reward/mean": 0.4955357015132904, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 620.1964721679688, "completions/mean_terminated_length": 604.6099243164062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.319533527696793, "grad_norm": 0.2938466966152191, "kl": 0.0016841888427734375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 21736213.0, "reward": 0.5267857313156128, "reward_std": 0.2657212018966675, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 572.7232666015625, "completions/mean_terminated_length": 556.9237670898438, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.321865889212828, "grad_norm": 0.21208110451698303, "kl": 0.0015926361083984375, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 21885079.0, "reward": 0.5892857313156128, "reward_std": 0.18397442996501923, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 668.8705444335938, "completions/mean_terminated_length": 574.5458374023438, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.324198250728863, "grad_norm": 0.17574279010295868, "kl": 0.0015773773193359375, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 22054858.0, "reward": 0.6830357313156128, "reward_std": 0.16006861627101898, "rewards/verify_math_reward/mean": 0.6830357313156128, "rewards/verify_math_reward/std": 0.4663354754447937, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 630.0625, "completions/mean_terminated_length": 550.9314575195312, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.32653061224489793, "grad_norm": 0.26903802156448364, "kl": 0.0017642974853515625, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 22215264.0, "reward": 0.5, "reward_std": 0.2637726068496704, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 607.9285888671875, "completions/mean_terminated_length": 544.5090942382812, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.32886297376093293, "grad_norm": 0.2991168200969696, "kl": 0.00200653076171875, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 22376632.0, "reward": 0.5535714626312256, "reward_std": 0.24363845586776733, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 655.3616333007812, "completions/mean_terminated_length": 608.6561279296875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.33119533527696793, "grad_norm": 0.25582826137542725, "kl": 0.0018901824951171875, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 22544921.0, "reward": 0.5401785969734192, "reward_std": 0.23326799273490906, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 616.21875, "completions/mean_terminated_length": 584.869384765625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3335276967930029, "grad_norm": 0.24448157846927643, "kl": 0.0015926361083984375, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 22703818.0, "reward": 0.5625, "reward_std": 0.18562637269496918, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 667.4598388671875, "completions/mean_terminated_length": 620.9185791015625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3358600583090379, "grad_norm": 0.2597779631614685, "kl": 0.0016498565673828125, "learning_rate": 1e-06, "loss": 0.0698, "num_tokens": 22878609.0, "reward": 0.5133928656578064, "reward_std": 0.25401005148887634, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 655.3839721679688, "completions/mean_terminated_length": 624.3873901367188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.33819241982507287, "grad_norm": 0.2341889888048172, "kl": 0.0018711090087890625, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 23045887.0, "reward": 0.455357164144516, "reward_std": 0.19343754649162292, "rewards/verify_math_reward/mean": 0.4553571343421936, "rewards/verify_math_reward/std": 0.4991183578968048, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 602.3660888671875, "completions/mean_terminated_length": 489.668212890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.34052478134110786, "grad_norm": 0.28709134459495544, "kl": 0.00226593017578125, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 23198337.0, "reward": 0.5848214626312256, "reward_std": 0.26121222972869873, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3598.0, "completions/mean_length": 759.6205444335938, "completions/mean_terminated_length": 603.7149047851562, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.34285714285714286, "grad_norm": 0.20393072068691254, "kl": 0.0019683837890625, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 23391076.0, "reward": 0.5133928656578064, "reward_std": 0.1720893532037735, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 521.03125, "completions/mean_terminated_length": 472.5022888183594, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.34518950437317786, "grad_norm": 0.30667921900749207, "kl": 0.00206756591796875, "learning_rate": 1e-06, "loss": 0.0489, "num_tokens": 23523939.0, "reward": 0.625, "reward_std": 0.23296292126178741, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3738.0, "completions/mean_length": 623.5178833007812, "completions/mean_terminated_length": 560.3817749023438, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.34752186588921286, "grad_norm": 0.23010538518428802, "kl": 0.0018062591552734375, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 23689143.0, "reward": 0.4642857313156128, "reward_std": 0.22289641201496124, "rewards/verify_math_reward/mean": 0.4642857015132904, "rewards/verify_math_reward/std": 0.49983978271484375, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 587.0803833007812, "completions/mean_terminated_length": 539.447998046875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.3498542274052478, "grad_norm": 0.31920623779296875, "kl": 0.0019931793212890625, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 23841681.0, "reward": 0.5267857313156128, "reward_std": 0.2663346827030182, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 583.2902221679688, "completions/mean_terminated_length": 551.6441650390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.3521865889212828, "grad_norm": 0.22783829271793365, "kl": 0.0017986297607421875, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 23994498.0, "reward": 0.53125, "reward_std": 0.1931336373090744, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3782.0, "completions/mean_length": 762.4152221679688, "completions/mean_terminated_length": 606.6401977539062, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.3545189504373178, "grad_norm": 0.1953461915254593, "kl": 0.0016193389892578125, "learning_rate": 1e-06, "loss": 0.0531, "num_tokens": 24185887.0, "reward": 0.6116071939468384, "reward_std": 0.18306151032447815, "rewards/verify_math_reward/mean": 0.6116071343421936, "rewards/verify_math_reward/std": 0.4884762763977051, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 547.544677734375, "completions/mean_terminated_length": 531.63232421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.3568513119533528, "grad_norm": 0.2813705503940582, "kl": 0.0021514892578125, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 24331713.0, "reward": 0.486607164144516, "reward_std": 0.2268020212650299, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 689.7678833007812, "completions/mean_terminated_length": 579.889404296875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.35918367346938773, "grad_norm": 0.23221136629581451, "kl": 0.001651763916015625, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 24516269.0, "reward": 0.5178571939468384, "reward_std": 0.2346103936433792, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 539.294677734375, "completions/mean_terminated_length": 507.25225830078125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.36151603498542273, "grad_norm": 0.27546215057373047, "kl": 0.00171661376953125, "learning_rate": 1e-06, "loss": -0.0212, "num_tokens": 24655727.0, "reward": 0.660714328289032, "reward_std": 0.2150852382183075, "rewards/verify_math_reward/mean": 0.6607142686843872, "rewards/verify_math_reward/std": 0.4745272994041443, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 639.1964721679688, "completions/mean_terminated_length": 560.27392578125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3638483965014577, "grad_norm": 0.21887610852718353, "kl": 0.0019931793212890625, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 24822891.0, "reward": 0.5089285969734192, "reward_std": 0.2038063406944275, "rewards/verify_math_reward/mean": 0.5089285969734192, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 576.6027221679688, "completions/mean_terminated_length": 512.6136474609375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3661807580174927, "grad_norm": 0.24248209595680237, "kl": 0.0025463104248046875, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 24977922.0, "reward": 0.5848214626312256, "reward_std": 0.1690966635942459, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 608.7410888671875, "completions/mean_terminated_length": 545.3363647460938, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3685131195335277, "grad_norm": 0.21610601246356964, "kl": 0.0019626617431640625, "learning_rate": 1e-06, "loss": 0.0501, "num_tokens": 25141424.0, "reward": 0.5401785969734192, "reward_std": 0.16804811358451843, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 554.3527221679688, "completions/mean_terminated_length": 554.3527221679688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.37084548104956266, "grad_norm": 0.2853037118911743, "kl": 0.002079010009765625, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 25286391.0, "reward": 0.5535714626312256, "reward_std": 0.28707224130630493, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 564.6830444335938, "completions/mean_terminated_length": 564.6830444335938, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.37317784256559766, "grad_norm": 0.2480822205543518, "kl": 0.0020084381103515625, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 25441424.0, "reward": 0.4910714626312256, "reward_std": 0.23417532444000244, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 603.9285888671875, "completions/mean_terminated_length": 572.468505859375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.37551020408163266, "grad_norm": 0.2697044610977173, "kl": 0.0019435882568359375, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 25606600.0, "reward": 0.424107164144516, "reward_std": 0.33350035548210144, "rewards/verify_math_reward/mean": 0.4241071343421936, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 888.4553833007812, "completions/mean_terminated_length": 738.570068359375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.37784256559766766, "grad_norm": 0.21930626034736633, "kl": 0.00145721435546875, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 25828246.0, "reward": 0.4776785969734192, "reward_std": 0.17885644733905792, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 640.8527221679688, "completions/mean_terminated_length": 578.0317993164062, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3801749271137026, "grad_norm": 0.20078200101852417, "kl": 0.0017642974853515625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 26001229.0, "reward": 0.4375000298023224, "reward_std": 0.1979493498802185, "rewards/verify_math_reward/mean": 0.4375, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 548.919677734375, "completions/mean_terminated_length": 533.0134887695312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3825072886297376, "grad_norm": 0.2016945630311966, "kl": 0.0019893646240234375, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 26144011.0, "reward": 0.6205357313156128, "reward_std": 0.17854970693588257, "rewards/verify_math_reward/mean": 0.6205357313156128, "rewards/verify_math_reward/std": 0.4863404929637909, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 575.2678833007812, "completions/mean_terminated_length": 543.549560546875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3848396501457726, "grad_norm": 0.18787410855293274, "kl": 0.002079010009765625, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 26298183.0, "reward": 0.4419642984867096, "reward_std": 0.1489580273628235, "rewards/verify_math_reward/mean": 0.4419642984867096, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 598.0223388671875, "completions/mean_terminated_length": 550.5385131835938, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3871720116618076, "grad_norm": 0.18036533892154694, "kl": 0.0022106170654296875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 26453404.0, "reward": 0.5625, "reward_std": 0.1441422998905182, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 606.2366333007812, "completions/mean_terminated_length": 606.2366333007812, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3895043731778426, "grad_norm": 0.2669314444065094, "kl": 0.0019855499267578125, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 26612577.0, "reward": 0.4776785969734192, "reward_std": 0.21478131413459778, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 523.4598388671875, "completions/mean_terminated_length": 507.4394836425781, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.39183673469387753, "grad_norm": 0.25433677434921265, "kl": 0.002719879150390625, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 26751080.0, "reward": 0.6651785969734192, "reward_std": 0.19117942452430725, "rewards/verify_math_reward/mean": 0.6651785969734192, "rewards/verify_math_reward/std": 0.4729849696159363, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 666.044677734375, "completions/mean_terminated_length": 539.00927734375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.39416909620991253, "grad_norm": 0.23573501408100128, "kl": 0.0018463134765625, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 26919794.0, "reward": 0.5178571939468384, "reward_std": 0.20667067170143127, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 573.107177734375, "completions/mean_terminated_length": 541.369384765625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3965014577259475, "grad_norm": 0.21281416714191437, "kl": 0.002506256103515625, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 27068994.0, "reward": 0.5803571939468384, "reward_std": 0.17976489663124084, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 611.1607666015625, "completions/mean_terminated_length": 563.855224609375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3988338192419825, "grad_norm": 0.2572023570537567, "kl": 0.002071380615234375, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 27228854.0, "reward": 0.5848214626312256, "reward_std": 0.2900620996952057, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 559.3125, "completions/mean_terminated_length": 527.450439453125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.40116618075801747, "grad_norm": 0.25404220819473267, "kl": 0.002285003662109375, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 27376972.0, "reward": 0.5848214626312256, "reward_std": 0.2592580318450928, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 540.6741333007812, "completions/mean_terminated_length": 492.4117736816406, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.40349854227405246, "grad_norm": 0.24391190707683563, "kl": 0.0030002593994140625, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 27521619.0, "reward": 0.6428571939468384, "reward_std": 0.18623536825180054, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 542.6964721679688, "completions/mean_terminated_length": 494.4615478515625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.40583090379008746, "grad_norm": 0.28649452328681946, "kl": 0.00246429443359375, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 27665023.0, "reward": 0.5758928656578064, "reward_std": 0.263334721326828, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 534.6964721679688, "completions/mean_terminated_length": 502.61260986328125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.40816326530612246, "grad_norm": 0.2814836800098419, "kl": 0.00211334228515625, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 27804099.0, "reward": 0.5446428656578064, "reward_std": 0.20966331660747528, "rewards/verify_math_reward/mean": 0.5446428656578064, "rewards/verify_math_reward/std": 0.4991183578968048, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 538.982177734375, "completions/mean_terminated_length": 506.93695068359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.41049562682215746, "grad_norm": 0.27930331230163574, "kl": 0.003086090087890625, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 27946343.0, "reward": 0.629464328289032, "reward_std": 0.257912814617157, "rewards/verify_math_reward/mean": 0.6294642686843872, "rewards/verify_math_reward/std": 0.4840298891067505, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 567.2142944335938, "completions/mean_terminated_length": 535.4234619140625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4128279883381924, "grad_norm": 0.22507323324680328, "kl": 0.002315521240234375, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 28090431.0, "reward": 0.6339285969734192, "reward_std": 0.23205281794071198, "rewards/verify_math_reward/mean": 0.6339285969734192, "rewards/verify_math_reward/std": 0.4828082025051117, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 650.0267944335938, "completions/mean_terminated_length": 587.3726806640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4151603498542274, "grad_norm": 0.20785152912139893, "kl": 0.002262115478515625, "learning_rate": 1e-06, "loss": 0.0516, "num_tokens": 28262613.0, "reward": 0.6116071939468384, "reward_std": 0.163971409201622, "rewards/verify_math_reward/mean": 0.6116071343421936, "rewards/verify_math_reward/std": 0.4884762465953827, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 614.375, "completions/mean_terminated_length": 534.8858032226562, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4174927113702624, "grad_norm": 0.24259614944458008, "kl": 0.0018787384033203125, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 28429033.0, "reward": 0.5267857313156128, "reward_std": 0.23942892253398895, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 603.3035888671875, "completions/mean_terminated_length": 571.8378295898438, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.4198250728862974, "grad_norm": 0.2416960448026657, "kl": 0.002521514892578125, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 28585157.0, "reward": 0.5758928656578064, "reward_std": 0.22558964788913727, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 677.4420166015625, "completions/mean_terminated_length": 615.2863159179688, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4221574344023324, "grad_norm": 0.20833344757556915, "kl": 0.002132415771484375, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 28758456.0, "reward": 0.5178571939468384, "reward_std": 0.1979493498802185, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 615.7678833007812, "completions/mean_terminated_length": 584.4144287109375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.42448979591836733, "grad_norm": 0.21140404045581818, "kl": 0.00281524658203125, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 28918340.0, "reward": 0.5133928656578064, "reward_std": 0.17946544289588928, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 633.3795166015625, "completions/mean_terminated_length": 586.3756103515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4268221574344023, "grad_norm": 0.25119203329086304, "kl": 0.002170562744140625, "learning_rate": 1e-06, "loss": 0.0742, "num_tokens": 29083001.0, "reward": 0.6160714626312256, "reward_std": 0.2501044273376465, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.48743006587028503, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 603.71875, "completions/mean_terminated_length": 588.058349609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.4291545189504373, "grad_norm": 0.21964031457901, "kl": 0.0020694732666015625, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 29240394.0, "reward": 0.5803571939468384, "reward_std": 0.1956884115934372, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 815.8527221679688, "completions/mean_terminated_length": 710.04150390625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4314868804664723, "grad_norm": 0.1969069242477417, "kl": 0.0018215179443359375, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 29445745.0, "reward": 0.4687500298023224, "reward_std": 0.21057625114917755, "rewards/verify_math_reward/mean": 0.46875, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 531.6607666015625, "completions/mean_terminated_length": 515.6771850585938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.43381924198250726, "grad_norm": 0.3044220209121704, "kl": 0.00237274169921875, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 29582717.0, "reward": 0.5089285969734192, "reward_std": 0.24363845586776733, "rewards/verify_math_reward/mean": 0.5089285969734192, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3589.0, "completions/mean_length": 565.0491333007812, "completions/mean_terminated_length": 533.23876953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.43615160349854226, "grad_norm": 0.25785499811172485, "kl": 0.002742767333984375, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 29727992.0, "reward": 0.6473214626312256, "reward_std": 0.23521658778190613, "rewards/verify_math_reward/mean": 0.6473214030265808, "rewards/verify_math_reward/std": 0.4788738489151001, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 661.5535888671875, "completions/mean_terminated_length": 630.6126098632812, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.43848396501457726, "grad_norm": 0.2619020342826843, "kl": 0.002166748046875, "learning_rate": 1e-06, "loss": 0.0682, "num_tokens": 29901892.0, "reward": 0.5267857313156128, "reward_std": 0.2770000994205475, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 694.2053833007812, "completions/mean_terminated_length": 616.5387573242188, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.44081632653061226, "grad_norm": 0.20431652665138245, "kl": 0.0028629302978515625, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 30079418.0, "reward": 0.4910714626312256, "reward_std": 0.17269553244113922, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 583.53125, "completions/mean_terminated_length": 551.8873901367188, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.44314868804664725, "grad_norm": 0.2757954001426697, "kl": 0.00231170654296875, "learning_rate": 1e-06, "loss": -0.0136, "num_tokens": 30236433.0, "reward": 0.5580357313156128, "reward_std": 0.1892252266407013, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 480.9285888671875, "completions/mean_terminated_length": 480.9285888671875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4454810495626822, "grad_norm": 0.23335270583629608, "kl": 0.00286102294921875, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 30371673.0, "reward": 0.6026785969734192, "reward_std": 0.16878993809223175, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395341873169, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 605.794677734375, "completions/mean_terminated_length": 574.3513793945312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4478134110787172, "grad_norm": 0.25478288531303406, "kl": 0.002330780029296875, "learning_rate": 1e-06, "loss": 0.0411, "num_tokens": 30526179.0, "reward": 0.5178571939468384, "reward_std": 0.22545400261878967, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 714.9107666015625, "completions/mean_terminated_length": 637.7168579101562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4501457725947522, "grad_norm": 0.20080216228961945, "kl": 0.001979827880859375, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 30708671.0, "reward": 0.3705357313156128, "reward_std": 0.20801867544651031, "rewards/verify_math_reward/mean": 0.3705357015132904, "rewards/verify_math_reward/std": 0.4840298593044281, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 679.46875, "completions/mean_terminated_length": 633.0905151367188, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4524781341107872, "grad_norm": 0.23196423053741455, "kl": 0.001995086669921875, "learning_rate": 1e-06, "loss": 0.0736, "num_tokens": 30885216.0, "reward": 0.486607164144516, "reward_std": 0.257912814617157, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 497.83929443359375, "completions/mean_terminated_length": 497.83929443359375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.45481049562682213, "grad_norm": 0.2476566582918167, "kl": 0.0028533935546875, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 31013244.0, "reward": 0.6160714626312256, "reward_std": 0.1827620565891266, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.48743006587028503, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3214.0, "completions/max_terminated_length": 3214.0, "completions/mean_length": 527.8303833007812, "completions/mean_terminated_length": 527.8303833007812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.45714285714285713, "grad_norm": 0.2752782702445984, "kl": 0.002559661865234375, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 31151286.0, "reward": 0.5, "reward_std": 0.21387286484241486, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 556.5402221679688, "completions/mean_terminated_length": 492.18634033203125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4594752186588921, "grad_norm": 0.2352851927280426, "kl": 0.00229644775390625, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 31306103.0, "reward": 0.53125, "reward_std": 0.21117517352104187, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 682.6473388671875, "completions/mean_terminated_length": 539.7628173828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.4618075801749271, "grad_norm": 0.24791188538074493, "kl": 0.0019893646240234375, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 31483240.0, "reward": 0.5401785969734192, "reward_std": 0.2086220532655716, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 726.6116333007812, "completions/mean_terminated_length": 617.921630859375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.4641399416909621, "grad_norm": 0.1901516169309616, "kl": 0.0020084381103515625, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 31671865.0, "reward": 0.4598214626312256, "reward_std": 0.19239181280136108, "rewards/verify_math_reward/mean": 0.4598214328289032, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 2103.0, "completions/mean_length": 683.0580444335938, "completions/mean_terminated_length": 540.190673828125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.46647230320699706, "grad_norm": 0.26182863116264343, "kl": 0.002254486083984375, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 31847166.0, "reward": 0.59375, "reward_std": 0.2502654790878296, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 587.3482666015625, "completions/mean_terminated_length": 555.73876953125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.46880466472303206, "grad_norm": 0.28166404366493225, "kl": 0.00238037109375, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 32003524.0, "reward": 0.5580357313156128, "reward_std": 0.26121222972869873, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 632.5714721679688, "completions/mean_terminated_length": 601.369384765625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.47113702623906706, "grad_norm": 0.22461578249931335, "kl": 0.00252532958984375, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 32163052.0, "reward": 0.5803571939468384, "reward_std": 0.19178561866283417, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 701.357177734375, "completions/mean_terminated_length": 575.629638671875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.47346938775510206, "grad_norm": 0.21722613275051117, "kl": 0.0023365020751953125, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 32341532.0, "reward": 0.5892857313156128, "reward_std": 0.18081346154212952, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 630.3125, "completions/mean_terminated_length": 534.9265747070312, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.47580174927113705, "grad_norm": 0.24759508669376373, "kl": 0.002536773681640625, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 32499930.0, "reward": 0.65625, "reward_std": 0.22350260615348816, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 650.1875, "completions/mean_terminated_length": 539.0322875976562, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.478134110787172, "grad_norm": 0.22896498441696167, "kl": 0.0021572113037109375, "learning_rate": 1e-06, "loss": 0.1047, "num_tokens": 32662868.0, "reward": 0.7500000596046448, "reward_std": 0.22289641201496124, "rewards/verify_math_reward/mean": 0.75, "rewards/verify_math_reward/std": 0.4339824914932251, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 553.357177734375, "completions/mean_terminated_length": 521.4414672851562, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.480466472303207, "grad_norm": 0.25822311639785767, "kl": 0.00254058837890625, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 32805700.0, "reward": 0.5625, "reward_std": 0.21222089231014252, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 534.2857666015625, "completions/mean_terminated_length": 518.3139038085938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.482798833819242, "grad_norm": 0.27964499592781067, "kl": 0.002460479736328125, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 32949964.0, "reward": 0.5848214626312256, "reward_std": 0.1318221390247345, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 671.84375, "completions/mean_terminated_length": 577.6008911132812, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.485131195335277, "grad_norm": 0.25014668703079224, "kl": 0.002231597900390625, "learning_rate": 1e-06, "loss": 0.0764, "num_tokens": 33119385.0, "reward": 0.629464328289032, "reward_std": 0.17720451951026917, "rewards/verify_math_reward/mean": 0.6294642686843872, "rewards/verify_math_reward/std": 0.4840298891067505, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 569.732177734375, "completions/mean_terminated_length": 537.9639892578125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.48746355685131193, "grad_norm": 0.23354767262935638, "kl": 0.0025787353515625, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 33268325.0, "reward": 0.59375, "reward_std": 0.22558963298797607, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 630.3928833007812, "completions/mean_terminated_length": 567.3817749023438, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.4897959183673469, "grad_norm": 0.2099696546792984, "kl": 0.00211334228515625, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 33431557.0, "reward": 0.5223214626312256, "reward_std": 0.20996278524398804, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 561.6027221679688, "completions/mean_terminated_length": 513.6244506835938, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.4921282798833819, "grad_norm": 0.2531992793083191, "kl": 0.0027008056640625, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 33577524.0, "reward": 0.59375, "reward_std": 0.20410579442977905, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3648.0, "completions/mean_length": 606.0045166015625, "completions/mean_terminated_length": 542.5499877929688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4944606413994169, "grad_norm": 0.25219160318374634, "kl": 0.002231597900390625, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 33735509.0, "reward": 0.535714328289032, "reward_std": 0.18067501485347748, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 456.4910888671875, "completions/mean_terminated_length": 440.1704406738281, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4967930029154519, "grad_norm": 0.30093586444854736, "kl": 0.002838134765625, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 33856723.0, "reward": 0.5848214626312256, "reward_std": 0.25687432289123535, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 526.7678833007812, "completions/mean_terminated_length": 526.7678833007812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.49912536443148686, "grad_norm": 0.26041379570961, "kl": 0.0023651123046875, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 33997551.0, "reward": 0.6071428656578064, "reward_std": 0.19209234416484833, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947933316230774, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 480.7857360839844, "completions/mean_terminated_length": 480.7857360839844, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5014577259475219, "grad_norm": 0.2899254858493805, "kl": 0.002529144287109375, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 34133663.0, "reward": 0.566964328289032, "reward_std": 0.257912814617157, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 589.6875, "completions/mean_terminated_length": 525.9363403320312, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5037900874635568, "grad_norm": 0.2395540475845337, "kl": 0.002971649169921875, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 34285385.0, "reward": 0.5401785969734192, "reward_std": 0.18909241259098053, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 560.075927734375, "completions/mean_terminated_length": 528.220703125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5061224489795918, "grad_norm": 0.2749766707420349, "kl": 0.00225067138671875, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 34435378.0, "reward": 0.6026785969734192, "reward_std": 0.19990073144435883, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395341873169, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3585.0, "completions/max_terminated_length": 3585.0, "completions/mean_length": 498.8214416503906, "completions/mean_terminated_length": 498.8214416503906, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5084548104956268, "grad_norm": 0.24174532294273376, "kl": 0.00290679931640625, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 34568954.0, "reward": 0.6428571939468384, "reward_std": 0.19178561866283417, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 482.2812805175781, "completions/mean_terminated_length": 482.2812805175781, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5107871720116618, "grad_norm": 0.3048853576183319, "kl": 0.002696990966796875, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 34701761.0, "reward": 0.5535714626312256, "reward_std": 0.2368713617324829, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823516607284546, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3880.0, "completions/mean_length": 559.6741333007812, "completions/mean_terminated_length": 527.8153076171875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5131195335276968, "grad_norm": 0.27453503012657166, "kl": 0.002105712890625, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 34850368.0, "reward": 0.6026785969734192, "reward_std": 0.21313384175300598, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 532.0089721679688, "completions/mean_terminated_length": 516.0269165039062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.5154518950437318, "grad_norm": 0.2646479308605194, "kl": 0.002300262451171875, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 34991802.0, "reward": 0.4821428656578064, "reward_std": 0.18727383017539978, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 648.75, "completions/mean_terminated_length": 570.045654296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5177842565597668, "grad_norm": 0.2515674829483032, "kl": 0.0022411346435546875, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 35164114.0, "reward": 0.5625, "reward_std": 0.2248506098985672, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 638.2902221679688, "completions/mean_terminated_length": 575.4227294921875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5201166180758018, "grad_norm": 0.2733094394207001, "kl": 0.002407073974609375, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 35327867.0, "reward": 0.4910714626312256, "reward_std": 0.283466100692749, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010398626327515, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 605.4241333007812, "completions/mean_terminated_length": 573.9774780273438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5224489795918368, "grad_norm": 0.25804582238197327, "kl": 0.002838134765625, "learning_rate": 1e-06, "loss": 0.0666, "num_tokens": 35484858.0, "reward": 0.5, "reward_std": 0.20997007191181183, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 652.5267944335938, "completions/mean_terminated_length": 605.7828369140625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5247813411078717, "grad_norm": 0.2812970280647278, "kl": 0.002105712890625, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 35654296.0, "reward": 0.5714285969734192, "reward_std": 0.2380882054567337, "rewards/verify_math_reward/mean": 0.5714285969734192, "rewards/verify_math_reward/std": 0.49597999453544617, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 481.5937805175781, "completions/mean_terminated_length": 465.38568115234375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5271137026239067, "grad_norm": 0.20803192257881165, "kl": 0.00357818603515625, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 35782853.0, "reward": 0.6071428656578064, "reward_std": 0.1554211974143982, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 588.8705444335938, "completions/mean_terminated_length": 525.1045532226562, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5294460641399417, "grad_norm": 0.2741509675979614, "kl": 0.002384185791015625, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 35934520.0, "reward": 0.5758928656578064, "reward_std": 0.20245832204818726, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 508.29913330078125, "completions/mean_terminated_length": 508.29913330078125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5317784256559767, "grad_norm": 0.27911996841430664, "kl": 0.002399444580078125, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 36071419.0, "reward": 0.598214328289032, "reward_std": 0.2072695791721344, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 582.4464721679688, "completions/mean_terminated_length": 550.7927856445312, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5341107871720117, "grad_norm": 0.2247956246137619, "kl": 0.002819061279296875, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 36222391.0, "reward": 0.5, "reward_std": 0.20576053857803345, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 564.6517944335938, "completions/mean_terminated_length": 548.816162109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5364431486880467, "grad_norm": 0.27169230580329895, "kl": 0.0021495819091796875, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 36369601.0, "reward": 0.5401785969734192, "reward_std": 0.2604703903198242, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 710.2366333007812, "completions/mean_terminated_length": 617.0504150390625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5387755102040817, "grad_norm": 0.26924261450767517, "kl": 0.00235748291015625, "learning_rate": 1e-06, "loss": 0.0774, "num_tokens": 36556686.0, "reward": 0.4464285969734192, "reward_std": 0.2368713617324829, "rewards/verify_math_reward/mean": 0.4464285671710968, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 518.2723388671875, "completions/mean_terminated_length": 469.7059020996094, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5411078717201167, "grad_norm": 0.2511337697505951, "kl": 0.00334930419921875, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 36693419.0, "reward": 0.7321428656578064, "reward_std": 0.17599493265151978, "rewards/verify_math_reward/mean": 0.7321428656578064, "rewards/verify_math_reward/std": 0.443834513425827, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 532.8214721679688, "completions/mean_terminated_length": 500.7207336425781, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.5434402332361516, "grad_norm": 0.264467716217041, "kl": 0.003246307373046875, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 36835603.0, "reward": 0.5625, "reward_std": 0.19990354776382446, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 549.9955444335938, "completions/mean_terminated_length": 518.049560546875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5457725947521865, "grad_norm": 0.20606504380702972, "kl": 0.002223968505859375, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 36986130.0, "reward": 0.5401785969734192, "reward_std": 0.12640023231506348, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 567.9642944335938, "completions/mean_terminated_length": 503.81817626953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5481049562682215, "grad_norm": 0.22268931567668915, "kl": 0.00246429443359375, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 37134954.0, "reward": 0.5892857313156128, "reward_std": 0.18727383017539978, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652678012848, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 600.4375, "completions/mean_terminated_length": 552.9864501953125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5504373177842565, "grad_norm": 0.2422136813402176, "kl": 0.0020904541015625, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 37291836.0, "reward": 0.65625, "reward_std": 0.19929736852645874, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 525.3973388671875, "completions/mean_terminated_length": 525.3973388671875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5527696793002915, "grad_norm": 0.273456335067749, "kl": 0.002994537353515625, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 37433365.0, "reward": 0.5267857313156128, "reward_std": 0.237474724650383, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 543.9553833007812, "completions/mean_terminated_length": 495.7375793457031, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5551020408163265, "grad_norm": 0.2322549819946289, "kl": 0.002719879150390625, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 37579435.0, "reward": 0.6205357313156128, "reward_std": 0.12535615265369415, "rewards/verify_math_reward/mean": 0.6205357313156128, "rewards/verify_math_reward/std": 0.4863404929637909, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 577.625, "completions/mean_terminated_length": 545.9279174804688, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5574344023323615, "grad_norm": 0.2547401487827301, "kl": 0.0025634765625, "learning_rate": 1e-06, "loss": -0.0182, "num_tokens": 37727943.0, "reward": 0.6026785969734192, "reward_std": 0.2057577222585678, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395341873169, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 561.0045166015625, "completions/mean_terminated_length": 529.1576538085938, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5597667638483965, "grad_norm": 0.23458260297775269, "kl": 0.002529144287109375, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 37871432.0, "reward": 0.5848214626312256, "reward_std": 0.2183874398469925, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 724.4910888671875, "completions/mean_terminated_length": 647.5159301757812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5620991253644315, "grad_norm": 0.18068929016590118, "kl": 0.00234222412109375, "learning_rate": 1e-06, "loss": 0.0448, "num_tokens": 38057230.0, "reward": 0.4508928656578064, "reward_std": 0.19404374063014984, "rewards/verify_math_reward/mean": 0.4508928656578064, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 589.4642944335938, "completions/mean_terminated_length": 541.8642578125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5644314868804665, "grad_norm": 0.26205113530158997, "kl": 0.002826690673828125, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 38209038.0, "reward": 0.5535714626312256, "reward_std": 0.20349960029125214, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3522.0, "completions/mean_length": 674.1830444335938, "completions/mean_terminated_length": 611.9681396484375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5667638483965015, "grad_norm": 0.249900683760643, "kl": 0.002323150634765625, "learning_rate": 1e-06, "loss": 0.0654, "num_tokens": 38382015.0, "reward": 0.4151785969734192, "reward_std": 0.29488059878349304, "rewards/verify_math_reward/mean": 0.4151785671710968, "rewards/verify_math_reward/std": 0.49385643005371094, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3973.0, "completions/mean_length": 582.9107666015625, "completions/mean_terminated_length": 551.2612915039062, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5690962099125364, "grad_norm": 0.26367759704589844, "kl": 0.0029754638671875, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 38531371.0, "reward": 0.6160714626312256, "reward_std": 0.24980218708515167, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.48743006587028503, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3629.0, "completions/mean_length": 738.1785888671875, "completions/mean_terminated_length": 613.8148193359375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5714285714285714, "grad_norm": 0.24815772473812103, "kl": 0.002346038818359375, "learning_rate": 1e-06, "loss": 0.105, "num_tokens": 38718659.0, "reward": 0.598214328289032, "reward_std": 0.2439451813697815, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 716.2142944335938, "completions/mean_terminated_length": 685.7658081054688, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5737609329446064, "grad_norm": 0.23341864347457886, "kl": 0.002655029296875, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 38900827.0, "reward": 0.4062500298023224, "reward_std": 0.2738363444805145, "rewards/verify_math_reward/mean": 0.40625, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 543.9464721679688, "completions/mean_terminated_length": 528.0179443359375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5760932944606414, "grad_norm": 0.30561524629592896, "kl": 0.003040313720703125, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 39042239.0, "reward": 0.5803571939468384, "reward_std": 0.303001344203949, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 673.8482666015625, "completions/mean_terminated_length": 643.0180053710938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5784256559766764, "grad_norm": 0.22902356088161469, "kl": 0.002460479736328125, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 39213909.0, "reward": 0.5535714626312256, "reward_std": 0.20770913362503052, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 721.8035888671875, "completions/mean_terminated_length": 612.95849609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5807580174927114, "grad_norm": 0.216725155711174, "kl": 0.002513885498046875, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 39393785.0, "reward": 0.4107142984867096, "reward_std": 0.21569423377513885, "rewards/verify_math_reward/mean": 0.4107142984867096, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 612.9866333007812, "completions/mean_terminated_length": 581.6080932617188, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5830903790087464, "grad_norm": 0.23955409228801727, "kl": 0.00269317626953125, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 39548894.0, "reward": 0.6473214626312256, "reward_std": 0.21808069944381714, "rewards/verify_math_reward/mean": 0.6473214030265808, "rewards/verify_math_reward/std": 0.4788738489151001, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 499.0714416503906, "completions/mean_terminated_length": 482.9417419433594, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5854227405247814, "grad_norm": 0.27811646461486816, "kl": 0.00293731689453125, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 39681006.0, "reward": 0.691964328289032, "reward_std": 0.2627313733100891, "rewards/verify_math_reward/mean": 0.6919642686843872, "rewards/verify_math_reward/std": 0.46271538734436035, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 554.1875, "completions/mean_terminated_length": 506.1086120605469, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5877551020408164, "grad_norm": 0.22807757556438446, "kl": 0.002674102783203125, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 39827488.0, "reward": 0.59375, "reward_std": 0.15842114388942719, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 615.3839721679688, "completions/mean_terminated_length": 584.0270385742188, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5900874635568513, "grad_norm": 0.26391544938087463, "kl": 0.002716064453125, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 39987126.0, "reward": 0.5892857313156128, "reward_std": 0.22289641201496124, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 622.8125, "completions/mean_terminated_length": 575.6651611328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5924198250728863, "grad_norm": 0.2827799916267395, "kl": 0.00335693359375, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 40152724.0, "reward": 0.5758928656578064, "reward_std": 0.21356894075870514, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3665.0, "completions/max_terminated_length": 3665.0, "completions/mean_length": 537.0223388671875, "completions/mean_terminated_length": 537.0223388671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5947521865889213, "grad_norm": 0.3180239200592041, "kl": 0.00290679931640625, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 40297233.0, "reward": 0.6428571939468384, "reward_std": 0.26243188977241516, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 553.2366333007812, "completions/mean_terminated_length": 537.3497924804688, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5970845481049563, "grad_norm": 0.28264546394348145, "kl": 0.002605438232421875, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 40438830.0, "reward": 0.6517857313156128, "reward_std": 0.2284567803144455, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 592.5357666015625, "completions/mean_terminated_length": 560.9729614257812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5994169096209913, "grad_norm": 0.27566614747047424, "kl": 0.00299072265625, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 40599854.0, "reward": 0.5267857313156128, "reward_std": 0.1963018774986267, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 548.1517944335938, "completions/mean_terminated_length": 532.2421875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6017492711370263, "grad_norm": 0.26068398356437683, "kl": 0.0031280517578125, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 40743064.0, "reward": 0.6026785969734192, "reward_std": 0.22935959696769714, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 590.7142944335938, "completions/mean_terminated_length": 559.1351318359375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6040816326530613, "grad_norm": 0.2197912037372589, "kl": 0.00249481201171875, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 40902896.0, "reward": 0.566964328289032, "reward_std": 0.13737240433692932, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3635.0, "completions/mean_length": 699.9553833007812, "completions/mean_terminated_length": 669.3603515625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6064139941690962, "grad_norm": 0.18691788613796234, "kl": 0.0020694732666015625, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 41081134.0, "reward": 0.4464285969734192, "reward_std": 0.13707295060157776, "rewards/verify_math_reward/mean": 0.4464285671710968, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 524.2366333007812, "completions/mean_terminated_length": 492.0585632324219, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6087463556851312, "grad_norm": 0.2733798921108246, "kl": 0.002613067626953125, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 41223843.0, "reward": 0.65625, "reward_std": 0.1833682507276535, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 517.8705444335938, "completions/mean_terminated_length": 485.6351318359375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6110787172011661, "grad_norm": 0.2720491886138916, "kl": 0.003376007080078125, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 41362726.0, "reward": 0.5758928656578064, "reward_std": 0.178862065076828, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 575.1473388671875, "completions/mean_terminated_length": 559.3587646484375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6134110787172011, "grad_norm": 0.20130468904972076, "kl": 0.00251007080078125, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 41513887.0, "reward": 0.6116071939468384, "reward_std": 0.1460937112569809, "rewards/verify_math_reward/mean": 0.6116071343421936, "rewards/verify_math_reward/std": 0.4884762763977051, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 657.3080444335938, "completions/mean_terminated_length": 578.799072265625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6157434402332361, "grad_norm": 0.267302542924881, "kl": 0.00292205810546875, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 41689172.0, "reward": 0.4642857313156128, "reward_std": 0.23192447423934937, "rewards/verify_math_reward/mean": 0.4642857015132904, "rewards/verify_math_reward/std": 0.49983978271484375, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049107142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 787.3170166015625, "completions/mean_terminated_length": 616.446044921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6180758017492711, "grad_norm": 0.17197687923908234, "kl": 0.0020236968994140625, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 41890427.0, "reward": 0.4196428656578064, "reward_std": 0.16488432884216309, "rewards/verify_math_reward/mean": 0.4196428656578064, "rewards/verify_math_reward/std": 0.49460577964782715, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 657.7098388671875, "completions/mean_terminated_length": 626.7342529296875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6204081632653061, "grad_norm": 0.22111672163009644, "kl": 0.00244903564453125, "learning_rate": 1e-06, "loss": 0.0448, "num_tokens": 42057090.0, "reward": 0.5580357313156128, "reward_std": 0.22558964788913727, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 635.0402221679688, "completions/mean_terminated_length": 619.5202026367188, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6227405247813411, "grad_norm": 0.23865652084350586, "kl": 0.00266265869140625, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 42220995.0, "reward": 0.5758928656578064, "reward_std": 0.25010165572166443, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135848045349, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 631.4598388671875, "completions/mean_terminated_length": 600.2477416992188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6250728862973761, "grad_norm": 0.2775896489620209, "kl": 0.002925872802734375, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 42392226.0, "reward": 0.53125, "reward_std": 0.2534010410308838, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3737.0, "completions/mean_length": 638.1160888671875, "completions/mean_terminated_length": 575.2454223632812, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6274052478134111, "grad_norm": 0.17287638783454895, "kl": 0.00275421142578125, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 42555212.0, "reward": 0.5133928656578064, "reward_std": 0.11047111451625824, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3778.0, "completions/mean_length": 484.3839416503906, "completions/mean_terminated_length": 468.1883544921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.6297376093294461, "grad_norm": 0.29258185625076294, "kl": 0.003673553466796875, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 42683554.0, "reward": 0.6517857313156128, "reward_std": 0.2137400209903717, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 626.1473388671875, "completions/mean_terminated_length": 579.0452880859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.632069970845481, "grad_norm": 0.22278255224227905, "kl": 0.00243377685546875, "learning_rate": 1e-06, "loss": 0.0514, "num_tokens": 42847131.0, "reward": 0.5625, "reward_std": 0.18471625447273254, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 619.0625, "completions/mean_terminated_length": 539.6803588867188, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.634402332361516, "grad_norm": 0.3052447736263275, "kl": 0.00273895263671875, "learning_rate": 1e-06, "loss": 0.051, "num_tokens": 43006217.0, "reward": 0.4419642984867096, "reward_std": 0.24650557339191437, "rewards/verify_math_reward/mean": 0.4419642984867096, "rewards/verify_math_reward/std": 0.4977326989173889, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 533.75, "completions/mean_terminated_length": 485.3936767578125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.636734693877551, "grad_norm": 0.28764647245407104, "kl": 0.00301361083984375, "learning_rate": 1e-06, "loss": 0.0796, "num_tokens": 43146537.0, "reward": 0.7142857313156128, "reward_std": 0.26859113574028015, "rewards/verify_math_reward/mean": 0.7142857313156128, "rewards/verify_math_reward/std": 0.45276570320129395, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 720.232177734375, "completions/mean_terminated_length": 643.1597900390625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.639067055393586, "grad_norm": 0.24549873173236847, "kl": 0.002490997314453125, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 43333597.0, "reward": 0.4821428656578064, "reward_std": 0.2582167685031891, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 677.21875, "completions/mean_terminated_length": 646.4189453125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.641399416909621, "grad_norm": 0.2859312891960144, "kl": 0.002758026123046875, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 43511214.0, "reward": 0.4821428656578064, "reward_std": 0.267547070980072, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 650.169677734375, "completions/mean_terminated_length": 571.4976806640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.643731778425656, "grad_norm": 0.2649756968021393, "kl": 0.00237274169921875, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 43674812.0, "reward": 0.504464328289032, "reward_std": 0.26828160881996155, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3250.0, "completions/mean_length": 633.6116333007812, "completions/mean_terminated_length": 570.6590576171875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.646064139941691, "grad_norm": 0.24528780579566956, "kl": 0.00266265869140625, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 43842685.0, "reward": 0.535714328289032, "reward_std": 0.20710574090480804, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 696.1116333007812, "completions/mean_terminated_length": 586.4378051757812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.648396501457726, "grad_norm": 0.2225281298160553, "kl": 0.00244903564453125, "learning_rate": 1e-06, "loss": 0.125, "num_tokens": 44019342.0, "reward": 0.5446428656578064, "reward_std": 0.24498368799686432, "rewards/verify_math_reward/mean": 0.5446428656578064, "rewards/verify_math_reward/std": 0.4991183280944824, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 534.71875, "completions/mean_terminated_length": 518.7489013671875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.650728862973761, "grad_norm": 0.29402709007263184, "kl": 0.002429962158203125, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 44159327.0, "reward": 0.6383928656578064, "reward_std": 0.25851622223854065, "rewards/verify_math_reward/mean": 0.6383928656578064, "rewards/verify_math_reward/std": 0.4815419018268585, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 758.3527221679688, "completions/mean_terminated_length": 697.6681518554688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6530612244897959, "grad_norm": 0.23811939358711243, "kl": 0.00225067138671875, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 44352358.0, "reward": 0.5178571939468384, "reward_std": 0.26181843876838684, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 693.1964721679688, "completions/mean_terminated_length": 662.54052734375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6553935860058309, "grad_norm": 0.21368886530399323, "kl": 0.0027618408203125, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 44530322.0, "reward": 0.5133928656578064, "reward_std": 0.1847190409898758, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 541.0267944335938, "completions/mean_terminated_length": 509.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6577259475218659, "grad_norm": 0.2308286875486374, "kl": 0.00319671630859375, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 44674448.0, "reward": 0.535714328289032, "reward_std": 0.18727384507656097, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 573.03125, "completions/mean_terminated_length": 525.2081909179688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6600583090379009, "grad_norm": 0.30643728375434875, "kl": 0.002544403076171875, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 44833479.0, "reward": 0.535714328289032, "reward_std": 0.24017076194286346, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 569.8170166015625, "completions/mean_terminated_length": 538.049560546875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6623906705539359, "grad_norm": 0.2745210528373718, "kl": 0.002323150634765625, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 44984406.0, "reward": 0.59375, "reward_std": 0.21282710134983063, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3362.0, "completions/mean_length": 718.7188110351562, "completions/mean_terminated_length": 641.61181640625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6647230320699709, "grad_norm": 0.24607455730438232, "kl": 0.002429962158203125, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 45171231.0, "reward": 0.4285714626312256, "reward_std": 0.2607799470424652, "rewards/verify_math_reward/mean": 0.4285714328289032, "rewards/verify_math_reward/std": 0.49597999453544617, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 590.4598388671875, "completions/mean_terminated_length": 510.42462158203125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6670553935860059, "grad_norm": 0.2717443108558655, "kl": 0.002529144287109375, "learning_rate": 1e-06, "loss": 0.0469, "num_tokens": 45327966.0, "reward": 0.4955357313156128, "reward_std": 0.23431093990802765, "rewards/verify_math_reward/mean": 0.4955357015132904, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 705.1517944335938, "completions/mean_terminated_length": 611.82568359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6693877551020408, "grad_norm": 0.237776979804039, "kl": 0.002727508544921875, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 45510584.0, "reward": 0.5, "reward_std": 0.2724939286708832, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 553.53125, "completions/mean_terminated_length": 505.4434509277344, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6717201166180758, "grad_norm": 0.309678316116333, "kl": 0.00322723388671875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 45657047.0, "reward": 0.625, "reward_std": 0.24754685163497925, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3109.0, "completions/mean_length": 723.4241333007812, "completions/mean_terminated_length": 646.4246215820312, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6740524781341107, "grad_norm": 0.23280808329582214, "kl": 0.002765655517578125, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 45838278.0, "reward": 0.5535714626312256, "reward_std": 0.23626233637332916, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3816.0, "completions/mean_length": 599.28125, "completions/mean_terminated_length": 583.6009521484375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6763848396501457, "grad_norm": 0.17826339602470398, "kl": 0.002777099609375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 45998789.0, "reward": 0.5580357313156128, "reward_std": 0.1303030252456665, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.49773266911506653, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 504.169677734375, "completions/mean_terminated_length": 504.169677734375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.6787172011661807, "grad_norm": 0.24889233708381653, "kl": 0.003597259521484375, "learning_rate": 1e-06, "loss": -0.0147, "num_tokens": 46141163.0, "reward": 0.535714328289032, "reward_std": 0.14835184812545776, "rewards/verify_math_reward/mean": 0.5357142686843872, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 659.8928833007812, "completions/mean_terminated_length": 597.4181518554688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6810495626822157, "grad_norm": 0.19757574796676636, "kl": 0.00254058837890625, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 46310859.0, "reward": 0.5178571939468384, "reward_std": 0.15165123343467712, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 636.3973388671875, "completions/mean_terminated_length": 573.4954223632812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6833819241982507, "grad_norm": 0.3277430832386017, "kl": 0.00293731689453125, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 46474092.0, "reward": 0.5758928656578064, "reward_std": 0.2669408917427063, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135848045349, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3226.0, "completions/mean_length": 819.4285888671875, "completions/mean_terminated_length": 682.269775390625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6857142857142857, "grad_norm": 0.19413602352142334, "kl": 0.0022869110107421875, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 46681548.0, "reward": 0.5491071939468384, "reward_std": 0.22033601999282837, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 756.1250610351562, "completions/mean_terminated_length": 632.4259033203125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6880466472303207, "grad_norm": 0.31327852606773376, "kl": 0.00290679931640625, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 46868584.0, "reward": 0.4821428656578064, "reward_std": 0.16457758843898773, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 593.2455444335938, "completions/mean_terminated_length": 545.6968383789062, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6903790087463557, "grad_norm": 0.1804625689983368, "kl": 0.00264739990234375, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 47028927.0, "reward": 0.598214328289032, "reward_std": 0.15268969535827637, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3367.0, "completions/mean_length": 665.0982666015625, "completions/mean_terminated_length": 602.7181396484375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6927113702623907, "grad_norm": 0.3225614130496979, "kl": 0.00260162353515625, "learning_rate": 1e-06, "loss": 0.0963, "num_tokens": 47197613.0, "reward": 0.5580357313156128, "reward_std": 0.25821956992149353, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.49773266911506653, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 577.0089721679688, "completions/mean_terminated_length": 545.3063354492188, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6950437317784257, "grad_norm": 0.2598956227302551, "kl": 0.00286865234375, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 47347927.0, "reward": 0.65625, "reward_std": 0.2197326421737671, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 647.575927734375, "completions/mean_terminated_length": 600.7647094726562, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6973760932944606, "grad_norm": 0.2414235770702362, "kl": 0.002384185791015625, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 47517312.0, "reward": 0.5758928656578064, "reward_std": 0.19794653356075287, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 754.2098388671875, "completions/mean_terminated_length": 677.9132080078125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6997084548104956, "grad_norm": 0.21052268147468567, "kl": 0.002254486083984375, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 47706223.0, "reward": 0.424107164144516, "reward_std": 0.19886226952075958, "rewards/verify_math_reward/mean": 0.4241071343421936, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 823.9241333007812, "completions/mean_terminated_length": 702.7361450195312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7020408163265306, "grad_norm": 0.25020062923431396, "kl": 0.002361297607421875, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 47911286.0, "reward": 0.5, "reward_std": 0.2582167387008667, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 661.7098388671875, "completions/mean_terminated_length": 599.2681884765625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7043731778425656, "grad_norm": 0.26508408784866333, "kl": 0.00241851806640625, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 48084709.0, "reward": 0.5580357313156128, "reward_std": 0.27956050634384155, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.49773266911506653, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 568.4420166015625, "completions/mean_terminated_length": 568.4420166015625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7067055393586006, "grad_norm": 0.29956838488578796, "kl": 0.002964019775390625, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 48236312.0, "reward": 0.5848214626312256, "reward_std": 0.2842051088809967, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385643005371094, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 661.4285888671875, "completions/mean_terminated_length": 550.6359252929688, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7090379008746356, "grad_norm": 0.26300880312919617, "kl": 0.002956390380859375, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 48407736.0, "reward": 0.486607164144516, "reward_std": 0.24168705940246582, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 613.2053833007812, "completions/mean_terminated_length": 565.9276123046875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.7113702623906706, "grad_norm": 0.30458030104637146, "kl": 0.002872467041015625, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 48567294.0, "reward": 0.6517857313156128, "reward_std": 0.24949544668197632, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 733.950927734375, "completions/mean_terminated_length": 625.4976806640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7137026239067056, "grad_norm": 0.2270134836435318, "kl": 0.002635955810546875, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 48755403.0, "reward": 0.4642857313156128, "reward_std": 0.24077412486076355, "rewards/verify_math_reward/mean": 0.4642857015132904, "rewards/verify_math_reward/std": 0.49983981251716614, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 679.65625, "completions/mean_terminated_length": 633.2805786132812, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7160349854227406, "grad_norm": 0.277005672454834, "kl": 0.0029144287109375, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 48933398.0, "reward": 0.4821428656578064, "reward_std": 0.2250189185142517, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 521.34375, "completions/mean_terminated_length": 489.1396484375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7183673469387755, "grad_norm": 0.28764063119888306, "kl": 0.0028839111328125, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 49070147.0, "reward": 0.660714328289032, "reward_std": 0.24680502712726593, "rewards/verify_math_reward/mean": 0.6607142686843872, "rewards/verify_math_reward/std": 0.4745272994041443, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 656.3527221679688, "completions/mean_terminated_length": 528.9583129882812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7206997084548105, "grad_norm": 0.27909383177757263, "kl": 0.0028228759765625, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 49238482.0, "reward": 0.6071428656578064, "reward_std": 0.20636393129825592, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 632.8616333007812, "completions/mean_terminated_length": 601.6621704101562, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7230320699708455, "grad_norm": 0.22965008020401, "kl": 0.00463104248046875, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 49401211.0, "reward": 0.5892857313156128, "reward_std": 0.1590273380279541, "rewards/verify_math_reward/mean": 0.5892857313156128, "rewards/verify_math_reward/std": 0.4930652976036072, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 709.6830444335938, "completions/mean_terminated_length": 648.1135864257812, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.7253644314868805, "grad_norm": 0.23763540387153625, "kl": 0.002582550048828125, "learning_rate": 1e-06, "loss": 0.0599, "num_tokens": 49584460.0, "reward": 0.5089285969734192, "reward_std": 0.26363420486450195, "rewards/verify_math_reward/mean": 0.5089285969734192, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 564.5714721679688, "completions/mean_terminated_length": 532.7567749023438, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7276967930029155, "grad_norm": 0.2912660837173462, "kl": 0.00273895263671875, "learning_rate": 1e-06, "loss": 0.0439, "num_tokens": 49730276.0, "reward": 0.6696428656578064, "reward_std": 0.2119242548942566, "rewards/verify_math_reward/mean": 0.6696428656578064, "rewards/verify_math_reward/std": 0.4713950753211975, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 503.732177734375, "completions/mean_terminated_length": 487.62335205078125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7300291545189505, "grad_norm": 0.32089799642562866, "kl": 0.0036163330078125, "learning_rate": 1e-06, "loss": 0.046, "num_tokens": 49867160.0, "reward": 0.6428571939468384, "reward_std": 0.2790871262550354, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023056983947754, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 626.1295166015625, "completions/mean_terminated_length": 594.869384765625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7323615160349854, "grad_norm": 0.27264782786369324, "kl": 0.002834320068359375, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 50027325.0, "reward": 0.566964328289032, "reward_std": 0.2857242226600647, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.4966052174568176, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 570.0848388671875, "completions/mean_terminated_length": 554.2735595703125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7346938775510204, "grad_norm": 0.2196282595396042, "kl": 0.00276947021484375, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 50177688.0, "reward": 0.65625, "reward_std": 0.18202301859855652, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3677.0, "completions/mean_length": 708.263427734375, "completions/mean_terminated_length": 598.9815673828125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7370262390670554, "grad_norm": 0.23442630469799042, "kl": 0.0024261474609375, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 50358803.0, "reward": 0.5401785969734192, "reward_std": 0.19929735362529755, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949929118156433, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3688.0, "completions/mean_length": 636.9330444335938, "completions/mean_terminated_length": 574.0408935546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7393586005830903, "grad_norm": 0.29054102301597595, "kl": 0.00304412841796875, "learning_rate": 1e-06, "loss": 0.0479, "num_tokens": 50523668.0, "reward": 0.5178571939468384, "reward_std": 0.3070724606513977, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3165.0, "completions/mean_length": 657.9642944335938, "completions/mean_terminated_length": 579.4702758789062, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7416909620991253, "grad_norm": 0.23028461635112762, "kl": 0.002498626708984375, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 50692412.0, "reward": 0.6428571939468384, "reward_std": 0.27743518352508545, "rewards/verify_math_reward/mean": 0.6428571343421936, "rewards/verify_math_reward/std": 0.48023054003715515, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3843.0, "completions/mean_length": 597.375, "completions/mean_terminated_length": 581.6861572265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7440233236151603, "grad_norm": 0.22642745077610016, "kl": 0.00305938720703125, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 50855136.0, "reward": 0.5267857313156128, "reward_std": 0.17720730602741241, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 528.3214721679688, "completions/mean_terminated_length": 496.1802062988281, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7463556851311953, "grad_norm": 0.30777016282081604, "kl": 0.004055023193359375, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 50991064.0, "reward": 0.6696428656578064, "reward_std": 0.2718849182128906, "rewards/verify_math_reward/mean": 0.6696428656578064, "rewards/verify_math_reward/std": 0.4713950753211975, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 614.4642944335938, "completions/mean_terminated_length": 598.85205078125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.7486880466472303, "grad_norm": 0.2332543432712555, "kl": 0.003437042236328125, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 51152456.0, "reward": 0.5267857313156128, "reward_std": 0.220338836312294, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 670.6964721679688, "completions/mean_terminated_length": 608.4181518554688, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7510204081632653, "grad_norm": 0.24177144467830658, "kl": 0.00246429443359375, "learning_rate": 1e-06, "loss": 0.0426, "num_tokens": 51326092.0, "reward": 0.6517857313156128, "reward_std": 0.2458893060684204, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 790.2813110351562, "completions/mean_terminated_length": 760.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7533527696793003, "grad_norm": 0.18753404915332794, "kl": 0.002582550048828125, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 51530483.0, "reward": 0.4062500298023224, "reward_std": 0.22094503045082092, "rewards/verify_math_reward/mean": 0.40625, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 565.5848388671875, "completions/mean_terminated_length": 501.39544677734375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.7556851311953353, "grad_norm": 0.21227282285690308, "kl": 0.00350189208984375, "learning_rate": 1e-06, "loss": 0.0346, "num_tokens": 51678470.0, "reward": 0.625, "reward_std": 0.12596234679222107, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 611.0982666015625, "completions/mean_terminated_length": 563.7918701171875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7580174927113703, "grad_norm": 0.20451150834560394, "kl": 0.002536773681640625, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 51841772.0, "reward": 0.6116071939468384, "reward_std": 0.1237042024731636, "rewards/verify_math_reward/mean": 0.6116071343421936, "rewards/verify_math_reward/std": 0.4884762763977051, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 499.1562805175781, "completions/mean_terminated_length": 450.330322265625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7603498542274052, "grad_norm": 0.3030953109264374, "kl": 0.002872467041015625, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 51969783.0, "reward": 0.6830357313156128, "reward_std": 0.24003510177135468, "rewards/verify_math_reward/mean": 0.6830357313156128, "rewards/verify_math_reward/std": 0.4663354456424713, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 841.8482666015625, "completions/mean_terminated_length": 689.7850341796875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7626822157434402, "grad_norm": 0.1860758662223816, "kl": 0.00211334228515625, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 52182957.0, "reward": 0.4464285969734192, "reward_std": 0.23235955834388733, "rewards/verify_math_reward/mean": 0.4464285671710968, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 844.6607666015625, "completions/mean_terminated_length": 577.6425170898438, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7650145772594752, "grad_norm": 0.24089130759239197, "kl": 0.002498626708984375, "learning_rate": 1e-06, "loss": 0.0652, "num_tokens": 52399113.0, "reward": 0.5625, "reward_std": 0.20320294797420502, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 601.3705444335938, "completions/mean_terminated_length": 585.6995849609375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7673469387755102, "grad_norm": 0.2542576491832733, "kl": 0.002681732177734375, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 52560980.0, "reward": 0.6160714626312256, "reward_std": 0.22771494090557098, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.48743006587028503, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3245.0, "completions/mean_length": 604.0357666015625, "completions/mean_terminated_length": 556.6334838867188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7696793002915452, "grad_norm": 0.2152654528617859, "kl": 0.0027008056640625, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 52714860.0, "reward": 0.6875000596046448, "reward_std": 0.1979493498802185, "rewards/verify_math_reward/mean": 0.6875, "rewards/verify_math_reward/std": 0.4645504951477051, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 520.0357666015625, "completions/mean_terminated_length": 504.0000305175781, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7720116618075802, "grad_norm": 0.15844669938087463, "kl": 0.003021240234375, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 52849452.0, "reward": 0.6830357313156128, "reward_std": 0.1124253123998642, "rewards/verify_math_reward/mean": 0.6830357313156128, "rewards/verify_math_reward/std": 0.4663354754447937, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 697.1160888671875, "completions/mean_terminated_length": 635.3181762695312, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7743440233236152, "grad_norm": 0.230719193816185, "kl": 0.002788543701171875, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 53026982.0, "reward": 0.5178571939468384, "reward_std": 0.23235955834388733, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 540.232177734375, "completions/mean_terminated_length": 508.1982116699219, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7766763848396502, "grad_norm": 0.31468960642814636, "kl": 0.003276824951171875, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 53170906.0, "reward": 0.5, "reward_std": 0.2818186283111572, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 800.7857666015625, "completions/mean_terminated_length": 678.74072265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7790087463556852, "grad_norm": 0.19736206531524658, "kl": 0.0021724700927734375, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 53369754.0, "reward": 0.4508928656578064, "reward_std": 0.17525030672550201, "rewards/verify_math_reward/mean": 0.4508928656578064, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 694.9420166015625, "completions/mean_terminated_length": 585.2304077148438, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7813411078717201, "grad_norm": 0.24868535995483398, "kl": 0.002613067626953125, "learning_rate": 1e-06, "loss": 0.0726, "num_tokens": 53552533.0, "reward": 0.5223214626312256, "reward_std": 0.287075012922287, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 700.9688110351562, "completions/mean_terminated_length": 639.2409057617188, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7836734693877551, "grad_norm": 0.1996821165084839, "kl": 0.002552032470703125, "learning_rate": 1e-06, "loss": 0.0374, "num_tokens": 53735670.0, "reward": 0.4151785969734192, "reward_std": 0.19043760001659393, "rewards/verify_math_reward/mean": 0.4151785671710968, "rewards/verify_math_reward/std": 0.49385643005371094, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 610.6428833007812, "completions/mean_terminated_length": 579.2432861328125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7860058309037901, "grad_norm": 0.28694623708724976, "kl": 0.003108978271484375, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 53892774.0, "reward": 0.6741071939468384, "reward_std": 0.24424465000629425, "rewards/verify_math_reward/mean": 0.6741071343421936, "rewards/verify_math_reward/std": 0.46975722908973694, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 700.9553833007812, "completions/mean_terminated_length": 607.5137329101562, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7883381924198251, "grad_norm": 0.17995084822177887, "kl": 0.00258636474609375, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 54071436.0, "reward": 0.4776785969734192, "reward_std": 0.09919221699237823, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 565.1473388671875, "completions/mean_terminated_length": 533.3378295898438, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.79067055393586, "grad_norm": 0.2467787116765976, "kl": 0.00278472900390625, "learning_rate": 1e-06, "loss": 0.0555, "num_tokens": 54225077.0, "reward": 0.5267857313156128, "reward_std": 0.22228743135929108, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 630.3035888671875, "completions/mean_terminated_length": 551.1780395507812, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.793002915451895, "grad_norm": 0.2259238064289093, "kl": 0.003124237060546875, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 54386977.0, "reward": 0.5178571939468384, "reward_std": 0.20306451618671417, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 512.0982666015625, "completions/mean_terminated_length": 512.0982666015625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.79533527696793, "grad_norm": 0.27471014857292175, "kl": 0.002613067626953125, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 54518279.0, "reward": 0.6875000596046448, "reward_std": 0.17300227284431458, "rewards/verify_math_reward/mean": 0.6875, "rewards/verify_math_reward/std": 0.4645504951477051, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 499.3482360839844, "completions/mean_terminated_length": 499.3482360839844, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.797667638483965, "grad_norm": 0.2643197178840637, "kl": 0.00286102294921875, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 54655029.0, "reward": 0.5491071939468384, "reward_std": 0.23191718757152557, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 708.0357666015625, "completions/mean_terminated_length": 646.4363403320312, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8, "grad_norm": 0.18370066583156586, "kl": 0.00308990478515625, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 54831797.0, "reward": 0.6696428656578064, "reward_std": 0.20320293307304382, "rewards/verify_math_reward/mean": 0.6696428656578064, "rewards/verify_math_reward/std": 0.4713950753211975, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 681.419677734375, "completions/mean_terminated_length": 666.107666015625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8023323615160349, "grad_norm": 0.1940321922302246, "kl": 0.0029296875, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 55011643.0, "reward": 0.486607164144516, "reward_std": 0.1460937112569809, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 555.7545166015625, "completions/mean_terminated_length": 555.7545166015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8046647230320699, "grad_norm": 0.23973211646080017, "kl": 0.00304412841796875, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 55158476.0, "reward": 0.625, "reward_std": 0.18397442996501923, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 552.4642944335938, "completions/mean_terminated_length": 552.4642944335938, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8069970845481049, "grad_norm": 0.2807849049568176, "kl": 0.00318145751953125, "learning_rate": 1e-06, "loss": 0.0363, "num_tokens": 55302796.0, "reward": 0.5803571939468384, "reward_std": 0.2693273723125458, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 465.2500305175781, "completions/mean_terminated_length": 448.9686279296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8093294460641399, "grad_norm": 0.255930095911026, "kl": 0.0030059814453125, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 55422732.0, "reward": 0.6517857313156128, "reward_std": 0.19057324528694153, "rewards/verify_math_reward/mean": 0.6517857313156128, "rewards/verify_math_reward/std": 0.47747132182121277, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 597.4464721679688, "completions/mean_terminated_length": 533.8363647460938, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8116618075801749, "grad_norm": 0.24181298911571503, "kl": 0.0027923583984375, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 55578592.0, "reward": 0.4910714626312256, "reward_std": 0.1842811554670334, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 526.0670166015625, "completions/mean_terminated_length": 477.6063537597656, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8139941690962099, "grad_norm": 0.2987842261791229, "kl": 0.00356292724609375, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 55714367.0, "reward": 0.6473214626312256, "reward_std": 0.25535523891448975, "rewards/verify_math_reward/mean": 0.6473214030265808, "rewards/verify_math_reward/std": 0.4788738191127777, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3106.0, "completions/mean_length": 552.8705444335938, "completions/mean_terminated_length": 520.950439453125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8163265306122449, "grad_norm": 0.238222137093544, "kl": 0.003093719482421875, "learning_rate": 1e-06, "loss": 0.0383, "num_tokens": 55858914.0, "reward": 0.5803571939468384, "reward_std": 0.20966331660747528, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460577964782715, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 685.4152221679688, "completions/mean_terminated_length": 607.5479125976562, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8186588921282799, "grad_norm": 0.2661639153957367, "kl": 0.0029754638671875, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 56033735.0, "reward": 0.5, "reward_std": 0.24229323863983154, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 544.5089721679688, "completions/mean_terminated_length": 479.93634033203125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8209912536443149, "grad_norm": 0.2590837776660919, "kl": 0.0030975341796875, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 56175849.0, "reward": 0.7098214626312256, "reward_std": 0.19538895785808563, "rewards/verify_math_reward/mean": 0.7098214030265808, "rewards/verify_math_reward/std": 0.4548610746860504, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 653.4866333007812, "completions/mean_terminated_length": 590.8954467773438, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8233236151603499, "grad_norm": 0.1900016963481903, "kl": 0.002742767333984375, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 56349542.0, "reward": 0.5401785969734192, "reward_std": 0.16006861627101898, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 756.8035888671875, "completions/mean_terminated_length": 664.8990478515625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8256559766763848, "grad_norm": 0.2148529291152954, "kl": 0.0026702880859375, "learning_rate": 1e-06, "loss": 0.0581, "num_tokens": 56545306.0, "reward": 0.4776785969734192, "reward_std": 0.19508221745491028, "rewards/verify_math_reward/mean": 0.4776785671710968, "rewards/verify_math_reward/std": 0.5006202459335327, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 669.0670166015625, "completions/mean_terminated_length": 622.5475463867188, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8279883381924198, "grad_norm": 0.2178030163049698, "kl": 0.003086090087890625, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 56719521.0, "reward": 0.5223214626312256, "reward_std": 0.20996278524398804, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 479.5357360839844, "completions/mean_terminated_length": 479.5357360839844, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8303206997084548, "grad_norm": 0.24629198014736176, "kl": 0.003513336181640625, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 56850889.0, "reward": 0.629464328289032, "reward_std": 0.17464692890644073, "rewards/verify_math_reward/mean": 0.6294642686843872, "rewards/verify_math_reward/std": 0.4840298891067505, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 509.5089416503906, "completions/mean_terminated_length": 477.1982116699219, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8326530612244898, "grad_norm": 0.22888772189617157, "kl": 0.002506256103515625, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 56983659.0, "reward": 0.7366071939468384, "reward_std": 0.1847190409898758, "rewards/verify_math_reward/mean": 0.7366071343421936, "rewards/verify_math_reward/std": 0.44146019220352173, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 608.5223388671875, "completions/mean_terminated_length": 577.1036376953125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8349854227405248, "grad_norm": 0.24409568309783936, "kl": 0.002948760986328125, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 57145352.0, "reward": 0.4375000298023224, "reward_std": 0.2051515281200409, "rewards/verify_math_reward/mean": 0.4375, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 702.482177734375, "completions/mean_terminated_length": 656.4163208007812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8373177842565598, "grad_norm": 0.222054123878479, "kl": 0.0023956298828125, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 57324172.0, "reward": 0.4910714626312256, "reward_std": 0.2526620328426361, "rewards/verify_math_reward/mean": 0.4910714328289032, "rewards/verify_math_reward/std": 0.5010399222373962, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 594.40625, "completions/mean_terminated_length": 562.8603515625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8396501457725948, "grad_norm": 0.2607190012931824, "kl": 0.002899169921875, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 57479383.0, "reward": 0.6383928656578064, "reward_std": 0.1931336224079132, "rewards/verify_math_reward/mean": 0.6383928656578064, "rewards/verify_math_reward/std": 0.48154187202453613, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 615.5402221679688, "completions/mean_terminated_length": 536.0775756835938, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8419825072886298, "grad_norm": 0.26173636317253113, "kl": 0.002773284912109375, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 57640688.0, "reward": 0.5491071939468384, "reward_std": 0.22471500933170319, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 764.3214721679688, "completions/mean_terminated_length": 656.847900390625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8443148688046648, "grad_norm": 0.240379199385643, "kl": 0.0029144287109375, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 57834704.0, "reward": 0.5178571939468384, "reward_std": 0.22558683156967163, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 563.4866333007812, "completions/mean_terminated_length": 547.645751953125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8466472303206997, "grad_norm": 0.24983163177967072, "kl": 0.003002166748046875, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 57982949.0, "reward": 0.6160714626312256, "reward_std": 0.15872061252593994, "rewards/verify_math_reward/mean": 0.6160714030265808, "rewards/verify_math_reward/std": 0.4874300956726074, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 533.2767944335938, "completions/mean_terminated_length": 501.1802062988281, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8489795918367347, "grad_norm": 0.25354379415512085, "kl": 0.003021240234375, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 58122107.0, "reward": 0.7276785969734192, "reward_std": 0.1690966635942459, "rewards/verify_math_reward/mean": 0.7276785969734192, "rewards/verify_math_reward/std": 0.4461514353752136, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 599.28125, "completions/mean_terminated_length": 567.779296875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8513119533527697, "grad_norm": 0.23023010790348053, "kl": 0.002780914306640625, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 58276594.0, "reward": 0.6830357313156128, "reward_std": 0.20320014655590057, "rewards/verify_math_reward/mean": 0.6830357313156128, "rewards/verify_math_reward/std": 0.4663354754447937, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 638.3170166015625, "completions/mean_terminated_length": 575.4500122070312, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8536443148688047, "grad_norm": 0.23316563665866852, "kl": 0.003299713134765625, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 58441009.0, "reward": 0.598214328289032, "reward_std": 0.18727383017539978, "rewards/verify_math_reward/mean": 0.5982142686843872, "rewards/verify_math_reward/std": 0.49135705828666687, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 627.5, "completions/mean_terminated_length": 564.4363403320312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8559766763848397, "grad_norm": 0.25327152013778687, "kl": 0.002445220947265625, "learning_rate": 1e-06, "loss": -0.0198, "num_tokens": 58603377.0, "reward": 0.7098214626312256, "reward_std": 0.14023670554161072, "rewards/verify_math_reward/mean": 0.7098214030265808, "rewards/verify_math_reward/std": 0.4548610746860504, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 703.9553833007812, "completions/mean_terminated_length": 642.2817993164062, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8583090379008746, "grad_norm": 0.2600734233856201, "kl": 0.002674102783203125, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 58784655.0, "reward": 0.4955357313156128, "reward_std": 0.28737616539001465, "rewards/verify_math_reward/mean": 0.4955357015132904, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 624.1116333007812, "completions/mean_terminated_length": 544.8447265625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8606413994169096, "grad_norm": 0.2903830409049988, "kl": 0.003284454345703125, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 58949656.0, "reward": 0.5, "reward_std": 0.25730663537979126, "rewards/verify_math_reward/mean": 0.5, "rewards/verify_math_reward/std": 0.5011197924613953, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3677.0, "completions/mean_length": 633.2277221679688, "completions/mean_terminated_length": 586.2217407226562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8629737609329446, "grad_norm": 0.2585247755050659, "kl": 0.00289154052734375, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 59116291.0, "reward": 0.5491071939468384, "reward_std": 0.29488059878349304, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 554.4330444335938, "completions/mean_terminated_length": 538.5515747070312, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8653061224489796, "grad_norm": 0.2985505759716034, "kl": 0.002960205078125, "learning_rate": 1e-06, "loss": 0.0832, "num_tokens": 59265532.0, "reward": 0.7142857313156128, "reward_std": 0.2944483458995819, "rewards/verify_math_reward/mean": 0.7142857313156128, "rewards/verify_math_reward/std": 0.45276570320129395, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 554.7902221679688, "completions/mean_terminated_length": 538.9103393554688, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8676384839650145, "grad_norm": 0.2962406873703003, "kl": 0.002971649169921875, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 59412301.0, "reward": 0.59375, "reward_std": 0.2864660322666168, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 693.4420166015625, "completions/mean_terminated_length": 631.5772705078125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8699708454810495, "grad_norm": 0.20983396470546722, "kl": 0.00283050537109375, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 59586864.0, "reward": 0.5401785969734192, "reward_std": 0.1976454108953476, "rewards/verify_math_reward/mean": 0.5401785969734192, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 654.9152221679688, "completions/mean_terminated_length": 623.9144287109375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8723032069970845, "grad_norm": 0.19895663857460022, "kl": 0.0031280517578125, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 59751733.0, "reward": 0.504464328289032, "reward_std": 0.1609787493944168, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 584.3660888671875, "completions/mean_terminated_length": 536.6968383789062, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8746355685131195, "grad_norm": 0.26324746012687683, "kl": 0.00290679931640625, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 59901871.0, "reward": 0.6964285969734192, "reward_std": 0.20997007191181183, "rewards/verify_math_reward/mean": 0.6964285969734192, "rewards/verify_math_reward/std": 0.46082955598831177, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 535.482177734375, "completions/mean_terminated_length": 535.482177734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.8769679300291545, "grad_norm": 0.2551153004169464, "kl": 0.003528594970703125, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 60041931.0, "reward": 0.6071428656578064, "reward_std": 0.17659834027290344, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3321.0, "completions/mean_length": 632.5535888671875, "completions/mean_terminated_length": 617.0224609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8793002915451895, "grad_norm": 0.26995542645454407, "kl": 0.00276947021484375, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 60206087.0, "reward": 0.59375, "reward_std": 0.26121222972869873, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 668.0848388671875, "completions/mean_terminated_length": 573.738525390625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8816326530612245, "grad_norm": 0.2051822394132614, "kl": 0.00415802001953125, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 60374274.0, "reward": 0.53125, "reward_std": 0.19330193102359772, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 734.4642944335938, "completions/mean_terminated_length": 657.7168579101562, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8839650145772595, "grad_norm": 0.2789381444454193, "kl": 0.002765655517578125, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 60561146.0, "reward": 0.486607164144516, "reward_std": 0.2714526653289795, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 639.8705444335938, "completions/mean_terminated_length": 592.9547729492188, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8862973760932945, "grad_norm": 0.26450979709625244, "kl": 0.002971649169921875, "learning_rate": 1e-06, "loss": 0.0458, "num_tokens": 60728621.0, "reward": 0.5580357313156128, "reward_std": 0.204110249876976, "rewards/verify_math_reward/mean": 0.5580357313156128, "rewards/verify_math_reward/std": 0.49773266911506653, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 623.7098388671875, "completions/mean_terminated_length": 560.5772705078125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8886297376093294, "grad_norm": 0.27074259519577026, "kl": 0.00325775146484375, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 60889812.0, "reward": 0.4821428656578064, "reward_std": 0.24003231525421143, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 622.0848388671875, "completions/mean_terminated_length": 606.5067749023438, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8909620991253644, "grad_norm": 0.2521023154258728, "kl": 0.003017425537109375, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 61048655.0, "reward": 0.625, "reward_std": 0.24229323863983154, "rewards/verify_math_reward/mean": 0.625, "rewards/verify_math_reward/std": 0.4852071702480316, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 630.4152221679688, "completions/mean_terminated_length": 614.8744506835938, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.8932944606413994, "grad_norm": 0.279275506734848, "kl": 0.00324249267578125, "learning_rate": 1e-06, "loss": -0.0305, "num_tokens": 61211604.0, "reward": 0.566964328289032, "reward_std": 0.25851622223854065, "rewards/verify_math_reward/mean": 0.5669642686843872, "rewards/verify_math_reward/std": 0.49660524725914, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 584.9910888671875, "completions/mean_terminated_length": 553.3603515625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8956268221574344, "grad_norm": 0.24243345856666565, "kl": 0.0030517578125, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 61363386.0, "reward": 0.6071428656578064, "reward_std": 0.2167326956987381, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 601.34375, "completions/mean_terminated_length": 553.905029296875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8979591836734694, "grad_norm": 0.2959814965724945, "kl": 0.00304412841796875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 61525207.0, "reward": 0.5535714626312256, "reward_std": 0.2672403156757355, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3316.0, "completions/mean_length": 669.3303833007812, "completions/mean_terminated_length": 622.8145141601562, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9002915451895044, "grad_norm": 0.2504737973213196, "kl": 0.002788543701171875, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 61702753.0, "reward": 0.5223214626312256, "reward_std": 0.23764583468437195, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 622.4017944335938, "completions/mean_terminated_length": 591.1080932617188, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.9026239067055394, "grad_norm": 0.2649012804031372, "kl": 0.00327301025390625, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 61858083.0, "reward": 0.6071428656578064, "reward_std": 0.2083180993795395, "rewards/verify_math_reward/mean": 0.6071428656578064, "rewards/verify_math_reward/std": 0.48947930335998535, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 659.0670166015625, "completions/mean_terminated_length": 596.5772705078125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9049562682215744, "grad_norm": 0.3432794213294983, "kl": 0.002933502197265625, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 62026362.0, "reward": 0.59375, "reward_std": 0.21356894075870514, "rewards/verify_math_reward/mean": 0.59375, "rewards/verify_math_reward/std": 0.4922322630882263, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 688.9732666015625, "completions/mean_terminated_length": 611.1871948242188, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9072886297376094, "grad_norm": 0.1855618804693222, "kl": 0.002826690673828125, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 62203636.0, "reward": 0.53125, "reward_std": 0.17330171167850494, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 750.3303833007812, "completions/mean_terminated_length": 658.2476806640625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9096209912536443, "grad_norm": 0.23227335512638092, "kl": 0.002773284912109375, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 62392654.0, "reward": 0.4375000298023224, "reward_std": 0.19690078496932983, "rewards/verify_math_reward/mean": 0.4375, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3690.0, "completions/mean_length": 678.8348388671875, "completions/mean_terminated_length": 568.6036987304688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9119533527696793, "grad_norm": 0.20150570571422577, "kl": 0.0031280517578125, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 62564121.0, "reward": 0.5267857313156128, "reward_std": 0.17555983364582062, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.5004002451896667, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 669.5357666015625, "completions/mean_terminated_length": 623.0226440429688, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9142857142857143, "grad_norm": 0.22798162698745728, "kl": 0.002750396728515625, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 62736729.0, "reward": 0.4955357313156128, "reward_std": 0.22094503045082092, "rewards/verify_math_reward/mean": 0.4955357015132904, "rewards/verify_math_reward/std": 0.5010998249053955, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 544.7767944335938, "completions/mean_terminated_length": 496.5701599121094, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9166180758017493, "grad_norm": 0.26611626148223877, "kl": 0.003292083740234375, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 62883903.0, "reward": 0.4821428656578064, "reward_std": 0.23040536046028137, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 812.1607666015625, "completions/mean_terminated_length": 609.8388671875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9189504373177843, "grad_norm": 0.2215123325586319, "kl": 0.002330780029296875, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 63089571.0, "reward": 0.4821428656578064, "reward_std": 0.23205281794071198, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 793.4642944335938, "completions/mean_terminated_length": 686.9308471679688, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.9212827988338192, "grad_norm": 0.21604013442993164, "kl": 0.00289154052734375, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 63286203.0, "reward": 0.4687500298023224, "reward_std": 0.18276484310626984, "rewards/verify_math_reward/mean": 0.46875, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 583.482177734375, "completions/mean_terminated_length": 567.73095703125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9236151603498542, "grad_norm": 0.2729056775569916, "kl": 0.003192901611328125, "learning_rate": 1e-06, "loss": 0.0881, "num_tokens": 63448335.0, "reward": 0.5625, "reward_std": 0.24815024435520172, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 638.8660888671875, "completions/mean_terminated_length": 559.93603515625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9259475218658892, "grad_norm": 0.2541603744029999, "kl": 0.00305938720703125, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 63612353.0, "reward": 0.5803571939468384, "reward_std": 0.2024611085653305, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460577964782715, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 642.1205444335938, "completions/mean_terminated_length": 595.2352905273438, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9282798833819242, "grad_norm": 0.24556109309196472, "kl": 0.0027008056640625, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 63779372.0, "reward": 0.5446428656578064, "reward_std": 0.19539175927639008, "rewards/verify_math_reward/mean": 0.5446428656578064, "rewards/verify_math_reward/std": 0.4991183578968048, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 755.2053833007812, "completions/mean_terminated_length": 663.2568359375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9306122448979591, "grad_norm": 0.22407422959804535, "kl": 0.002590179443359375, "learning_rate": 1e-06, "loss": 0.0852, "num_tokens": 63972642.0, "reward": 0.5535714626312256, "reward_std": 0.23673290014266968, "rewards/verify_math_reward/mean": 0.5535714030265808, "rewards/verify_math_reward/std": 0.49823519587516785, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 624.4642944335938, "completions/mean_terminated_length": 577.3394165039062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9329446064139941, "grad_norm": 0.2202650010585785, "kl": 0.002841949462890625, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 64134586.0, "reward": 0.6026785969734192, "reward_std": 0.19642741978168488, "rewards/verify_math_reward/mean": 0.6026785969734192, "rewards/verify_math_reward/std": 0.4904395043849945, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 544.9241333007812, "completions/mean_terminated_length": 544.9241333007812, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9352769679300291, "grad_norm": 0.3074655830860138, "kl": 0.003437042236328125, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 64276137.0, "reward": 0.6741071939468384, "reward_std": 0.2475440502166748, "rewards/verify_math_reward/mean": 0.6741071343421936, "rewards/verify_math_reward/std": 0.46975722908973694, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 660.6160888671875, "completions/mean_terminated_length": 598.154541015625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.9376093294460641, "grad_norm": 0.20243020355701447, "kl": 0.002689361572265625, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 64452179.0, "reward": 0.504464328289032, "reward_std": 0.15060995519161224, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 662.9553833007812, "completions/mean_terminated_length": 552.2119750976562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9399416909620991, "grad_norm": 0.25982972979545593, "kl": 0.0028228759765625, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 64619345.0, "reward": 0.5714285969734192, "reward_std": 0.2445441037416458, "rewards/verify_math_reward/mean": 0.5714285969734192, "rewards/verify_math_reward/std": 0.49597999453544617, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 628.4330444335938, "completions/mean_terminated_length": 612.8834228515625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9422740524781341, "grad_norm": 0.253924697637558, "kl": 0.00443267822265625, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 64779418.0, "reward": 0.5267857313156128, "reward_std": 0.2576133608818054, "rewards/verify_math_reward/mean": 0.5267857313156128, "rewards/verify_math_reward/std": 0.500400185585022, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 614.3214721679688, "completions/mean_terminated_length": 614.3214721679688, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9446064139941691, "grad_norm": 0.25928112864494324, "kl": 0.0035400390625, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 64947074.0, "reward": 0.4196428656578064, "reward_std": 0.22094222903251648, "rewards/verify_math_reward/mean": 0.4196428656578064, "rewards/verify_math_reward/std": 0.49460577964782715, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 612.875, "completions/mean_terminated_length": 565.5927734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9469387755102041, "grad_norm": 0.2391643524169922, "kl": 0.00316619873046875, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 65105822.0, "reward": 0.5803571939468384, "reward_std": 0.18262921273708344, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 568.1160888671875, "completions/mean_terminated_length": 536.3333740234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9492711370262391, "grad_norm": 0.2419482320547104, "kl": 0.003387451171875, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 65263992.0, "reward": 0.65625, "reward_std": 0.1892252415418625, "rewards/verify_math_reward/mean": 0.65625, "rewards/verify_math_reward/std": 0.4760226309299469, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 648.419677734375, "completions/mean_terminated_length": 601.6199340820312, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9516034985422741, "grad_norm": 0.20338350534439087, "kl": 0.002971649169921875, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 65426726.0, "reward": 0.5446428656578064, "reward_std": 0.17720730602741241, "rewards/verify_math_reward/mean": 0.5446428656578064, "rewards/verify_math_reward/std": 0.4991183578968048, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 640.0402221679688, "completions/mean_terminated_length": 608.9053955078125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.953935860058309, "grad_norm": 0.23347207903862, "kl": 0.003124237060546875, "learning_rate": 1e-06, "loss": 0.0371, "num_tokens": 65593551.0, "reward": 0.5625, "reward_std": 0.20124875009059906, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 710.607177734375, "completions/mean_terminated_length": 585.2222290039062, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.956268221574344, "grad_norm": 0.16663672029972076, "kl": 0.0030059814453125, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 65770463.0, "reward": 0.5223214626312256, "reward_std": 0.15225742757320404, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3761.0, "completions/mean_length": 804.919677734375, "completions/mean_terminated_length": 775.270263671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.958600583090379, "grad_norm": 0.21586303412914276, "kl": 0.002864837646484375, "learning_rate": 1e-06, "loss": -0.0089, "num_tokens": 65971501.0, "reward": 0.4687500298023224, "reward_std": 0.19525612890720367, "rewards/verify_math_reward/mean": 0.46875, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 623.107177734375, "completions/mean_terminated_length": 607.53369140625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.960932944606414, "grad_norm": 0.19395920634269714, "kl": 0.0031585693359375, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 66136613.0, "reward": 0.4821428656578064, "reward_std": 0.1717798113822937, "rewards/verify_math_reward/mean": 0.4821428656578064, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 620.6517944335938, "completions/mean_terminated_length": 589.3423461914062, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.963265306122449, "grad_norm": 0.2274623066186905, "kl": 0.0033721923828125, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 66300647.0, "reward": 0.486607164144516, "reward_std": 0.20216168463230133, "rewards/verify_math_reward/mean": 0.4866071343421936, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 594.7857666015625, "completions/mean_terminated_length": 563.2432861328125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.965597667638484, "grad_norm": 0.26743146777153015, "kl": 0.002994537353515625, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 66453927.0, "reward": 0.6830357313156128, "reward_std": 0.24333453178405762, "rewards/verify_math_reward/mean": 0.6830357313156128, "rewards/verify_math_reward/std": 0.4663354456424713, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 651.0535888671875, "completions/mean_terminated_length": 604.2896118164062, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.967930029154519, "grad_norm": 0.2531168758869171, "kl": 0.002796173095703125, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 66620363.0, "reward": 0.504464328289032, "reward_std": 0.25356483459472656, "rewards/verify_math_reward/mean": 0.5044642686843872, "rewards/verify_math_reward/std": 0.5010998845100403, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 543.40625, "completions/mean_terminated_length": 527.475341796875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.970262390670554, "grad_norm": 0.30833593010902405, "kl": 0.00341796875, "learning_rate": 1e-06, "loss": 0.0459, "num_tokens": 66769374.0, "reward": 0.6205357313156128, "reward_std": 0.27639955282211304, "rewards/verify_math_reward/mean": 0.6205357313156128, "rewards/verify_math_reward/std": 0.4863404929637909, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 561.4285888671875, "completions/mean_terminated_length": 529.5855712890625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.972594752186589, "grad_norm": 0.2585616707801819, "kl": 0.0034637451171875, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 66920702.0, "reward": 0.53125, "reward_std": 0.2125304490327835, "rewards/verify_math_reward/mean": 0.53125, "rewards/verify_math_reward/std": 0.5001401305198669, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 698.6830444335938, "completions/mean_terminated_length": 605.1788940429688, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9749271137026239, "grad_norm": 0.2406994253396988, "kl": 0.002899169921875, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 67100599.0, "reward": 0.4598214626312256, "reward_std": 0.21777845919132233, "rewards/verify_math_reward/mean": 0.4598214328289032, "rewards/verify_math_reward/std": 0.49949926137924194, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 579.0089721679688, "completions/mean_terminated_length": 547.3243408203125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9772594752186589, "grad_norm": 0.30493125319480896, "kl": 0.003597259521484375, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 67250985.0, "reward": 0.5758928656578064, "reward_std": 0.26663413643836975, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135848045349, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 474.0535888671875, "completions/mean_terminated_length": 457.8116760253906, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9795918367346939, "grad_norm": 0.3409460186958313, "kl": 0.00598907470703125, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 67375421.0, "reward": 0.5803571939468384, "reward_std": 0.2634703516960144, "rewards/verify_math_reward/mean": 0.5803571343421936, "rewards/verify_math_reward/std": 0.49460574984550476, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 596.3616333007812, "completions/mean_terminated_length": 516.461181640625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9819241982507289, "grad_norm": 0.1891840696334839, "kl": 0.0029144287109375, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 67530294.0, "reward": 0.5848214626312256, "reward_std": 0.16592560708522797, "rewards/verify_math_reward/mean": 0.5848214030265808, "rewards/verify_math_reward/std": 0.49385640025138855, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 633.7723388671875, "completions/mean_terminated_length": 633.7723388671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9842565597667638, "grad_norm": 0.2414131462574005, "kl": 0.003475189208984375, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 67693523.0, "reward": 0.5133928656578064, "reward_std": 0.163971409201622, "rewards/verify_math_reward/mean": 0.5133928656578064, "rewards/verify_math_reward/std": 0.5009400248527527, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 559.9330444335938, "completions/mean_terminated_length": 511.9321594238281, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.9865889212827988, "grad_norm": 0.2653488516807556, "kl": 0.00396728515625, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 67838772.0, "reward": 0.5625, "reward_std": 0.22289641201496124, "rewards/verify_math_reward/mean": 0.5625, "rewards/verify_math_reward/std": 0.49718940258026123, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 705.4955444335938, "completions/mean_terminated_length": 612.1788940429688, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9889212827988338, "grad_norm": 0.20497190952301025, "kl": 0.003063201904296875, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 68016619.0, "reward": 0.5223214626312256, "reward_std": 0.19404374063014984, "rewards/verify_math_reward/mean": 0.5223214030265808, "rewards/verify_math_reward/std": 0.5006201863288879, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 651.3482666015625, "completions/mean_terminated_length": 540.2304077148438, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9912536443148688, "grad_norm": 0.24258698523044586, "kl": 0.00313568115234375, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 68190953.0, "reward": 0.4642857313156128, "reward_std": 0.20306451618671417, "rewards/verify_math_reward/mean": 0.4642857015132904, "rewards/verify_math_reward/std": 0.49983978271484375, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 570.625, "completions/mean_terminated_length": 506.5272521972656, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9935860058309038, "grad_norm": 0.29159656167030334, "kl": 0.00347900390625, "learning_rate": 1e-06, "loss": 0.043, "num_tokens": 68338653.0, "reward": 0.5491071939468384, "reward_std": 0.23296575248241425, "rewards/verify_math_reward/mean": 0.5491071343421936, "rewards/verify_math_reward/std": 0.49869707226753235, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 618.0982666015625, "completions/mean_terminated_length": 602.5022583007812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9959183673469387, "grad_norm": 0.2417595386505127, "kl": 0.004085540771484375, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 68498275.0, "reward": 0.5758928656578064, "reward_std": 0.23387587070465088, "rewards/verify_math_reward/mean": 0.5758928656578064, "rewards/verify_math_reward/std": 0.4953135550022125, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 703.9296875, "completions/mean_terminated_length": 622.52001953125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9982507288629737, "grad_norm": 0.25353655219078064, "kl": 0.00315093994140625, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 68666795.0, "reward": 0.5178571939468384, "reward_std": 0.22424164414405823, "rewards/verify_math_reward/mean": 0.5178571343421936, "rewards/verify_math_reward/std": 0.5008001327514648, "step": 428 }, { "epoch": 0.9982507288629737, "step": 428, "total_flos": 0.0, "train_loss": 0.025071051198389125, "train_runtime": 12494.3327, "train_samples_per_second": 0.96, "train_steps_per_second": 0.034 } ], "logging_steps": 1, "max_steps": 428, "num_input_tokens_seen": 68666795, "num_train_epochs": 1, "save_steps": 32, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }