{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 1018.7421875, "completions/mean_terminated_length": 351.0, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.00017777777777777779, "grad_norm": 2.5255208115234473, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 149375.0, "reward": 0.03125, "reward_std": 0.10519562661647797, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.00035555555555555557, "grad_norm": 0.9856625228948361, "kl": 0.0, "learning_rate": 3.571428571428571e-08, "loss": -0.0, "num_tokens": 299423.0, "reward": 0.0234375, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0005333333333333334, "grad_norm": 0.008067823648516024, "kl": 0.0017123222351074219, "learning_rate": 7.142857142857142e-08, "loss": 0.0, "num_tokens": 449471.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 1015.5078125, "completions/mean_terminated_length": 480.5, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.0007111111111111111, "grad_norm": 0.8004779226245522, "kl": 0.001720428466796875, "learning_rate": 1.0714285714285713e-07, "loss": 0.0, "num_tokens": 598416.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 1018.890625, "completions/mean_terminated_length": 370.0, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.0008888888888888889, "grad_norm": 1.3601894663943612, "kl": 0.0012969970703125, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "num_tokens": 747906.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0010666666666666667, "grad_norm": 1.3305505575438885, "kl": 0.0013217926025390625, "learning_rate": 1.7857142857142858e-07, "loss": 0.0, "num_tokens": 898066.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0012444444444444445, "grad_norm": 0.7934489136623586, "kl": 0.001041412353515625, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "num_tokens": 1048226.0, "reward": 0.0234375, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 1019.9765625, "completions/mean_terminated_length": 509.0, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.0014222222222222223, "grad_norm": 1.0923112437833142, "kl": 0.0012717247009277344, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 1197935.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 1021.7734375, "completions/mean_terminated_length": 739.0, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 0.0016, "grad_norm": 0.9137637558522659, "kl": 0.0012459754943847656, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "num_tokens": 1347650.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0017777777777777779, "grad_norm": 0.8044050805472117, "kl": 0.0012941360473632812, "learning_rate": 3.2142857142857145e-07, "loss": 0.0, "num_tokens": 1497682.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 1018.546875, "completions/mean_terminated_length": 675.0, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.0019555555555555554, "grad_norm": 0.0020805968811024493, "kl": 0.0012302398681640625, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "num_tokens": 1646952.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 1019.140625, "completions/mean_terminated_length": 713.0, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.0021333333333333334, "grad_norm": 1.1754417363368395, "kl": 0.0011076927185058594, "learning_rate": 3.928571428571428e-07, "loss": -0.0026, "num_tokens": 1796410.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.002311111111111111, "grad_norm": 0.5931893617425497, "kl": 0.0013408660888671875, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "num_tokens": 1946538.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 1017.6640625, "completions/mean_terminated_length": 213.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.002488888888888889, "grad_norm": 1.2751875270122845, "kl": 0.00154876708984375, "learning_rate": 4.6428571428571427e-07, "loss": -0.005, "num_tokens": 2095727.0, "reward": 0.046875, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 1018.125, "completions/mean_terminated_length": 272.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.0026666666666666666, "grad_norm": 2.4154141171390853, "kl": 0.0014505386352539062, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 2245103.0, "reward": 0.0390625, "reward_std": 0.09308473765850067, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 1019.984375, "completions/mean_terminated_length": 510.0, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.0028444444444444446, "grad_norm": 0.9633745180649104, "kl": 0.0015707015991210938, "learning_rate": 4.999935101463869e-07, "loss": 0.0, "num_tokens": 2394525.0, "reward": 0.0390625, "reward_std": 0.09308473765850067, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.003022222222222222, "grad_norm": 0.6380613836288761, "kl": 0.0014429092407226562, "learning_rate": 4.999740409224932e-07, "loss": 0.0, "num_tokens": 2544701.0, "reward": 0.0234375, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0032, "grad_norm": 0.5322415642426819, "kl": 0.0022706985473632812, "learning_rate": 4.999415933391384e-07, "loss": 0.0, "num_tokens": 2694813.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 1020.984375, "completions/mean_terminated_length": 638.0, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.0033777777777777777, "grad_norm": 1.441795768125264, "kl": 0.0020341873168945312, "learning_rate": 4.998961690809627e-07, "loss": -0.0015, "num_tokens": 2844427.0, "reward": 0.0390625, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0035555555555555557, "grad_norm": 1.1660926171762258, "kl": 0.0021762847900390625, "learning_rate": 4.998377705063407e-07, "loss": 0.0, "num_tokens": 2994555.0, "reward": 0.046875, "reward_std": 0.09108919650316238, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0037333333333333333, "grad_norm": 0.8182366403832771, "kl": 0.003204345703125, "learning_rate": 4.997664006472578e-07, "loss": 0.0, "num_tokens": 3144715.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 1023.4296875, "completions/mean_terminated_length": 951.0, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.003911111111111111, "grad_norm": 0.6296290763602951, "kl": 0.00325775146484375, "learning_rate": 4.996820632091536e-07, "loss": 0.0, "num_tokens": 3294706.0, "reward": 0.0234375, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 1022.515625, "completions/mean_terminated_length": 834.0, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.004088888888888889, "grad_norm": 0.8333548613867658, "kl": 0.00540924072265625, "learning_rate": 4.995847625707292e-07, "loss": 0.0, "num_tokens": 3444644.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.004266666666666667, "grad_norm": 0.9893470542829086, "kl": 0.004608154296875, "learning_rate": 4.994745037837194e-07, "loss": 0.0, "num_tokens": 3594676.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0044444444444444444, "grad_norm": 0.8313803334424121, "kl": 0.00600433349609375, "learning_rate": 4.993512925726318e-07, "loss": 0.0, "num_tokens": 3744740.0, "reward": 0.03125, "reward_std": 0.055901698768138885, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 1020.078125, "completions/mean_terminated_length": 522.0, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.004622222222222222, "grad_norm": 1.2645160981810022, "kl": 0.00774383544921875, "learning_rate": 4.992151353344481e-07, "loss": 0.009, "num_tokens": 3894414.0, "reward": 0.0390625, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 1023.21875, "completions/mean_terminated_length": 924.0, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.0048, "grad_norm": 1.9339041926895841, "kl": 0.0139923095703125, "learning_rate": 4.990660391382923e-07, "loss": -0.0004, "num_tokens": 4044250.0, "reward": 0.0703125, "reward_std": 0.21808473765850067, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.004977777777777778, "grad_norm": 10.116095365808855, "kl": 0.176483154296875, "learning_rate": 4.989040117250646e-07, "loss": 0.0002, "num_tokens": 4194394.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.005155555555555556, "grad_norm": 1.525182352130684, "kl": 0.010250091552734375, "learning_rate": 4.987290615070384e-07, "loss": 0.0, "num_tokens": 4344522.0, "reward": 0.0625, "reward_std": 0.125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.005333333333333333, "grad_norm": 0.6796263042992647, "kl": 0.012786865234375, "learning_rate": 4.985411975674243e-07, "loss": 0.0, "num_tokens": 4494618.0, "reward": 0.0390625, "reward_std": 0.059839196503162384, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.005511111111111111, "grad_norm": 0.7723841507446317, "kl": 0.01345062255859375, "learning_rate": 4.983404296598978e-07, "loss": 0.0, "num_tokens": 4644794.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.005688888888888889, "grad_norm": 0.6903790795082435, "kl": 0.01425933837890625, "learning_rate": 4.981267682080939e-07, "loss": 0.0, "num_tokens": 4794954.0, "reward": 0.015625, "reward_std": 0.042695626616477966, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.005866666666666667, "grad_norm": 2.3774032902786253, "kl": 0.082550048828125, "learning_rate": 4.979002243050646e-07, "loss": 0.0001, "num_tokens": 4944890.0, "reward": 0.1328125, "reward_std": 0.24952572584152222, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 1022.859375, "completions/mean_terminated_length": 878.0, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.006044444444444444, "grad_norm": 1.3636219310317463, "kl": 0.03136444091796875, "learning_rate": 4.976608097127043e-07, "loss": 0.0, "num_tokens": 5094728.0, "reward": 0.0546875, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 1020.5625, "completions/mean_terminated_length": 804.0, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.006222222222222222, "grad_norm": 1.957068688125507, "kl": 0.023651123046875, "learning_rate": 4.974085368611381e-07, "loss": -0.0016, "num_tokens": 5244368.0, "reward": 0.1484375, "reward_std": 0.2893695831298828, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0064, "grad_norm": 1.401578731698888, "kl": 0.0356903076171875, "learning_rate": 4.97143418848077e-07, "loss": 0.0, "num_tokens": 5394480.0, "reward": 0.0703125, "reward_std": 0.1687908172607422, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.006577777777777778, "grad_norm": 0.9693833044084433, "kl": 0.0342559814453125, "learning_rate": 4.968654694381379e-07, "loss": 0.0, "num_tokens": 5544656.0, "reward": 0.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0067555555555555554, "grad_norm": 2.042899773266861, "kl": 0.03973388671875, "learning_rate": 4.965747030621286e-07, "loss": 0.0, "num_tokens": 5694688.0, "reward": 0.203125, "reward_std": 0.26539260149002075, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 1019.3203125, "completions/mean_terminated_length": 425.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.006933333333333333, "grad_norm": 0.9244755631049519, "kl": 0.03350830078125, "learning_rate": 4.962711348162987e-07, "loss": 0.0, "num_tokens": 5844105.0, "reward": 0.0703125, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0071111111111111115, "grad_norm": 1.3956208166189303, "kl": 0.0409698486328125, "learning_rate": 4.959547804615562e-07, "loss": 0.0, "num_tokens": 5994057.0, "reward": 0.1015625, "reward_std": 0.19234731793403625, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.007288888888888889, "grad_norm": 1.0939386922259269, "kl": 0.047637939453125, "learning_rate": 4.956256564226487e-07, "loss": 0.0, "num_tokens": 6144169.0, "reward": 0.109375, "reward_std": 0.12704971432685852, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.007466666666666667, "grad_norm": 0.9823276052532137, "kl": 0.070220947265625, "learning_rate": 4.952837797873106e-07, "loss": 0.0001, "num_tokens": 6294217.0, "reward": 0.1484375, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.007644444444444444, "grad_norm": 1.6872545714805152, "kl": 0.074737548828125, "learning_rate": 4.949291683053768e-07, "loss": 0.0001, "num_tokens": 6444217.0, "reward": 0.171875, "reward_std": 0.182951420545578, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.007822222222222222, "grad_norm": 2.502150125467201, "kl": 0.091094970703125, "learning_rate": 4.9456184038786e-07, "loss": 0.0001, "num_tokens": 6594169.0, "reward": 0.078125, "reward_std": 0.17430339753627777, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.008, "grad_norm": 2.660271893400313, "kl": 0.073974609375, "learning_rate": 4.941818151059955e-07, "loss": 0.0001, "num_tokens": 6744137.0, "reward": 0.2265625, "reward_std": 0.28534942865371704, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.008177777777777779, "grad_norm": 1.3447168795347615, "kl": 0.0782470703125, "learning_rate": 4.937891121902508e-07, "loss": 0.0001, "num_tokens": 6894169.0, "reward": 0.1484375, "reward_std": 0.18904343247413635, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.008355555555555555, "grad_norm": 1.8006928888239953, "kl": 0.10235595703125, "learning_rate": 4.933837520293017e-07, "loss": 0.0001, "num_tokens": 7044169.0, "reward": 0.109375, "reward_std": 0.2000408172607422, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.008533333333333334, "grad_norm": 2.762282489622378, "kl": 0.1116943359375, "learning_rate": 4.929657556689726e-07, "loss": 0.0001, "num_tokens": 7194105.0, "reward": 0.1640625, "reward_std": 0.21763263642787933, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.00871111111111111, "grad_norm": 2.5581159742281967, "kl": 0.148712158203125, "learning_rate": 4.925351448111454e-07, "loss": 0.0001, "num_tokens": 7344233.0, "reward": 0.1484375, "reward_std": 0.14898642897605896, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.008888888888888889, "grad_norm": 1.4905993305804919, "kl": 0.139892578125, "learning_rate": 4.920919418126312e-07, "loss": 0.0001, "num_tokens": 7494249.0, "reward": 0.1640625, "reward_std": 0.15513263642787933, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.009066666666666667, "grad_norm": 3.7196426012925587, "kl": 0.0999755859375, "learning_rate": 4.91636169684011e-07, "loss": 0.0001, "num_tokens": 7644345.0, "reward": 0.203125, "reward_std": 0.3239234387874603, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.009244444444444444, "grad_norm": 1.2282928962932194, "kl": 0.13177490234375, "learning_rate": 4.911678520884398e-07, "loss": 0.0001, "num_tokens": 7794473.0, "reward": 0.203125, "reward_std": 0.2722259759902954, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.009422222222222222, "grad_norm": 0.5881369515153079, "kl": 0.100677490234375, "learning_rate": 4.906870133404186e-07, "loss": 0.0001, "num_tokens": 7944681.0, "reward": 0.0546875, "reward_std": 0.06404343992471695, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0096, "grad_norm": 1.7099570297282298, "kl": 0.178436279296875, "learning_rate": 4.901936784045324e-07, "loss": 0.0002, "num_tokens": 8094761.0, "reward": 0.0703125, "reward_std": 0.10673906654119492, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.009777777777777778, "grad_norm": 1.1618918846574768, "kl": 0.11260986328125, "learning_rate": 4.896878728941531e-07, "loss": 0.0001, "num_tokens": 8244985.0, "reward": 0.1484375, "reward_std": 0.1784759908914566, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.009955555555555556, "grad_norm": 1.6157595252800032, "kl": 0.1453857421875, "learning_rate": 4.891696230701103e-07, "loss": 0.0001, "num_tokens": 8395209.0, "reward": 0.3046875, "reward_std": 0.2733173370361328, "rewards/equation_reward_func/mean": 0.296875, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.010133333333333333, "grad_norm": 5.48741216187371, "kl": 0.172119140625, "learning_rate": 4.886389558393284e-07, "loss": 0.0002, "num_tokens": 8545401.0, "reward": 0.1953125, "reward_std": 0.22072336077690125, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 1017.421875, "completions/mean_terminated_length": 182.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.010311111111111111, "grad_norm": 12.73357583618356, "kl": 0.29205322265625, "learning_rate": 4.880958987534282e-07, "loss": 0.0092, "num_tokens": 8694655.0, "reward": 0.1640625, "reward_std": 0.1848391890525818, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.01048888888888889, "grad_norm": 0.6150431573998991, "kl": 0.11785888671875, "learning_rate": 4.875404800072976e-07, "loss": 0.0001, "num_tokens": 8844943.0, "reward": 0.03125, "reward_std": 0.055901698768138885, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.010666666666666666, "grad_norm": 1.3031664632582245, "kl": 0.1180419921875, "learning_rate": 4.869727284376277e-07, "loss": 0.0001, "num_tokens": 8995183.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.010844444444444445, "grad_norm": 1.49150694128507, "kl": 0.2772216796875, "learning_rate": 4.86392673521415e-07, "loss": 0.0003, "num_tokens": 9145167.0, "reward": 0.109375, "reward_std": 0.12704971432685852, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.011022222222222221, "grad_norm": 1.276262157388709, "kl": 0.13037109375, "learning_rate": 4.858003453744314e-07, "loss": 0.0001, "num_tokens": 9295183.0, "reward": 0.140625, "reward_std": 0.19628483057022095, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0112, "grad_norm": 1.6201153044468526, "kl": 0.200164794921875, "learning_rate": 4.851957747496606e-07, "loss": 0.0002, "num_tokens": 9445231.0, "reward": 0.3515625, "reward_std": 0.3422882854938507, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.011377777777777778, "grad_norm": 1.1055713016253716, "kl": 0.09039306640625, "learning_rate": 4.845789930357016e-07, "loss": 0.0001, "num_tokens": 9595311.0, "reward": 0.2109375, "reward_std": 0.12233919650316238, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.011555555555555555, "grad_norm": 1.1038951823064433, "kl": 0.11285400390625, "learning_rate": 4.839500322551386e-07, "loss": 0.0001, "num_tokens": 9745487.0, "reward": 0.2265625, "reward_std": 0.2396235316991806, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.011733333333333333, "grad_norm": 1.1541214832914621, "kl": 0.191650390625, "learning_rate": 4.833089250628786e-07, "loss": 0.0002, "num_tokens": 9895727.0, "reward": 0.140625, "reward_std": 0.20597384870052338, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.011911111111111112, "grad_norm": 2.2256022872280807, "kl": 0.25091552734375, "learning_rate": 4.826557047444563e-07, "loss": 0.0003, "num_tokens": 10045695.0, "reward": 0.109375, "reward_std": 0.16769562661647797, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.012088888888888889, "grad_norm": 7.522155217602482, "kl": 0.75909423828125, "learning_rate": 4.819904052143058e-07, "loss": 0.0008, "num_tokens": 10195807.0, "reward": 0.109375, "reward_std": 0.12388263642787933, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.012266666666666667, "grad_norm": 1.2200205066895753, "kl": 0.12750244140625, "learning_rate": 4.813130610139993e-07, "loss": 0.0001, "num_tokens": 10345871.0, "reward": 0.3359375, "reward_std": 0.2896904945373535, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.012444444444444444, "grad_norm": 0.609384418548224, "kl": 0.10894775390625, "learning_rate": 4.806237073104548e-07, "loss": 0.0001, "num_tokens": 10496063.0, "reward": 0.1796875, "reward_std": 0.10673906654119492, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.012622222222222222, "grad_norm": 2.3540125585368235, "kl": 0.20880126953125, "learning_rate": 4.799223798941089e-07, "loss": 0.0002, "num_tokens": 10646031.0, "reward": 0.1875, "reward_std": 0.2000408172607422, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0128, "grad_norm": 0.8370080443574314, "kl": 0.1231689453125, "learning_rate": 4.792091151770602e-07, "loss": 0.0001, "num_tokens": 10796063.0, "reward": 0.0703125, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.012977777777777777, "grad_norm": 1.0417505391889215, "kl": 0.177490234375, "learning_rate": 4.78483950191177e-07, "loss": 0.0002, "num_tokens": 10946095.0, "reward": 0.078125, "reward_std": 0.1379890739917755, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.013155555555555556, "grad_norm": 1.5063770044122746, "kl": 0.47418212890625, "learning_rate": 4.777469225861765e-07, "loss": 0.0005, "num_tokens": 11096303.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.013333333333333334, "grad_norm": 0.8415854268223607, "kl": 0.115478515625, "learning_rate": 4.769980706276687e-07, "loss": 0.0001, "num_tokens": 11246415.0, "reward": 0.1796875, "reward_std": 0.16503483057022095, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.013511111111111111, "grad_norm": 2.161975707055707, "kl": 0.201416015625, "learning_rate": 4.762374331951703e-07, "loss": 0.0002, "num_tokens": 11396543.0, "reward": 0.1328125, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.01368888888888889, "grad_norm": 2.9454579795519864, "kl": 0.31805419921875, "learning_rate": 4.7546504978008595e-07, "loss": 0.0003, "num_tokens": 11546463.0, "reward": 0.25, "reward_std": 0.2807757258415222, "rewards/equation_reward_func/mean": 0.25, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.013866666666666666, "grad_norm": 2.8774585156255648, "kl": 0.1600341796875, "learning_rate": 4.7468096048365814e-07, "loss": 0.0002, "num_tokens": 11696511.0, "reward": 0.171875, "reward_std": 0.182951420545578, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.014044444444444444, "grad_norm": 10.21789522347878, "kl": 1.0467529296875, "learning_rate": 4.738852060148848e-07, "loss": 0.001, "num_tokens": 11846623.0, "reward": 0.2734375, "reward_std": 0.1626407653093338, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.014222222222222223, "grad_norm": 11.825855342156963, "kl": 1.4114990234375, "learning_rate": 4.730778276884061e-07, "loss": 0.0014, "num_tokens": 11996703.0, "reward": 0.1953125, "reward_std": 0.10629080981016159, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0144, "grad_norm": 2.4602106569208506, "kl": 0.1788330078125, "learning_rate": 4.722588674223593e-07, "loss": 0.0002, "num_tokens": 12146799.0, "reward": 0.15625, "reward_std": 0.120451420545578, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.014577777777777778, "grad_norm": 1.524987140594139, "kl": 0.194091796875, "learning_rate": 4.7142836773620227e-07, "loss": 0.0002, "num_tokens": 12296879.0, "reward": 0.0703125, "reward_std": 0.14898642897605896, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 1020.3046875, "completions/mean_terminated_length": 551.0, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.014755555555555555, "grad_norm": 1.4986869163411072, "kl": 0.27734375, "learning_rate": 4.70586371748506e-07, "loss": 0.0137, "num_tokens": 12446406.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 1022.171875, "completions/mean_terminated_length": 790.0, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.014933333333333333, "grad_norm": 2.0186353352580753, "kl": 0.294921875, "learning_rate": 4.6973292317471635e-07, "loss": 0.005, "num_tokens": 12596172.0, "reward": 0.140625, "reward_std": 0.14789125323295593, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.015111111111111112, "grad_norm": 1.8860819234972812, "kl": 0.1568603515625, "learning_rate": 4.6886806632488363e-07, "loss": 0.0002, "num_tokens": 12746316.0, "reward": 0.2109375, "reward_std": 0.12233919650316238, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.015288888888888888, "grad_norm": 2.911879124308087, "kl": 0.290283203125, "learning_rate": 4.679918461013627e-07, "loss": 0.0003, "num_tokens": 12896364.0, "reward": 0.3125, "reward_std": 0.125, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.015466666666666667, "grad_norm": 0.8653363250654178, "kl": 0.20477294921875, "learning_rate": 4.6710430799648143e-07, "loss": 0.0002, "num_tokens": 13046476.0, "reward": 0.0390625, "reward_std": 0.059839196503162384, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.015644444444444443, "grad_norm": 1.5145530729286887, "kl": 0.220947265625, "learning_rate": 4.6620549809017885e-07, "loss": 0.0002, "num_tokens": 13196508.0, "reward": 0.109375, "reward_std": 0.11840169876813889, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.015822222222222224, "grad_norm": 1.3321962131494147, "kl": 0.1405029296875, "learning_rate": 4.652954630476127e-07, "loss": 0.0001, "num_tokens": 13346572.0, "reward": 0.203125, "reward_std": 0.2257782220840454, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.016, "grad_norm": 626.6666909293125, "kl": 39.848876953125, "learning_rate": 4.643742501167366e-07, "loss": 0.0399, "num_tokens": 13496764.0, "reward": 0.078125, "reward_std": 0.10724534839391708, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 1019.9921875, "completions/mean_terminated_length": 511.0, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.016177777777777777, "grad_norm": 1.6421861144685825, "kl": 0.20947265625, "learning_rate": 4.6344190712584713e-07, "loss": 0.0015, "num_tokens": 13646299.0, "reward": 0.28125, "reward_std": 0.2638174891471863, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 1018.921875, "completions/mean_terminated_length": 374.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.016355555555555557, "grad_norm": 1.9984137973449856, "kl": 0.1668701171875, "learning_rate": 4.624984824811006e-07, "loss": 0.0002, "num_tokens": 13795777.0, "reward": 0.3203125, "reward_std": 0.33808404207229614, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.016533333333333334, "grad_norm": 0.8981906764251413, "kl": 0.30419921875, "learning_rate": 4.615440251639995e-07, "loss": 0.0003, "num_tokens": 13945921.0, "reward": 0.15625, "reward_std": 0.15779343247413635, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.01671111111111111, "grad_norm": 1.8380422986227603, "kl": 0.29052734375, "learning_rate": 4.605785847288502e-07, "loss": 0.0003, "num_tokens": 14096097.0, "reward": 0.2265625, "reward_std": 0.2664783000946045, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.016888888888888887, "grad_norm": 1.392364683616067, "kl": 0.18310546875, "learning_rate": 4.596022113001894e-07, "loss": 0.0002, "num_tokens": 14246033.0, "reward": 0.4140625, "reward_std": 0.28719252347946167, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017066666666666667, "grad_norm": 0.8779774547481551, "kl": 0.150634765625, "learning_rate": 4.5861495557018206e-07, "loss": 0.0002, "num_tokens": 14395937.0, "reward": 0.234375, "reward_std": 0.1566799283027649, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017244444444444444, "grad_norm": 0.6923204569868814, "kl": 0.167236328125, "learning_rate": 4.576168687959895e-07, "loss": 0.0002, "num_tokens": 14545985.0, "reward": 0.140625, "reward_std": 0.11022830754518509, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 1020.5546875, "completions/mean_terminated_length": 583.0, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.01742222222222222, "grad_norm": 15.319283484189588, "kl": 0.8560791015625, "learning_rate": 4.566080027971082e-07, "loss": -0.0019, "num_tokens": 14695576.0, "reward": 0.28125, "reward_std": 0.21076759696006775, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0176, "grad_norm": 1.081729411248261, "kl": 0.1512451171875, "learning_rate": 4.555884099526793e-07, "loss": 0.0002, "num_tokens": 14845784.0, "reward": 0.140625, "reward_std": 0.1375408172607422, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017777777777777778, "grad_norm": 0.8355772217383763, "kl": 0.190185546875, "learning_rate": 4.545581431987694e-07, "loss": 0.0002, "num_tokens": 14995912.0, "reward": 0.2890625, "reward_std": 0.14568254351615906, "rewards/equation_reward_func/mean": 0.2890625, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017955555555555554, "grad_norm": 1.3710233539627825, "kl": 0.201416015625, "learning_rate": 4.5351725602562174e-07, "loss": 0.0002, "num_tokens": 15145944.0, "reward": 0.3671875, "reward_std": 0.2792541980743408, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.018133333333333335, "grad_norm": 1.666642469129247, "kl": 0.260986328125, "learning_rate": 4.5246580247487933e-07, "loss": 0.0003, "num_tokens": 15296120.0, "reward": 0.3203125, "reward_std": 0.18417394161224365, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 1020.671875, "completions/mean_terminated_length": 598.0, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.01831111111111111, "grad_norm": 1.24293148615849, "kl": 0.18408203125, "learning_rate": 4.514038371367791e-07, "loss": 0.0014, "num_tokens": 15445742.0, "reward": 0.2734375, "reward_std": 0.25087815523147583, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.018488888888888888, "grad_norm": 1.2403028948436436, "kl": 0.269287109375, "learning_rate": 4.5033141514731786e-07, "loss": 0.0003, "num_tokens": 15595726.0, "reward": 0.5234375, "reward_std": 0.2827712595462799, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.018666666666666668, "grad_norm": 1.1075183309302854, "kl": 0.2298583984375, "learning_rate": 4.4924859218538936e-07, "loss": 0.0002, "num_tokens": 15746094.0, "reward": 0.109375, "reward_std": 0.193890780210495, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 1017.6328125, "completions/mean_terminated_length": 209.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.018844444444444445, "grad_norm": 11.380704908729829, "kl": 0.9163818359375, "learning_rate": 4.4815542446989373e-07, "loss": 0.0052, "num_tokens": 15895295.0, "reward": 0.4375, "reward_std": 0.2543674111366272, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.01902222222222222, "grad_norm": 1.4174906660969986, "kl": 0.30712890625, "learning_rate": 4.470519687568185e-07, "loss": 0.0003, "num_tokens": 16045359.0, "reward": 0.2578125, "reward_std": 0.22093652188777924, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0192, "grad_norm": 1.3569083137873426, "kl": 0.387451171875, "learning_rate": 4.4593828233629214e-07, "loss": 0.0004, "num_tokens": 16195311.0, "reward": 0.265625, "reward_std": 0.1441391110420227, "rewards/equation_reward_func/mean": 0.265625, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.01937777777777778, "grad_norm": 5.298619203617672, "kl": 1.0986328125, "learning_rate": 4.4481442302960923e-07, "loss": 0.0011, "num_tokens": 16345631.0, "reward": 0.2109375, "reward_std": 0.23677174746990204, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 1017.40625, "completions/mean_terminated_length": 180.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.019555555555555555, "grad_norm": 1.4391264750859365, "kl": 0.5289306640625, "learning_rate": 4.4368044918622893e-07, "loss": 0.0256, "num_tokens": 16494851.0, "reward": 0.2890625, "reward_std": 0.19738000631332397, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 1022.234375, "completions/mean_terminated_length": 798.0, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.019733333333333332, "grad_norm": 5.071060706207671, "kl": 0.76611328125, "learning_rate": 4.4253641968074505e-07, "loss": 0.0013, "num_tokens": 16644673.0, "reward": 0.34375, "reward_std": 0.14347384870052338, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.019911111111111112, "grad_norm": 1990.9797970578993, "kl": 77.30145263671875, "learning_rate": 4.4138239390983e-07, "loss": 0.0775, "num_tokens": 16794753.0, "reward": 0.2890625, "reward_std": 0.11493883281946182, "rewards/equation_reward_func/mean": 0.2890625, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02008888888888889, "grad_norm": 1.424762573469881, "kl": 0.574951171875, "learning_rate": 4.402184317891501e-07, "loss": 0.0006, "num_tokens": 16944817.0, "reward": 0.0546875, "reward_std": 0.12233919650316238, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 1017.765625, "completions/mean_terminated_length": 625.0, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.020266666666666665, "grad_norm": 1.2785481976820225, "kl": 0.3013916015625, "learning_rate": 4.390445937502557e-07, "loss": 0.0057, "num_tokens": 17094003.0, "reward": 0.59375, "reward_std": 0.2656276822090149, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.020444444444444446, "grad_norm": 1.1352519560488858, "kl": 0.29443359375, "learning_rate": 4.37860940737443e-07, "loss": 0.0003, "num_tokens": 17243955.0, "reward": 0.4296875, "reward_std": 0.14305339753627777, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.020622222222222222, "grad_norm": 1.4003882632845548, "kl": 0.248046875, "learning_rate": 4.3666753420459023e-07, "loss": 0.0002, "num_tokens": 17394147.0, "reward": 0.2578125, "reward_std": 0.1593368798494339, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0208, "grad_norm": 1.4557392818393817, "kl": 0.47247314453125, "learning_rate": 4.354644361119671e-07, "loss": 0.0005, "num_tokens": 17544291.0, "reward": 0.1328125, "reward_std": 0.09308473765850067, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02097777777777778, "grad_norm": 1.1098133412724638, "kl": 0.1973876953125, "learning_rate": 4.3425170892301764e-07, "loss": 0.0002, "num_tokens": 17694403.0, "reward": 0.2265625, "reward_std": 0.15843652188777924, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021155555555555556, "grad_norm": 1.3517307149758822, "kl": 0.325927734375, "learning_rate": 4.3302941560111716e-07, "loss": 0.0003, "num_tokens": 17844611.0, "reward": 0.3203125, "reward_std": 0.1692390739917755, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021333333333333333, "grad_norm": 1.461430835003995, "kl": 0.2001953125, "learning_rate": 4.3179761960630357e-07, "loss": 0.0002, "num_tokens": 17994739.0, "reward": 0.140625, "reward_std": 0.1905868649482727, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021511111111111113, "grad_norm": 1.4563651080656776, "kl": 0.222900390625, "learning_rate": 4.3055638489198236e-07, "loss": 0.0002, "num_tokens": 18144995.0, "reward": 0.1171875, "reward_std": 0.21433821320533752, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 1018.828125, "completions/mean_terminated_length": 362.0, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.02168888888888889, "grad_norm": 1.2321860488398484, "kl": 0.331298828125, "learning_rate": 4.293057759016063e-07, "loss": -0.0073, "num_tokens": 18294509.0, "reward": 0.1953125, "reward_std": 0.10629080981016159, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021866666666666666, "grad_norm": 1.127015609270169, "kl": 0.17242431640625, "learning_rate": 4.280458575653296e-07, "loss": 0.0002, "num_tokens": 18444637.0, "reward": 0.21875, "reward_std": 0.1918674111366272, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 1018.9296875, "completions/mean_terminated_length": 375.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.022044444444444443, "grad_norm": 1.2596541334667415, "kl": 0.214599609375, "learning_rate": 4.2677669529663686e-07, "loss": 0.0047, "num_tokens": 18594068.0, "reward": 0.25, "reward_std": 0.18023644387722015, "rewards/equation_reward_func/mean": 0.25, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.022222222222222223, "grad_norm": 1.0912720297780243, "kl": 0.2236328125, "learning_rate": 4.2549835498894665e-07, "loss": 0.0002, "num_tokens": 18744036.0, "reward": 0.296875, "reward_std": 0.18968652188777924, "rewards/equation_reward_func/mean": 0.296875, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0224, "grad_norm": 186.64021491680717, "kl": 13.404541015625, "learning_rate": 4.2421090301219077e-07, "loss": 0.0135, "num_tokens": 18894180.0, "reward": 0.0234375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 1022.71875, "completions/mean_terminated_length": 860.0, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.022577777777777776, "grad_norm": 0.9642473314413512, "kl": 0.2364501953125, "learning_rate": 4.229144062093679e-07, "loss": 0.0002, "num_tokens": 19043952.0, "reward": 0.3671875, "reward_std": 0.15092839300632477, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.022755555555555557, "grad_norm": 38.416968662701784, "kl": 1.39447021484375, "learning_rate": 4.216089318930741e-07, "loss": 0.0014, "num_tokens": 19194144.0, "reward": 0.1640625, "reward_std": 0.14568254351615906, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.022933333333333333, "grad_norm": 0.863398222584579, "kl": 0.19775390625, "learning_rate": 4.2029454784200675e-07, "loss": 0.0002, "num_tokens": 19344176.0, "reward": 0.140625, "reward_std": 0.15984316170215607, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02311111111111111, "grad_norm": 1.194503539250207, "kl": 0.1934814453125, "learning_rate": 4.189713222974466e-07, "loss": 0.0002, "num_tokens": 19494240.0, "reward": 0.4609375, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02328888888888889, "grad_norm": 1.7013679330285094, "kl": 0.2406005859375, "learning_rate": 4.1763932395971433e-07, "loss": 0.0002, "num_tokens": 19644288.0, "reward": 0.4375, "reward_std": 0.16703036427497864, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.023466666666666667, "grad_norm": 2.188684604832949, "kl": 0.1885986328125, "learning_rate": 4.162986219846037e-07, "loss": 0.0002, "num_tokens": 19794304.0, "reward": 0.2109375, "reward_std": 0.1692390739917755, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.023644444444444444, "grad_norm": 1.0143465949289936, "kl": 0.2154541015625, "learning_rate": 4.1494928597979117e-07, "loss": 0.0002, "num_tokens": 19944288.0, "reward": 0.09375, "reward_std": 0.09108919650316238, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.023822222222222224, "grad_norm": 0.6425274087993792, "kl": 0.320556640625, "learning_rate": 4.135913860012219e-07, "loss": 0.0003, "num_tokens": 20094384.0, "reward": 0.3671875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024, "grad_norm": 1.9119591141475343, "kl": 0.3507080078125, "learning_rate": 4.122249925494726e-07, "loss": 0.0004, "num_tokens": 20244720.0, "reward": 0.1796875, "reward_std": 0.1626407653093338, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024177777777777777, "grad_norm": 1.4598963078162495, "kl": 0.2135009765625, "learning_rate": 4.10850176566091e-07, "loss": 0.0002, "num_tokens": 20394784.0, "reward": 0.1953125, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024355555555555554, "grad_norm": 1.5157144794364117, "kl": 0.269775390625, "learning_rate": 4.094670094299131e-07, "loss": 0.0003, "num_tokens": 20544864.0, "reward": 0.2734375, "reward_std": 0.14943468570709229, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024533333333333334, "grad_norm": 1.80581776572753, "kl": 0.3927001953125, "learning_rate": 4.080755629533566e-07, "loss": 0.0004, "num_tokens": 20694800.0, "reward": 0.234375, "reward_std": 0.14347384870052338, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02471111111111111, "grad_norm": 1.5732331949510052, "kl": 0.278564453125, "learning_rate": 4.066759093786931e-07, "loss": 0.0003, "num_tokens": 20844976.0, "reward": 0.28125, "reward_std": 0.15292394161224365, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024888888888888887, "grad_norm": 7.517382799927848, "kl": 0.92236328125, "learning_rate": 4.052681213742971e-07, "loss": 0.0009, "num_tokens": 20995008.0, "reward": 0.3359375, "reward_std": 0.059839196503162384, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.025066666666666668, "grad_norm": 6.302481912783854, "kl": 0.6650390625, "learning_rate": 4.038522720308732e-07, "loss": 0.0007, "num_tokens": 21144944.0, "reward": 0.59375, "reward_std": 0.3047843277454376, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.025244444444444444, "grad_norm": 0.9504144713656012, "kl": 0.2236328125, "learning_rate": 4.024284348576611e-07, "loss": 0.0002, "num_tokens": 21295024.0, "reward": 0.234375, "reward_std": 0.11022830754518509, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02542222222222222, "grad_norm": 2.404596200646812, "kl": 0.296875, "learning_rate": 4.009966837786194e-07, "loss": 0.0003, "num_tokens": 21445200.0, "reward": 0.40625, "reward_std": 0.21497184038162231, "rewards/equation_reward_func/mean": 0.40625, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0256, "grad_norm": 5.040516222986242, "kl": 0.3662109375, "learning_rate": 3.9955709312858744e-07, "loss": 0.0004, "num_tokens": 21595232.0, "reward": 0.3359375, "reward_std": 0.305073618888855, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.025777777777777778, "grad_norm": 1.3636920929106318, "kl": 0.1595458984375, "learning_rate": 3.981097376494259e-07, "loss": 0.0002, "num_tokens": 21745456.0, "reward": 0.1171875, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.025955555555555555, "grad_norm": 2.3554253327540406, "kl": 0.28057861328125, "learning_rate": 3.9665469248613616e-07, "loss": 0.0003, "num_tokens": 21895504.0, "reward": 0.328125, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.026133333333333335, "grad_norm": 25.676220939432554, "kl": 3.357177734375, "learning_rate": 3.951920331829592e-07, "loss": 0.0034, "num_tokens": 22045408.0, "reward": 0.6171875, "reward_std": 0.29959267377853394, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02631111111111111, "grad_norm": 3.0126984776908015, "kl": 0.3270263671875, "learning_rate": 3.9372183567945314e-07, "loss": 0.0003, "num_tokens": 22195616.0, "reward": 0.3359375, "reward_std": 0.28322336077690125, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.026488888888888888, "grad_norm": 2.423043662017283, "kl": 0.314453125, "learning_rate": 3.922441763065506e-07, "loss": 0.0003, "num_tokens": 22345712.0, "reward": 0.4296875, "reward_std": 0.14305339753627777, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02666666666666667, "grad_norm": 1.6542652076099005, "kl": 0.410400390625, "learning_rate": 3.907591317825956e-07, "loss": 0.0004, "num_tokens": 22495632.0, "reward": 0.6015625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.026844444444444445, "grad_norm": 3.2511997715794356, "kl": 0.3416748046875, "learning_rate": 3.8926677920936093e-07, "loss": 0.0003, "num_tokens": 22645536.0, "reward": 0.34375, "reward_std": 0.1632782220840454, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 1015.6484375, "completions/mean_terminated_length": 489.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.027022222222222222, "grad_norm": 41.47803462143974, "kl": 4.93310546875, "learning_rate": 3.877671960680443e-07, "loss": 0.0049, "num_tokens": 22794451.0, "reward": 0.75, "reward_std": 0.2080235779285431, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0272, "grad_norm": 5.72350697663499, "kl": 0.4898681640625, "learning_rate": 3.862604602152464e-07, "loss": 0.0005, "num_tokens": 22944547.0, "reward": 0.34375, "reward_std": 0.18683473765850067, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02737777777777778, "grad_norm": 1.7409232696079044, "kl": 0.4661865234375, "learning_rate": 3.847466498789282e-07, "loss": 0.0005, "num_tokens": 23094803.0, "reward": 0.296875, "reward_std": 0.09859732538461685, "rewards/equation_reward_func/mean": 0.296875, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 1019.9609375, "completions/mean_terminated_length": 507.0, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.027555555555555555, "grad_norm": 3.019847280300027, "kl": 0.43408203125, "learning_rate": 3.8322584365434934e-07, "loss": 0.0025, "num_tokens": 23244286.0, "reward": 0.6171875, "reward_std": 0.2378891110420227, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 1019.6484375, "completions/mean_terminated_length": 467.0, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.027733333333333332, "grad_norm": 1.9588160003486783, "kl": 0.366455078125, "learning_rate": 3.816981204999882e-07, "loss": 0.0027, "num_tokens": 23393825.0, "reward": 0.28125, "reward_std": 0.19628483057022095, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 1022.6171875, "completions/mean_terminated_length": 847.0, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.027911111111111112, "grad_norm": 70.32680624580377, "kl": 2.532958984375, "learning_rate": 3.8016355973344173e-07, "loss": 0.0035, "num_tokens": 23543840.0, "reward": 0.390625, "reward_std": 0.16314704716205597, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.02808888888888889, "grad_norm": 1.762714814243919, "kl": 0.2900390625, "learning_rate": 3.786222410273078e-07, "loss": 0.0003, "num_tokens": 23693824.0, "reward": 0.28125, "reward_std": 0.18090170621871948, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 1002.359375, "completions/mean_terminated_length": 470.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.028266666666666666, "grad_norm": 2.3680834942823696, "kl": 0.47607421875, "learning_rate": 3.7707424440504863e-07, "loss": 0.0025, "num_tokens": 23841022.0, "reward": 0.59375, "reward_std": 0.10519562661647797, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 1016.7421875, "completions/mean_terminated_length": 714.3333740234375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.028444444444444446, "grad_norm": 10.569380085825292, "kl": 0.9423828125, "learning_rate": 3.755196502368361e-07, "loss": 0.0134, "num_tokens": 23990317.0, "reward": 0.4140625, "reward_std": 0.325918972492218, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 1018.4453125, "completions/mean_terminated_length": 313.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.028622222222222223, "grad_norm": 7.343007133608892, "kl": 0.364013671875, "learning_rate": 3.739585392353787e-07, "loss": 0.0025, "num_tokens": 24139590.0, "reward": 0.53125, "reward_std": 0.3504300117492676, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0288, "grad_norm": 6.38266189430588, "kl": 0.55029296875, "learning_rate": 3.723909924517314e-07, "loss": 0.0006, "num_tokens": 24289718.0, "reward": 0.5, "reward_std": 0.27824485301971436, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 1020.484375, "completions/mean_terminated_length": 574.0, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.02897777777777778, "grad_norm": 4.075521996393651, "kl": 0.218994140625, "learning_rate": 3.7081709127108767e-07, "loss": 0.0015, "num_tokens": 24439412.0, "reward": 0.2265625, "reward_std": 0.14898642897605896, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 1019.1796875, "completions/mean_terminated_length": 407.0, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.029155555555555556, "grad_norm": 4.233886008548603, "kl": 0.364990234375, "learning_rate": 3.692369174085534e-07, "loss": 0.0035, "num_tokens": 24588731.0, "reward": 0.5, "reward_std": 0.24401313066482544, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 1018.046875, "completions/mean_terminated_length": 643.0, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.029333333333333333, "grad_norm": 2.155597635859763, "kl": 0.3546142578125, "learning_rate": 3.6765055290490513e-07, "loss": 0.004, "num_tokens": 24738177.0, "reward": 0.25, "reward_std": 0.18217839300632477, "rewards/equation_reward_func/mean": 0.25, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 1018.25, "completions/mean_terminated_length": 288.0, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.02951111111111111, "grad_norm": 6.891989139232411, "kl": 1.4227294921875, "learning_rate": 3.6605808012233004e-07, "loss": 0.0045, "num_tokens": 24887553.0, "reward": 0.546875, "reward_std": 0.20048905909061432, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 1019.8828125, "completions/mean_terminated_length": 497.0, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.02968888888888889, "grad_norm": 41.363465483539514, "kl": 1.5928955078125, "learning_rate": 3.644595817401501e-07, "loss": 0.0032, "num_tokens": 25036994.0, "reward": 0.4375, "reward_std": 0.22293205559253693, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 1022.828125, "completions/mean_terminated_length": 874.0, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.029866666666666666, "grad_norm": 3.7418741027210762, "kl": 0.2694091796875, "learning_rate": 3.628551407505292e-07, "loss": 0.0014, "num_tokens": 25187020.0, "reward": 0.375, "reward_std": 0.275077760219574, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.030044444444444443, "grad_norm": 5.2290660806403, "kl": 0.6962890625, "learning_rate": 3.6124484045416483e-07, "loss": 0.0007, "num_tokens": 25337036.0, "reward": 0.625, "reward_std": 0.2352283000946045, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.030222222222222223, "grad_norm": 3.9593952621251427, "kl": 0.405517578125, "learning_rate": 3.5962876445596224e-07, "loss": 0.0004, "num_tokens": 25487164.0, "reward": 0.3984375, "reward_std": 0.14965170621871948, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 1020.3671875, "completions/mean_terminated_length": 559.0, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.0304, "grad_norm": 5.570927870883592, "kl": 0.3348388671875, "learning_rate": 3.580069966606949e-07, "loss": 0.0015, "num_tokens": 25636747.0, "reward": 0.375, "reward_std": 0.2257782220840454, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.030577777777777777, "grad_norm": 6.614750876845712, "kl": 0.266357421875, "learning_rate": 3.563796212686475e-07, "loss": 0.0003, "num_tokens": 25787003.0, "reward": 0.1484375, "reward_std": 0.10253482311964035, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.030755555555555557, "grad_norm": 5.494588653288782, "kl": 0.4801025390625, "learning_rate": 3.547467227712444e-07, "loss": 0.0005, "num_tokens": 25936939.0, "reward": 0.390625, "reward_std": 0.1535891890525818, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 1010.109375, "completions/mean_terminated_length": 431.3333435058594, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.030933333333333334, "grad_norm": 6.616129190159188, "kl": 0.5126953125, "learning_rate": 3.531083859466635e-07, "loss": 0.0133, "num_tokens": 26085337.0, "reward": 0.390625, "reward_std": 0.2708735466003418, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 1019.3515625, "completions/mean_terminated_length": 429.0, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.03111111111111111, "grad_norm": 1.8393638599574536, "kl": 0.339599609375, "learning_rate": 3.5146469585543386e-07, "loss": 0.0028, "num_tokens": 26234726.0, "reward": 0.359375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03128888888888889, "grad_norm": 2.296402784651875, "kl": 0.36328125, "learning_rate": 3.498157378360204e-07, "loss": 0.0004, "num_tokens": 26384934.0, "reward": 0.2265625, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 1017.4296875, "completions/mean_terminated_length": 183.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.031466666666666664, "grad_norm": 4.3839867477547205, "kl": 0.412353515625, "learning_rate": 3.481615975003922e-07, "loss": 0.004, "num_tokens": 26534109.0, "reward": 0.4609375, "reward_std": 0.2876407504081726, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03164444444444445, "grad_norm": 1.5247224488456845, "kl": 0.25732421875, "learning_rate": 3.465023607295784e-07, "loss": 0.0003, "num_tokens": 26684189.0, "reward": 0.359375, "reward_std": 0.16703036427497864, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.031822222222222224, "grad_norm": 1.1753507818139157, "kl": 0.28125, "learning_rate": 3.448381136692089e-07, "loss": 0.0003, "num_tokens": 26834269.0, "reward": 0.40625, "reward_std": 0.1379890739917755, "rewards/equation_reward_func/mean": 0.40625, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 1019.234375, "completions/mean_terminated_length": 414.0, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.032, "grad_norm": 2.060202552181063, "kl": 0.2965087890625, "learning_rate": 3.4316894272504225e-07, "loss": 0.0003, "num_tokens": 26983755.0, "reward": 0.3203125, "reward_std": 0.21763262152671814, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 1020.1953125, "completions/mean_terminated_length": 537.0, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.03217777777777778, "grad_norm": 1.380214768658021, "kl": 0.2884521484375, "learning_rate": 3.4149493455847897e-07, "loss": 0.0003, "num_tokens": 27133236.0, "reward": 0.3515625, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.032355555555555554, "grad_norm": 3.061067762640794, "kl": 0.44189453125, "learning_rate": 3.398161760820628e-07, "loss": 0.0004, "num_tokens": 27283300.0, "reward": 0.3515625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03253333333333333, "grad_norm": 0.7026944494895592, "kl": 0.2001953125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0002, "num_tokens": 27433508.0, "reward": 0.1015625, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.032711111111111114, "grad_norm": 0.8347836479214163, "kl": 0.2882080078125, "learning_rate": 3.364447570784731e-07, "loss": 0.0003, "num_tokens": 27583620.0, "reward": 0.3125, "reward_std": 0.06454972177743912, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03288888888888889, "grad_norm": 2.0443715202954325, "kl": 0.360595703125, "learning_rate": 3.347522715914262e-07, "loss": 0.0004, "num_tokens": 27733684.0, "reward": 0.6953125, "reward_std": 0.20818254351615906, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 1013.9765625, "completions/mean_terminated_length": 382.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.03306666666666667, "grad_norm": 3.2542212448513155, "kl": 0.65771484375, "learning_rate": 3.3305538586569116e-07, "loss": -0.0143, "num_tokens": 27882385.0, "reward": 0.6796875, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.033244444444444445, "grad_norm": 1.9917457589553744, "kl": 0.28857421875, "learning_rate": 3.313541880015877e-07, "loss": 0.0003, "num_tokens": 28032529.0, "reward": 0.328125, "reward_std": 0.22359731793403625, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 1017.9140625, "completions/mean_terminated_length": 245.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.03342222222222222, "grad_norm": 1.6267494753843437, "kl": 0.2791748046875, "learning_rate": 3.296487663233168e-07, "loss": -0.0227, "num_tokens": 28181862.0, "reward": 0.484375, "reward_std": 0.23898044228553772, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0336, "grad_norm": 1.60685551807595, "kl": 0.2919921875, "learning_rate": 3.279392093743747e-07, "loss": 0.0003, "num_tokens": 28331926.0, "reward": 0.5390625, "reward_std": 0.20033007860183716, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.033777777777777775, "grad_norm": 7.456434965548244, "kl": 0.4049072265625, "learning_rate": 3.2622560591295606e-07, "loss": 0.0004, "num_tokens": 28482006.0, "reward": 0.34375, "reward_std": 0.11840169876813889, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03395555555555556, "grad_norm": 3.390889359449165, "kl": 0.39111328125, "learning_rate": 3.245080449073459e-07, "loss": 0.0004, "num_tokens": 28632118.0, "reward": 0.21875, "reward_std": 0.1441391110420227, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 1020.8984375, "completions/mean_terminated_length": 627.0, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.034133333333333335, "grad_norm": 1.7344229671493958, "kl": 0.315185546875, "learning_rate": 3.227866155313002e-07, "loss": 0.0003, "num_tokens": 28781705.0, "reward": 0.484375, "reward_std": 0.18968652188777924, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03431111111111111, "grad_norm": 2.0144481228983855, "kl": 0.3726806640625, "learning_rate": 3.210614071594162e-07, "loss": 0.0004, "num_tokens": 28931689.0, "reward": 0.78125, "reward_std": 0.21917606890201569, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03448888888888889, "grad_norm": 4.074153411108236, "kl": 0.283203125, "learning_rate": 3.1933250936249213e-07, "loss": 0.0003, "num_tokens": 29081913.0, "reward": 0.421875, "reward_std": 0.3169713318347931, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 1017.890625, "completions/mean_terminated_length": 242.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.034666666666666665, "grad_norm": 0.6166688550318619, "kl": 0.274169921875, "learning_rate": 3.1760001190287695e-07, "loss": 0.0003, "num_tokens": 29231275.0, "reward": 0.140625, "reward_std": 0.042695626616477966, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 1019.0390625, "completions/mean_terminated_length": 706.5, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.03484444444444444, "grad_norm": 9.396209107038173, "kl": 2.702880859375, "learning_rate": 3.158640047298098e-07, "loss": 0.0038, "num_tokens": 29380640.0, "reward": 0.7578125, "reward_std": 0.34902724623680115, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.035022222222222225, "grad_norm": 1.165871304920879, "kl": 0.392822265625, "learning_rate": 3.141245779747502e-07, "loss": 0.0004, "num_tokens": 29530640.0, "reward": 0.4609375, "reward_std": 0.08715169876813889, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 1013.7734375, "completions/mean_terminated_length": 369.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0352, "grad_norm": 2.149417623121469, "kl": 0.3369140625, "learning_rate": 3.123818219466981e-07, "loss": 0.0022, "num_tokens": 29679459.0, "reward": 0.328125, "reward_std": 0.09108919650316238, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 1019.8515625, "completions/mean_terminated_length": 493.0, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.03537777777777778, "grad_norm": 2.2329541959665464, "kl": 0.4510498046875, "learning_rate": 3.106358271275056e-07, "loss": 0.0031, "num_tokens": 29829008.0, "reward": 0.3046875, "reward_std": 0.1414783000946045, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.035555555555555556, "grad_norm": 4.198704815789355, "kl": 0.3831787109375, "learning_rate": 3.088866841671789e-07, "loss": 0.0004, "num_tokens": 29979024.0, "reward": 0.609375, "reward_std": 0.18954971432685852, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03573333333333333, "grad_norm": 2.975524150097201, "kl": 0.3837890625, "learning_rate": 3.0713448387917227e-07, "loss": 0.0004, "num_tokens": 30129040.0, "reward": 0.4609375, "reward_std": 0.23677174746990204, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03591111111111111, "grad_norm": 7.445478939481905, "kl": 0.9071044921875, "learning_rate": 3.0537931723567253e-07, "loss": 0.0009, "num_tokens": 30279296.0, "reward": 0.1796875, "reward_std": 0.10629080981016159, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 1023.3359375, "completions/mean_terminated_length": 939.0, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.036088888888888886, "grad_norm": 1.618843102702456, "kl": 0.349365234375, "learning_rate": 3.0362127536287636e-07, "loss": 0.0006, "num_tokens": 30429195.0, "reward": 0.3515625, "reward_std": 0.050389111042022705, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 1012.8046875, "completions/mean_terminated_length": 307.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.03626666666666667, "grad_norm": 2.199240034282271, "kl": 0.4091796875, "learning_rate": 3.01860449536259e-07, "loss": 0.003, "num_tokens": 30577778.0, "reward": 0.3203125, "reward_std": 0.13578036427497864, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.036444444444444446, "grad_norm": 3.3885059259037162, "kl": 0.39569091796875, "learning_rate": 3.0009693117583523e-07, "loss": 0.0004, "num_tokens": 30727874.0, "reward": 0.1796875, "reward_std": 0.16503483057022095, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 1017.9296875, "completions/mean_terminated_length": 247.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.03662222222222222, "grad_norm": 2.3069232873693832, "kl": 0.472900390625, "learning_rate": 2.983308118414131e-07, "loss": 0.0005, "num_tokens": 30877065.0, "reward": 0.484375, "reward_std": 0.2130298763513565, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 1021.8046875, "completions/mean_terminated_length": 743.0, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.0368, "grad_norm": 1.864816223816505, "kl": 0.3232421875, "learning_rate": 2.965621832278401e-07, "loss": 0.0003, "num_tokens": 31026896.0, "reward": 0.09375, "reward_std": 0.16974535584449768, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.036977777777777776, "grad_norm": 1.9107299505972535, "kl": 0.291015625, "learning_rate": 2.9479113716024275e-07, "loss": 0.0003, "num_tokens": 31177200.0, "reward": 0.1484375, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03715555555555555, "grad_norm": 2.500263990793289, "kl": 0.53759765625, "learning_rate": 2.9301776558925875e-07, "loss": 0.0005, "num_tokens": 31327312.0, "reward": 0.2578125, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.037333333333333336, "grad_norm": 3.2674209857026884, "kl": 0.5302734375, "learning_rate": 2.912421605862632e-07, "loss": 0.0005, "num_tokens": 31477152.0, "reward": 0.3671875, "reward_std": 0.19828036427497864, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03751111111111111, "grad_norm": 1.5340299901808074, "kl": 0.5916748046875, "learning_rate": 2.894644143385885e-07, "loss": 0.0006, "num_tokens": 31627232.0, "reward": 0.609375, "reward_std": 0.10519562661647797, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03768888888888889, "grad_norm": 1.9087572319482577, "kl": 0.555908203125, "learning_rate": 2.8768461914473794e-07, "loss": 0.0006, "num_tokens": 31777360.0, "reward": 0.4375, "reward_std": 0.13378483057022095, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.037866666666666667, "grad_norm": 1.7348358531912607, "kl": 0.4742431640625, "learning_rate": 2.859028674095937e-07, "loss": 0.0005, "num_tokens": 31927536.0, "reward": 0.4140625, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 1018.8828125, "completions/mean_terminated_length": 369.0, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.03804444444444444, "grad_norm": 0.7051640590161659, "kl": 0.398681640625, "learning_rate": 2.8411925163961926e-07, "loss": 0.0004, "num_tokens": 32076913.0, "reward": 0.2421875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 1019.5234375, "completions/mean_terminated_length": 737.5, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 0.03822222222222222, "grad_norm": 3.022899144168358, "kl": 0.465576171875, "learning_rate": 2.823338644380566e-07, "loss": 0.0005, "num_tokens": 32226548.0, "reward": 0.2421875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 1011.796875, "completions/mean_terminated_length": 243.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.0384, "grad_norm": 1.8759397506272986, "kl": 0.584228515625, "learning_rate": 2.8054679850011825e-07, "loss": 0.0054, "num_tokens": 32375098.0, "reward": 0.4765625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03857777777777778, "grad_norm": 4.036385771679356, "kl": 0.4168701171875, "learning_rate": 2.7875814660817504e-07, "loss": 0.0004, "num_tokens": 32525258.0, "reward": 0.3125, "reward_std": 0.26517558097839355, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03875555555555556, "grad_norm": 1.2765088858212177, "kl": 0.317626953125, "learning_rate": 2.769680016269385e-07, "loss": 0.0003, "num_tokens": 32675322.0, "reward": 0.15625, "reward_std": 0.055901698768138885, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.038933333333333334, "grad_norm": 3.394281977185132, "kl": 0.818603515625, "learning_rate": 2.751764564986396e-07, "loss": 0.0008, "num_tokens": 32825322.0, "reward": 0.28125, "reward_std": 0.13378483057022095, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03911111111111111, "grad_norm": 5.212701863988515, "kl": 0.48974609375, "learning_rate": 2.7338360423820327e-07, "loss": 0.0005, "num_tokens": 32975498.0, "reward": 0.34375, "reward_std": 0.24478617310523987, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03928888888888889, "grad_norm": 31.290075554961753, "kl": 3.33349609375, "learning_rate": 2.715895379284194e-07, "loss": 0.0033, "num_tokens": 33125594.0, "reward": 0.5390625, "reward_std": 0.16923905909061432, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.039466666666666664, "grad_norm": 4674.344562884797, "kl": 183.544677734375, "learning_rate": 2.6979435071510956e-07, "loss": 0.1834, "num_tokens": 33275738.0, "reward": 0.34375, "reward_std": 0.17558008432388306, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 1020.0859375, "completions/mean_terminated_length": 523.0, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.03964444444444445, "grad_norm": 473.9963863860674, "kl": 22.975341796875, "learning_rate": 2.6799813580229174e-07, "loss": 0.023, "num_tokens": 33425461.0, "reward": 0.34375, "reward_std": 0.23808008432388306, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.039822222222222224, "grad_norm": 53.305771598131855, "kl": 5.0078125, "learning_rate": 2.662009864473406e-07, "loss": 0.005, "num_tokens": 33575669.0, "reward": 0.3515625, "reward_std": 0.2863866090774536, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04, "grad_norm": 50.586708672159034, "kl": 2.627197265625, "learning_rate": 2.6440299595614606e-07, "loss": 0.0026, "num_tokens": 33725621.0, "reward": 0.328125, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04017777777777778, "grad_norm": 344.6473091499346, "kl": 17.0888671875, "learning_rate": 2.626042576782687e-07, "loss": 0.0171, "num_tokens": 33875781.0, "reward": 0.5234375, "reward_std": 0.2251407504081726, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 1020.8359375, "completions/mean_terminated_length": 619.0, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.040355555555555554, "grad_norm": 1449.3437514738944, "kl": 33.110595703125, "learning_rate": 2.6080486500209347e-07, "loss": 0.0445, "num_tokens": 34025568.0, "reward": 0.1015625, "reward_std": 0.08715169876813889, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04053333333333333, "grad_norm": 5217.154181582886, "kl": 283.72021484375, "learning_rate": 2.590049113499809e-07, "loss": 0.2838, "num_tokens": 34175712.0, "reward": 0.5, "reward_std": 0.22782793641090393, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.040711111111111115, "grad_norm": 4.054158607237894, "kl": 0.3785400390625, "learning_rate": 2.572044901734166e-07, "loss": 0.0004, "num_tokens": 34325936.0, "reward": 0.0078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04088888888888889, "grad_norm": 30.52243180963527, "kl": 3.31103515625, "learning_rate": 2.5540369494815966e-07, "loss": 0.0033, "num_tokens": 34476112.0, "reward": 0.3515625, "reward_std": 0.17006750404834747, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04106666666666667, "grad_norm": 14.183864563983192, "kl": 1.187744140625, "learning_rate": 2.536026191693893e-07, "loss": 0.0012, "num_tokens": 34626192.0, "reward": 0.234375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.041244444444444445, "grad_norm": 3390.5316359967046, "kl": 76.3818359375, "learning_rate": 2.5180135634685064e-07, "loss": 0.0764, "num_tokens": 34776224.0, "reward": 0.484375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04142222222222222, "grad_norm": 41.78839487675275, "kl": 3.020751953125, "learning_rate": 2.5e-07, "loss": 0.003, "num_tokens": 34926144.0, "reward": 0.578125, "reward_std": 0.14789125323295593, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0416, "grad_norm": 11.71823133458197, "kl": 0.776123046875, "learning_rate": 2.4819864365314934e-07, "loss": 0.0008, "num_tokens": 35076432.0, "reward": 0.4140625, "reward_std": 0.1606174111366272, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.041777777777777775, "grad_norm": 9.47344162450626, "kl": 1.062255859375, "learning_rate": 2.4639738083061073e-07, "loss": 0.0011, "num_tokens": 35226624.0, "reward": 0.375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04195555555555556, "grad_norm": 2079.04795786749, "kl": 46.068603515625, "learning_rate": 2.445963050518403e-07, "loss": 0.046, "num_tokens": 35376736.0, "reward": 0.3828125, "reward_std": 0.24052390456199646, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.042133333333333335, "grad_norm": 6.17214741900066, "kl": 0.896484375, "learning_rate": 2.4279550982658345e-07, "loss": 0.0009, "num_tokens": 35526944.0, "reward": 0.46875, "reward_std": 0.0816391110420227, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04231111111111111, "grad_norm": 36.079398694046525, "kl": 3.13671875, "learning_rate": 2.4099508865001914e-07, "loss": 0.0031, "num_tokens": 35676912.0, "reward": 0.2265625, "reward_std": 0.12984731793403625, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04248888888888889, "grad_norm": 4.685074390848359, "kl": 0.38671875, "learning_rate": 2.3919513499790646e-07, "loss": 0.0004, "num_tokens": 35827040.0, "reward": 0.15625, "reward_std": 0.2181389182806015, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.042666666666666665, "grad_norm": 12.950823812050304, "kl": 0.58740234375, "learning_rate": 2.3739574232173134e-07, "loss": 0.0006, "num_tokens": 35976992.0, "reward": 0.75, "reward_std": 0.1632782220840454, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04284444444444444, "grad_norm": 4.145336992759783, "kl": 0.459716796875, "learning_rate": 2.3559700404385394e-07, "loss": 0.0005, "num_tokens": 36127072.0, "reward": 0.6328125, "reward_std": 0.18904343247413635, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.043022222222222226, "grad_norm": 5.352878969306266, "kl": 0.37353515625, "learning_rate": 2.3379901355265936e-07, "loss": 0.0004, "num_tokens": 36277216.0, "reward": 0.46875, "reward_std": 0.3014804422855377, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0432, "grad_norm": 11.572106210290512, "kl": 0.852783203125, "learning_rate": 2.3200186419770823e-07, "loss": 0.0009, "num_tokens": 36427296.0, "reward": 0.40625, "reward_std": 0.15119513869285583, "rewards/equation_reward_func/mean": 0.40625, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04337777777777778, "grad_norm": 4.190365627005719, "kl": 0.4508056640625, "learning_rate": 2.3020564928489041e-07, "loss": 0.0005, "num_tokens": 36577232.0, "reward": 0.5234375, "reward_std": 0.15763446688652039, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.043555555555555556, "grad_norm": 3.331218853761097, "kl": 0.28326416015625, "learning_rate": 2.284104620715807e-07, "loss": 0.0003, "num_tokens": 36727472.0, "reward": 0.1171875, "reward_std": 0.10629080981016159, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04373333333333333, "grad_norm": 2.385943253578034, "kl": 0.6612548828125, "learning_rate": 2.2661639576179676e-07, "loss": 0.0007, "num_tokens": 36877408.0, "reward": 0.5, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04391111111111111, "grad_norm": 15.034805494562207, "kl": 0.428466796875, "learning_rate": 2.2482354350136043e-07, "loss": 0.0004, "num_tokens": 37027392.0, "reward": 0.5859375, "reward_std": 0.17914125323295593, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.044088888888888886, "grad_norm": 6.861399433276675, "kl": 0.35986328125, "learning_rate": 2.2303199837306153e-07, "loss": 0.0004, "num_tokens": 37177584.0, "reward": 0.3671875, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 1020.9765625, "completions/mean_terminated_length": 637.0, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.04426666666666667, "grad_norm": 6.287042455899018, "kl": 0.4658203125, "learning_rate": 2.2124185339182496e-07, "loss": 0.0005, "num_tokens": 37327133.0, "reward": 0.5390625, "reward_std": 0.21523858606815338, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.044444444444444446, "grad_norm": 6.091118268761547, "kl": 0.41015625, "learning_rate": 2.194532014998817e-07, "loss": 0.0004, "num_tokens": 37477181.0, "reward": 0.4921875, "reward_std": 0.2343776822090149, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04462222222222222, "grad_norm": 4.683666190865392, "kl": 0.429931640625, "learning_rate": 2.1766613556194344e-07, "loss": 0.0004, "num_tokens": 37627133.0, "reward": 0.59375, "reward_std": 0.21073855459690094, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0448, "grad_norm": 3.841241350263643, "kl": 0.446044921875, "learning_rate": 2.1588074836038071e-07, "loss": 0.0004, "num_tokens": 37777245.0, "reward": 0.3828125, "reward_std": 0.14523044228553772, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.044977777777777776, "grad_norm": 3.032621222295675, "kl": 0.295654296875, "learning_rate": 2.1409713259040628e-07, "loss": 0.0003, "num_tokens": 37927421.0, "reward": 0.359375, "reward_std": 0.17078250646591187, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04515555555555555, "grad_norm": 4.827930222268017, "kl": 0.2357177734375, "learning_rate": 2.1231538085526204e-07, "loss": 0.0002, "num_tokens": 38077597.0, "reward": 0.2109375, "reward_std": 0.12233919650316238, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04533333333333334, "grad_norm": 8.327808498826666, "kl": 0.400390625, "learning_rate": 2.105355856614115e-07, "loss": 0.0004, "num_tokens": 38227437.0, "reward": 0.5234375, "reward_std": 0.30415070056915283, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04551111111111111, "grad_norm": 10.113752378347993, "kl": 0.2716064453125, "learning_rate": 2.0875783941373686e-07, "loss": 0.0003, "num_tokens": 38377661.0, "reward": 0.3046875, "reward_std": 0.22596919536590576, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04568888888888889, "grad_norm": 3.3738498952188727, "kl": 0.2901611328125, "learning_rate": 2.069822344107413e-07, "loss": 0.0003, "num_tokens": 38527853.0, "reward": 0.265625, "reward_std": 0.24821737408638, "rewards/equation_reward_func/mean": 0.265625, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04586666666666667, "grad_norm": 1.8465749053787883, "kl": 0.2725830078125, "learning_rate": 2.052088628397572e-07, "loss": 0.0003, "num_tokens": 38677997.0, "reward": 0.0546875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04604444444444444, "grad_norm": 3.863021946489731, "kl": 0.44189453125, "learning_rate": 2.034378167721599e-07, "loss": 0.0004, "num_tokens": 38827981.0, "reward": 0.7421875, "reward_std": 0.2392735630273819, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04622222222222222, "grad_norm": 3.2808299300550545, "kl": 0.379638671875, "learning_rate": 2.0166918815858688e-07, "loss": 0.0004, "num_tokens": 38978013.0, "reward": 0.3828125, "reward_std": 0.24164125323295593, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0464, "grad_norm": 0.10501099254266726, "kl": 0.324462890625, "learning_rate": 1.9990306882416485e-07, "loss": 0.0003, "num_tokens": 39128205.0, "reward": 0.375, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04657777777777778, "grad_norm": 1.371472062025808, "kl": 0.2908935546875, "learning_rate": 1.9813955046374102e-07, "loss": 0.0003, "num_tokens": 39278317.0, "reward": 0.1484375, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04675555555555556, "grad_norm": 4869.125445284393, "kl": 268.3577880859375, "learning_rate": 1.9637872463712362e-07, "loss": 0.268, "num_tokens": 39428365.0, "reward": 0.21875, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.046933333333333334, "grad_norm": 2.1065019813436483, "kl": 0.33203125, "learning_rate": 1.946206827643275e-07, "loss": 0.0003, "num_tokens": 39578493.0, "reward": 0.328125, "reward_std": 0.11840169876813889, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04711111111111111, "grad_norm": 1.4385775563933836, "kl": 0.2474365234375, "learning_rate": 1.9286551612082773e-07, "loss": 0.0002, "num_tokens": 39728653.0, "reward": 0.21875, "reward_std": 0.08539125323295593, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04728888888888889, "grad_norm": 3.941556286070112, "kl": 0.5374755859375, "learning_rate": 1.9111331583282103e-07, "loss": 0.0005, "num_tokens": 39878813.0, "reward": 0.21875, "reward_std": 0.18090170621871948, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.047466666666666664, "grad_norm": 2.235477587914323, "kl": 0.32373046875, "learning_rate": 1.8936417287249446e-07, "loss": 0.0003, "num_tokens": 40028973.0, "reward": 0.3515625, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04764444444444445, "grad_norm": 1.1279570923659827, "kl": 0.349609375, "learning_rate": 1.8761817805330195e-07, "loss": 0.0003, "num_tokens": 40178989.0, "reward": 0.375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.047822222222222224, "grad_norm": 4.02478672160254, "kl": 0.360107421875, "learning_rate": 1.8587542202524985e-07, "loss": 0.0004, "num_tokens": 40328989.0, "reward": 0.1875, "reward_std": 0.17430339753627777, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.048, "grad_norm": 9.400419627836385, "kl": 0.50830078125, "learning_rate": 1.8413599527019018e-07, "loss": 0.0005, "num_tokens": 40479021.0, "reward": 0.6640625, "reward_std": 0.24494513869285583, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04817777777777778, "grad_norm": 2.518448261513236, "kl": 0.3475341796875, "learning_rate": 1.82399988097123e-07, "loss": 0.0003, "num_tokens": 40628957.0, "reward": 0.3203125, "reward_std": 0.12654343247413635, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.048355555555555554, "grad_norm": 5.453656835742769, "kl": 0.3681640625, "learning_rate": 1.806674906375079e-07, "loss": 0.0004, "num_tokens": 40779053.0, "reward": 0.2734375, "reward_std": 0.1692390739917755, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04853333333333333, "grad_norm": 7.051391983308751, "kl": 0.40673828125, "learning_rate": 1.7893859284058378e-07, "loss": 0.0004, "num_tokens": 40929197.0, "reward": 0.3203125, "reward_std": 0.1692390739917755, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04871111111111111, "grad_norm": 11.947446546421647, "kl": 0.890380859375, "learning_rate": 1.7721338446869976e-07, "loss": 0.0009, "num_tokens": 41079293.0, "reward": 0.4765625, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04888888888888889, "grad_norm": 2.4193510492073207, "kl": 0.646240234375, "learning_rate": 1.7549195509265407e-07, "loss": 0.0006, "num_tokens": 41229389.0, "reward": 0.359375, "reward_std": 0.042695626616477966, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04906666666666667, "grad_norm": 7.921046078529515, "kl": 0.434326171875, "learning_rate": 1.7377439408704392e-07, "loss": 0.0004, "num_tokens": 41379613.0, "reward": 0.1953125, "reward_std": 0.14568254351615906, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.049244444444444445, "grad_norm": 4.36823927242396, "kl": 0.33447265625, "learning_rate": 1.7206079062562536e-07, "loss": 0.0003, "num_tokens": 41529917.0, "reward": 0.140625, "reward_std": 0.1280868798494339, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04942222222222222, "grad_norm": 17.613787526523513, "kl": 2.544677734375, "learning_rate": 1.7035123367668323e-07, "loss": 0.0025, "num_tokens": 41679949.0, "reward": 0.3671875, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0496, "grad_norm": 4.673033850790583, "kl": 0.885986328125, "learning_rate": 1.6864581199841226e-07, "loss": 0.0009, "num_tokens": 41830109.0, "reward": 0.5859375, "reward_std": 0.1128891110420227, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.049777777777777775, "grad_norm": 4.930460655029198, "kl": 0.5733642578125, "learning_rate": 1.6694461413430893e-07, "loss": 0.0006, "num_tokens": 41980333.0, "reward": 0.375, "reward_std": 0.2004890739917755, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.04995555555555556, "grad_norm": 13.532835663295023, "kl": 1.1680908203125, "learning_rate": 1.6524772840857388e-07, "loss": 0.0012, "num_tokens": 42130365.0, "reward": 0.3671875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.050133333333333335, "grad_norm": 6.8284398354744935, "kl": 0.658935546875, "learning_rate": 1.6355524292152684e-07, "loss": 0.0007, "num_tokens": 42280333.0, "reward": 0.5390625, "reward_std": 0.1606174111366272, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05031111111111111, "grad_norm": 5.548355313653332, "kl": 0.601318359375, "learning_rate": 1.6186724554503237e-07, "loss": 0.0006, "num_tokens": 42430269.0, "reward": 0.203125, "reward_std": 0.2097259908914566, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05048888888888889, "grad_norm": 5.683013694137443, "kl": 0.533203125, "learning_rate": 1.6018382391793722e-07, "loss": 0.0005, "num_tokens": 42580221.0, "reward": 0.46875, "reward_std": 0.262323796749115, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.050666666666666665, "grad_norm": 6.0411125945253445, "kl": 0.418212890625, "learning_rate": 1.5850506544152103e-07, "loss": 0.0004, "num_tokens": 42730349.0, "reward": 0.3984375, "reward_std": 0.1753891110420227, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05084444444444444, "grad_norm": 3.2015710010347638, "kl": 0.2939453125, "learning_rate": 1.5683105727495778e-07, "loss": 0.0003, "num_tokens": 42880589.0, "reward": 0.1640625, "reward_std": 0.2251407653093338, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05102222222222222, "grad_norm": 2.7616499678136956, "kl": 0.410888671875, "learning_rate": 1.5516188633079107e-07, "loss": 0.0004, "num_tokens": 43030621.0, "reward": 0.453125, "reward_std": 0.18683473765850067, "rewards/equation_reward_func/mean": 0.453125, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0512, "grad_norm": 3.7700054616857495, "kl": 0.3497314453125, "learning_rate": 1.5349763927042168e-07, "loss": 0.0003, "num_tokens": 43180717.0, "reward": 0.484375, "reward_std": 0.2870296835899353, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05137777777777778, "grad_norm": 2.346575611337738, "kl": 0.39892578125, "learning_rate": 1.5183840249960784e-07, "loss": 0.0004, "num_tokens": 43330733.0, "reward": 0.5390625, "reward_std": 0.2442798912525177, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.051555555555555556, "grad_norm": 0.6396216740601671, "kl": 0.28070068359375, "learning_rate": 1.501842621639796e-07, "loss": 0.0003, "num_tokens": 43480813.0, "reward": 0.3671875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05173333333333333, "grad_norm": 3.159963231758412, "kl": 0.439453125, "learning_rate": 1.4853530414456612e-07, "loss": 0.0004, "num_tokens": 43631037.0, "reward": 0.2265625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05191111111111111, "grad_norm": 2.2163275518412577, "kl": 0.3870849609375, "learning_rate": 1.4689161405333652e-07, "loss": 0.0004, "num_tokens": 43781149.0, "reward": 0.5078125, "reward_std": 0.19362401962280273, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.052088888888888886, "grad_norm": 1.8439651222859326, "kl": 0.25048828125, "learning_rate": 1.4525327722875568e-07, "loss": 0.0003, "num_tokens": 43931373.0, "reward": 0.2265625, "reward_std": 0.19439706206321716, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05226666666666667, "grad_norm": 2.7105856114355777, "kl": 0.28564453125, "learning_rate": 1.4362037873135255e-07, "loss": 0.0003, "num_tokens": 44081437.0, "reward": 0.3828125, "reward_std": 0.2442798912525177, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.052444444444444446, "grad_norm": 1.203965056367511, "kl": 0.2821044921875, "learning_rate": 1.4199300333930515e-07, "loss": 0.0003, "num_tokens": 44231581.0, "reward": 0.4609375, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05262222222222222, "grad_norm": 0.9527473124996378, "kl": 0.2745361328125, "learning_rate": 1.403712355440378e-07, "loss": 0.0003, "num_tokens": 44381837.0, "reward": 0.359375, "reward_std": 0.09859732538461685, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0528, "grad_norm": 0.5736463121612332, "kl": 0.353515625, "learning_rate": 1.3875515954583523e-07, "loss": 0.0004, "num_tokens": 44531981.0, "reward": 0.2421875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.052977777777777776, "grad_norm": 1.2521535220730278, "kl": 0.3388671875, "learning_rate": 1.371448592494707e-07, "loss": 0.0003, "num_tokens": 44682093.0, "reward": 0.4296875, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05315555555555555, "grad_norm": 1.0312701160106668, "kl": 0.26708984375, "learning_rate": 1.3554041825985e-07, "loss": 0.0003, "num_tokens": 44832317.0, "reward": 0.328125, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05333333333333334, "grad_norm": 1.0235917596813247, "kl": 0.2357177734375, "learning_rate": 1.3394191987766996e-07, "loss": 0.0002, "num_tokens": 44982525.0, "reward": 0.1328125, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05351111111111111, "grad_norm": 1.5951473704958679, "kl": 0.301513671875, "learning_rate": 1.323494470950949e-07, "loss": 0.0003, "num_tokens": 45132669.0, "reward": 0.3515625, "reward_std": 0.21023227274417877, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05368888888888889, "grad_norm": 2.103983700548187, "kl": 0.336181640625, "learning_rate": 1.3076308259144652e-07, "loss": 0.0003, "num_tokens": 45282813.0, "reward": 0.578125, "reward_std": 0.30651313066482544, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05386666666666667, "grad_norm": 1.949278257797819, "kl": 0.344482421875, "learning_rate": 1.2918290872891236e-07, "loss": 0.0003, "num_tokens": 45432861.0, "reward": 0.671875, "reward_std": 0.21244098246097565, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.054044444444444444, "grad_norm": 0.5476293487603306, "kl": 0.2764892578125, "learning_rate": 1.2760900754826858e-07, "loss": 0.0003, "num_tokens": 45582941.0, "reward": 0.3671875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05422222222222222, "grad_norm": 1.6555940454022284, "kl": 0.327392578125, "learning_rate": 1.260414607646213e-07, "loss": 0.0003, "num_tokens": 45733069.0, "reward": 0.4296875, "reward_std": 0.21103432774543762, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0544, "grad_norm": 1.9630495981822498, "kl": 0.33349609375, "learning_rate": 1.2448034976316394e-07, "loss": 0.0003, "num_tokens": 45883149.0, "reward": 0.4609375, "reward_std": 0.2797882854938507, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05457777777777778, "grad_norm": 1.4415441648092056, "kl": 0.32470703125, "learning_rate": 1.2292575559495143e-07, "loss": 0.0003, "num_tokens": 46033165.0, "reward": 0.390625, "reward_std": 0.2004890739917755, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05475555555555556, "grad_norm": 1.3874852497225183, "kl": 0.2578125, "learning_rate": 1.213777589726922e-07, "loss": 0.0003, "num_tokens": 46183229.0, "reward": 0.140625, "reward_std": 0.10519562661647797, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.054933333333333334, "grad_norm": 1.3477321853146156, "kl": 0.3223876953125, "learning_rate": 1.1983644026655835e-07, "loss": 0.0003, "num_tokens": 46333453.0, "reward": 0.2109375, "reward_std": 0.059839196503162384, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05511111111111111, "grad_norm": 2.7351458176791237, "kl": 0.340576171875, "learning_rate": 1.183018795000118e-07, "loss": 0.0003, "num_tokens": 46483405.0, "reward": 0.59375, "reward_std": 0.24148225784301758, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05528888888888889, "grad_norm": 1.2105067942399799, "kl": 0.5364990234375, "learning_rate": 1.1677415634565066e-07, "loss": 0.0005, "num_tokens": 46633341.0, "reward": 0.359375, "reward_std": 0.0816391110420227, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.055466666666666664, "grad_norm": 1.8326795184321758, "kl": 0.4423828125, "learning_rate": 1.1525335012107188e-07, "loss": 0.0004, "num_tokens": 46783437.0, "reward": 0.4375, "reward_std": 0.24478615820407867, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05564444444444445, "grad_norm": 1.0832830720099889, "kl": 0.300537109375, "learning_rate": 1.1373953978475353e-07, "loss": 0.0003, "num_tokens": 46933469.0, "reward": 0.515625, "reward_std": 0.1556389182806015, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.055822222222222224, "grad_norm": 1.75523867799791, "kl": 0.4852294921875, "learning_rate": 1.1223280393195566e-07, "loss": 0.0005, "num_tokens": 47083645.0, "reward": 0.3671875, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.056, "grad_norm": 1.425353512426999, "kl": 0.3338623046875, "learning_rate": 1.1073322079063913e-07, "loss": 0.0003, "num_tokens": 47233757.0, "reward": 0.3046875, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05617777777777778, "grad_norm": 3.490588046898871, "kl": 0.692138671875, "learning_rate": 1.0924086821740436e-07, "loss": 0.0007, "num_tokens": 47383757.0, "reward": 0.6875, "reward_std": 0.1535891890525818, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 1020.8125, "completions/mean_terminated_length": 616.0, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.056355555555555555, "grad_norm": 2.082362058812009, "kl": 0.427490234375, "learning_rate": 1.0775582369344946e-07, "loss": 0.0016, "num_tokens": 47533269.0, "reward": 0.484375, "reward_std": 0.1905868798494339, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05653333333333333, "grad_norm": 1.78962255832107, "kl": 0.383544921875, "learning_rate": 1.0627816432054689e-07, "loss": 0.0004, "num_tokens": 47683429.0, "reward": 0.3125, "reward_std": 0.17033424973487854, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05671111111111111, "grad_norm": 1.8129645811287745, "kl": 0.2994384765625, "learning_rate": 1.0480796681704077e-07, "loss": 0.0003, "num_tokens": 47833541.0, "reward": 0.65625, "reward_std": 0.21699902415275574, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05688888888888889, "grad_norm": 1.3045931824174442, "kl": 0.3631591796875, "learning_rate": 1.0334530751386386e-07, "loss": 0.0004, "num_tokens": 47983509.0, "reward": 0.5703125, "reward_std": 0.14568254351615906, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05706666666666667, "grad_norm": 1.7356638529838764, "kl": 0.46875, "learning_rate": 1.018902623505741e-07, "loss": 0.0005, "num_tokens": 48133573.0, "reward": 0.4296875, "reward_std": 0.18574902415275574, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.057244444444444445, "grad_norm": 2.735164442952946, "kl": 0.311279296875, "learning_rate": 1.0044290687141255e-07, "loss": 0.0003, "num_tokens": 48283685.0, "reward": 0.3828125, "reward_std": 0.24074089527130127, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05742222222222222, "grad_norm": 1.205414056575408, "kl": 0.3284912109375, "learning_rate": 9.900331622138063e-08, "loss": 0.0003, "num_tokens": 48433765.0, "reward": 0.2734375, "reward_std": 0.08715169876813889, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0576, "grad_norm": 8.969807384024673, "kl": 2.828857421875, "learning_rate": 9.757156514233892e-08, "loss": 0.0028, "num_tokens": 48583813.0, "reward": 0.4921875, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.057777777777777775, "grad_norm": 2.047982848446374, "kl": 0.3792724609375, "learning_rate": 9.614772796912681e-08, "loss": 0.0004, "num_tokens": 48733909.0, "reward": 0.71875, "reward_std": 0.29248809814453125, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05795555555555556, "grad_norm": 2.1073396754869247, "kl": 0.35888671875, "learning_rate": 9.473187862570289e-08, "loss": 0.0004, "num_tokens": 48883973.0, "reward": 0.5078125, "reward_std": 0.2779781222343445, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.058133333333333335, "grad_norm": 1.5265235979435967, "kl": 0.2471923828125, "learning_rate": 9.332409062130686e-08, "loss": 0.0002, "num_tokens": 49034149.0, "reward": 0.3828125, "reward_std": 0.20033009350299835, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05831111111111111, "grad_norm": 1.3958259656520005, "kl": 0.2979736328125, "learning_rate": 9.192443704664344e-08, "loss": 0.0003, "num_tokens": 49184341.0, "reward": 0.4765625, "reward_std": 0.1593368798494339, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05848888888888889, "grad_norm": 2.2160128533738614, "kl": 0.4107666015625, "learning_rate": 9.053299057008699e-08, "loss": 0.0004, "num_tokens": 49334389.0, "reward": 0.265625, "reward_std": 0.10519562661647797, "rewards/equation_reward_func/mean": 0.265625, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.058666666666666666, "grad_norm": 2.5648574662500385, "kl": 0.3106689453125, "learning_rate": 8.914982343390895e-08, "loss": 0.0003, "num_tokens": 49484565.0, "reward": 0.4375, "reward_std": 0.23019562661647797, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05884444444444444, "grad_norm": 1.4412198896741628, "kl": 0.3126220703125, "learning_rate": 8.777500745052743e-08, "loss": 0.0003, "num_tokens": 49634789.0, "reward": 0.3046875, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05902222222222222, "grad_norm": 3.2664264621515766, "kl": 0.3096923828125, "learning_rate": 8.640861399877805e-08, "loss": 0.0003, "num_tokens": 49784853.0, "reward": 0.2734375, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0592, "grad_norm": 4.6940137145793805, "kl": 0.90478515625, "learning_rate": 8.505071402020892e-08, "loss": 0.0009, "num_tokens": 49935029.0, "reward": 0.3125, "reward_std": 0.25721919536590576, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05937777777777778, "grad_norm": 2.540226625343673, "kl": 0.479248046875, "learning_rate": 8.370137801539634e-08, "loss": 0.0005, "num_tokens": 50085237.0, "reward": 0.078125, "reward_std": 0.09529343992471695, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.059555555555555556, "grad_norm": 1.0128001047778752, "kl": 0.427001953125, "learning_rate": 8.236067604028562e-08, "loss": 0.0004, "num_tokens": 50235269.0, "reward": 0.3671875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05973333333333333, "grad_norm": 2.287973213392771, "kl": 0.452392578125, "learning_rate": 8.102867770255337e-08, "loss": 0.0005, "num_tokens": 50385317.0, "reward": 0.4921875, "reward_std": 0.28231915831565857, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05991111111111111, "grad_norm": 2.1996997560940783, "kl": 0.40576171875, "learning_rate": 7.970545215799327e-08, "loss": 0.0004, "num_tokens": 50535445.0, "reward": 0.3125, "reward_std": 0.21747365593910217, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.060088888888888886, "grad_norm": 2.639229113506021, "kl": 0.439208984375, "learning_rate": 7.839106810692589e-08, "loss": 0.0004, "num_tokens": 50685653.0, "reward": 0.3125, "reward_std": 0.19628483057022095, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06026666666666667, "grad_norm": 2.6297174808188775, "kl": 0.421142578125, "learning_rate": 7.708559379063204e-08, "loss": 0.0004, "num_tokens": 50835813.0, "reward": 0.4921875, "reward_std": 0.22958454489707947, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.060444444444444446, "grad_norm": 1.7125463323394825, "kl": 0.36865234375, "learning_rate": 7.57890969878093e-08, "loss": 0.0004, "num_tokens": 50985973.0, "reward": 0.3046875, "reward_std": 0.1414783000946045, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06062222222222222, "grad_norm": 1.8055139424145163, "kl": 0.525634765625, "learning_rate": 7.45016450110534e-08, "loss": 0.0005, "num_tokens": 51135973.0, "reward": 0.6953125, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0608, "grad_norm": 2.3183872921318933, "kl": 0.454345703125, "learning_rate": 7.322330470336313e-08, "loss": 0.0005, "num_tokens": 51286005.0, "reward": 0.5234375, "reward_std": 0.18837818503379822, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06097777777777778, "grad_norm": 2.738503744990542, "kl": 0.6329345703125, "learning_rate": 7.195414243467029e-08, "loss": 0.0006, "num_tokens": 51436053.0, "reward": 0.421875, "reward_std": 0.1764804571866989, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06115555555555555, "grad_norm": 2.8516424865530565, "kl": 0.8662109375, "learning_rate": 7.069422409839363e-08, "loss": 0.0009, "num_tokens": 51586133.0, "reward": 0.5625, "reward_std": 0.21039125323295593, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 1023.4765625, "completions/mean_terminated_length": 957.0, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 0.06133333333333333, "grad_norm": 10.908659337900897, "kl": 2.3154296875, "learning_rate": 6.944361510801763e-08, "loss": 0.0018, "num_tokens": 51736194.0, "reward": 0.1640625, "reward_std": 0.25292789936065674, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.061511111111111114, "grad_norm": 1.113353409769231, "kl": 0.43115234375, "learning_rate": 6.820238039369647e-08, "loss": 0.0004, "num_tokens": 51886242.0, "reward": 0.6171875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06168888888888889, "grad_norm": 14.582186799914224, "kl": 0.84619140625, "learning_rate": 6.697058439888283e-08, "loss": 0.0008, "num_tokens": 52036274.0, "reward": 0.578125, "reward_std": 0.25242161750793457, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 1018.3671875, "completions/mean_terminated_length": 663.5, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.06186666666666667, "grad_norm": 1.3035910257677628, "kl": 0.40185546875, "learning_rate": 6.574829107698238e-08, "loss": 0.0006, "num_tokens": 52185697.0, "reward": 0.4765625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 1011.9140625, "completions/mean_terminated_length": 508.3333435058594, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.062044444444444444, "grad_norm": 2.6205969283011683, "kl": 0.587158203125, "learning_rate": 6.453556388803288e-08, "loss": 0.0006, "num_tokens": 52334198.0, "reward": 0.5234375, "reward_std": 0.08715169876813889, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 1018.1875, "completions/mean_terminated_length": 280.0, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.06222222222222222, "grad_norm": 2.8987677481196696, "kl": 0.445068359375, "learning_rate": 6.333246579540971e-08, "loss": 0.0057, "num_tokens": 52483646.0, "reward": 0.71875, "reward_std": 0.2999148368835449, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0624, "grad_norm": 2.681371319801926, "kl": 0.559814453125, "learning_rate": 6.213905926255697e-08, "loss": 0.0006, "num_tokens": 52633822.0, "reward": 0.3125, "reward_std": 0.22466085851192474, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 1011.4453125, "completions/mean_terminated_length": 488.3333435058594, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.06257777777777777, "grad_norm": 1.6857419751670386, "kl": 0.443115234375, "learning_rate": 6.095540624974435e-08, "loss": -0.0085, "num_tokens": 52782135.0, "reward": 0.4921875, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06275555555555555, "grad_norm": 18.060240530837376, "kl": 1.934326171875, "learning_rate": 5.978156821084987e-08, "loss": 0.0019, "num_tokens": 52932215.0, "reward": 0.3984375, "reward_std": 0.14943468570709229, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06293333333333333, "grad_norm": 1.8857569810962331, "kl": 0.4521484375, "learning_rate": 5.861760609017002e-08, "loss": 0.0005, "num_tokens": 53082103.0, "reward": 0.3515625, "reward_std": 0.19828036427497864, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 1021.828125, "completions/mean_terminated_length": 746.0, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.06311111111111112, "grad_norm": 2.118010209410125, "kl": 0.37548828125, "learning_rate": 5.7463580319254853e-08, "loss": 0.0004, "num_tokens": 53231889.0, "reward": 0.4921875, "reward_std": 0.15843652188777924, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0632888888888889, "grad_norm": 2.8712188022850302, "kl": 0.595703125, "learning_rate": 5.63195508137711e-08, "loss": 0.0006, "num_tokens": 53382017.0, "reward": 0.5078125, "reward_std": 0.24007564783096313, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06346666666666667, "grad_norm": 1.1209180912275907, "kl": 0.330078125, "learning_rate": 5.518557697039081e-08, "loss": 0.0003, "num_tokens": 53532065.0, "reward": 0.0859375, "reward_std": 0.11493883281946182, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 1014.1953125, "completions/mean_terminated_length": 396.5, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.06364444444444445, "grad_norm": 1.721597768132904, "kl": 0.380615234375, "learning_rate": 5.4061717663707843e-08, "loss": -0.0033, "num_tokens": 53680874.0, "reward": 0.4140625, "reward_std": 0.15092839300632477, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06382222222222222, "grad_norm": 4.889990077440367, "kl": 0.645263671875, "learning_rate": 5.294803124318145e-08, "loss": 0.0006, "num_tokens": 53830826.0, "reward": 0.2734375, "reward_std": 0.27398642897605896, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.064, "grad_norm": 1.8582387197207795, "kl": 0.49658203125, "learning_rate": 5.1844575530106265e-08, "loss": 0.0005, "num_tokens": 53981002.0, "reward": 0.3359375, "reward_std": 0.11664125323295593, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 1023.3671875, "completions/mean_terminated_length": 943.0, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 0.06417777777777778, "grad_norm": 2.917692147718676, "kl": 0.4169921875, "learning_rate": 5.07514078146106e-08, "loss": 0.001, "num_tokens": 54130937.0, "reward": 0.453125, "reward_std": 0.2816760540008545, "rewards/equation_reward_func/mean": 0.453125, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 1022.9765625, "completions/mean_terminated_length": 893.0, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.06435555555555555, "grad_norm": 1.4082881764081276, "kl": 0.4091796875, "learning_rate": 4.9668584852682134e-08, "loss": 0.0009, "num_tokens": 54280854.0, "reward": 0.421875, "reward_std": 0.09529343992471695, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 1014.25, "completions/mean_terminated_length": 400.0, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.06453333333333333, "grad_norm": 21.266296763721, "kl": 2.232421875, "learning_rate": 4.859616286322094e-08, "loss": 0.0047, "num_tokens": 54429638.0, "reward": 0.546875, "reward_std": 0.2645905315876007, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06471111111111111, "grad_norm": 2.540246677497678, "kl": 0.467041015625, "learning_rate": 4.753419752512072e-08, "loss": 0.0005, "num_tokens": 54579718.0, "reward": 0.6015625, "reward_std": 0.2603282630443573, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06488888888888888, "grad_norm": 1.5725906857424574, "kl": 0.654052734375, "learning_rate": 4.648274397437829e-08, "loss": 0.0007, "num_tokens": 54729670.0, "reward": 0.5703125, "reward_std": 0.12984731793403625, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06506666666666666, "grad_norm": 2.0674814320267494, "kl": 0.427734375, "learning_rate": 4.5441856801230525e-08, "loss": 0.0004, "num_tokens": 54879766.0, "reward": 0.2578125, "reward_std": 0.12984731793403625, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 1017.328125, "completions/mean_terminated_length": 170.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.06524444444444444, "grad_norm": 1.4693405180119679, "kl": 0.5068359375, "learning_rate": 4.4411590047320617e-08, "loss": 0.0062, "num_tokens": 55028896.0, "reward": 0.578125, "reward_std": 0.09108919650316238, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 1014.0546875, "completions/mean_terminated_length": 599.6666870117188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.06542222222222223, "grad_norm": 2.633222472716859, "kl": 0.662109375, "learning_rate": 4.3391997202891825e-08, "loss": 0.0026, "num_tokens": 55177687.0, "reward": 0.6171875, "reward_std": 0.19234731793403625, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0656, "grad_norm": 1.4969302123335355, "kl": 0.46142578125, "learning_rate": 4.2383131204010494e-08, "loss": 0.0005, "num_tokens": 55327815.0, "reward": 0.359375, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06577777777777778, "grad_norm": 0.8408620511175955, "kl": 0.3326416015625, "learning_rate": 4.1385044429817966e-08, "loss": 0.0003, "num_tokens": 55477911.0, "reward": 0.1171875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 1017.4609375, "completions/mean_terminated_length": 605.5, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.06595555555555556, "grad_norm": 1.4717435362764846, "kl": 0.5096435546875, "learning_rate": 4.039778869981064e-08, "loss": 0.0005, "num_tokens": 55627154.0, "reward": 0.421875, "reward_std": 0.10724534839391708, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06613333333333334, "grad_norm": 1.4260997105595992, "kl": 0.489501953125, "learning_rate": 3.942141527114978e-08, "loss": 0.0005, "num_tokens": 55777330.0, "reward": 0.4921875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06631111111111111, "grad_norm": 2.6985829837872473, "kl": 0.385986328125, "learning_rate": 3.845597483600049e-08, "loss": 0.0004, "num_tokens": 55927426.0, "reward": 0.46875, "reward_std": 0.25878483057022095, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06648888888888889, "grad_norm": 2.4013311399055426, "kl": 0.38720703125, "learning_rate": 3.7501517518899486e-08, "loss": 0.0004, "num_tokens": 56077506.0, "reward": 0.453125, "reward_std": 0.21962818503379822, "rewards/equation_reward_func/mean": 0.453125, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 1023.3125, "completions/mean_terminated_length": 936.0, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.06666666666666667, "grad_norm": 0.8918779101043425, "kl": 0.3922119140625, "learning_rate": 3.655809287415284e-08, "loss": 0.0004, "num_tokens": 56227450.0, "reward": 0.359375, "reward_std": 0.042695626616477966, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06684444444444444, "grad_norm": 0.6434546798822266, "kl": 0.3955078125, "learning_rate": 3.562574988326342e-08, "loss": 0.0004, "num_tokens": 56377562.0, "reward": 0.2421875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06702222222222222, "grad_norm": 1.7298164026307943, "kl": 0.389892578125, "learning_rate": 3.4704536952387285e-08, "loss": 0.0004, "num_tokens": 56527610.0, "reward": 0.3984375, "reward_std": 0.17254295945167542, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 1022.359375, "completions/mean_terminated_length": 814.0, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.0672, "grad_norm": 1.682917283676319, "kl": 0.380859375, "learning_rate": 3.379450190982114e-08, "loss": 0.0004, "num_tokens": 56677448.0, "reward": 0.2421875, "reward_std": 0.15843652188777924, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06737777777777777, "grad_norm": 1.9105026586942742, "kl": 0.6171875, "learning_rate": 3.2895692003518575e-08, "loss": 0.0006, "num_tokens": 56827480.0, "reward": 0.359375, "reward_std": 0.1905868798494339, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06755555555555555, "grad_norm": 1.4237191521903911, "kl": 0.669921875, "learning_rate": 3.2008153898637255e-08, "loss": 0.0007, "num_tokens": 56977464.0, "reward": 0.5859375, "reward_std": 0.14568254351615906, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 1019.6171875, "completions/mean_terminated_length": 463.0, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.06773333333333334, "grad_norm": 1.902344636416351, "kl": 0.4775390625, "learning_rate": 3.113193367511635e-08, "loss": 0.0028, "num_tokens": 57126951.0, "reward": 0.578125, "reward_std": 0.16769562661647797, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 1023.4140625, "completions/mean_terminated_length": 949.0, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.06791111111111112, "grad_norm": 2.1732511680992457, "kl": 0.877197265625, "learning_rate": 3.026707682528365e-08, "loss": 0.0014, "num_tokens": 57276940.0, "reward": 0.21875, "reward_std": 0.18500113487243652, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0680888888888889, "grad_norm": 2.1250741024864896, "kl": 0.3663330078125, "learning_rate": 2.9413628251493934e-08, "loss": 0.0004, "num_tokens": 57427020.0, "reward": 0.390625, "reward_std": 0.2584404945373535, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06826666666666667, "grad_norm": 1.007788588913057, "kl": 0.309326171875, "learning_rate": 2.8571632263797745e-08, "loss": 0.0003, "num_tokens": 57577212.0, "reward": 0.3046875, "reward_std": 0.10673906654119492, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06844444444444445, "grad_norm": 1.8134191334546756, "kl": 0.43359375, "learning_rate": 2.774113257764066e-08, "loss": 0.0004, "num_tokens": 57727228.0, "reward": 0.390625, "reward_std": 0.20597384870052338, "rewards/equation_reward_func/mean": 0.390625, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06862222222222222, "grad_norm": 2.8712666096304145, "kl": 0.82763671875, "learning_rate": 2.6922172311593884e-08, "loss": 0.0008, "num_tokens": 57877420.0, "reward": 0.1328125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0688, "grad_norm": 1.6025038063949157, "kl": 0.4287109375, "learning_rate": 2.611479398511518e-08, "loss": 0.0004, "num_tokens": 58027548.0, "reward": 0.515625, "reward_std": 0.18068468570709229, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06897777777777778, "grad_norm": 1.7063313019322168, "kl": 0.426513671875, "learning_rate": 2.5319039516341844e-08, "loss": 0.0004, "num_tokens": 58177836.0, "reward": 0.359375, "reward_std": 0.26517558097839355, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 1020.171875, "completions/mean_terminated_length": 534.0, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.06915555555555555, "grad_norm": 1.913439190511589, "kl": 0.373291015625, "learning_rate": 2.4534950219914057e-08, "loss": 0.0004, "num_tokens": 58327378.0, "reward": 0.453125, "reward_std": 0.19344250857830048, "rewards/equation_reward_func/mean": 0.453125, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 1019.6640625, "completions/mean_terminated_length": 469.0, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.06933333333333333, "grad_norm": 7.838436090707214, "kl": 1.19384765625, "learning_rate": 2.3762566804829742e-08, "loss": 0.0012, "num_tokens": 58476759.0, "reward": 0.4609375, "reward_std": 0.2077304571866989, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0695111111111111, "grad_norm": 1.4553310609497452, "kl": 0.2769775390625, "learning_rate": 2.300192937233128e-08, "loss": 0.0003, "num_tokens": 58626951.0, "reward": 0.1328125, "reward_std": 0.13578036427497864, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06968888888888888, "grad_norm": 1.8545221882989233, "kl": 0.512451171875, "learning_rate": 2.2253077413823458e-08, "loss": 0.0005, "num_tokens": 58776999.0, "reward": 0.7421875, "reward_std": 0.26783639192581177, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06986666666666666, "grad_norm": 1.4368133651473673, "kl": 0.2852783203125, "learning_rate": 2.1516049808822935e-08, "loss": 0.0003, "num_tokens": 58927319.0, "reward": 0.171875, "reward_std": 0.09859732538461685, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07004444444444445, "grad_norm": 2.0693206177695735, "kl": 0.708984375, "learning_rate": 2.0790884822939836e-08, "loss": 0.0007, "num_tokens": 59077383.0, "reward": 0.171875, "reward_std": 0.14618882536888123, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 1013.53125, "completions/mean_terminated_length": 354.0, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.07022222222222223, "grad_norm": 1.605211760366692, "kl": 0.479736328125, "learning_rate": 2.007762010589098e-08, "loss": 0.0005, "num_tokens": 59226011.0, "reward": 0.6015625, "reward_std": 0.2152385711669922, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0704, "grad_norm": 2.0402063241263533, "kl": 0.505859375, "learning_rate": 1.9376292689545158e-08, "loss": 0.0005, "num_tokens": 59375995.0, "reward": 0.515625, "reward_std": 0.256390780210495, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07057777777777778, "grad_norm": 1.4975460606147797, "kl": 0.37353515625, "learning_rate": 1.8686938986000627e-08, "loss": 0.0004, "num_tokens": 59526155.0, "reward": 0.3203125, "reward_std": 0.2363196462392807, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 1019.3671875, "completions/mean_terminated_length": 431.0, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.07075555555555556, "grad_norm": 15.397453855255312, "kl": 0.9715576171875, "learning_rate": 1.800959478569422e-08, "loss": 0.0028, "num_tokens": 59675658.0, "reward": 0.34375, "reward_std": 0.2689315676689148, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07093333333333333, "grad_norm": 0.5918277952776967, "kl": 0.3201904296875, "learning_rate": 1.734429525554365e-08, "loss": 0.0003, "num_tokens": 59825674.0, "reward": 0.2265625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 1019.203125, "completions/mean_terminated_length": 410.0, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.07111111111111111, "grad_norm": 1.3585554180159947, "kl": 0.39892578125, "learning_rate": 1.6691074937121407e-08, "loss": 0.0041, "num_tokens": 59975172.0, "reward": 0.3125, "reward_std": 0.15779343247413635, "rewards/equation_reward_func/mean": 0.3125, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07128888888888889, "grad_norm": 1.54358129186123, "kl": 0.37744140625, "learning_rate": 1.604996774486145e-08, "loss": 0.0004, "num_tokens": 60125076.0, "reward": 0.359375, "reward_std": 0.22953036427497864, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 1019.5546875, "completions/mean_terminated_length": 455.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.07146666666666666, "grad_norm": 1.2393228938484424, "kl": 0.3939208984375, "learning_rate": 1.5421006964298377e-08, "loss": 0.0033, "num_tokens": 60274683.0, "reward": 0.421875, "reward_std": 0.18888446688652039, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 1022.75, "completions/mean_terminated_length": 864.0, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.07164444444444444, "grad_norm": 2.7889375116441646, "kl": 0.4532470703125, "learning_rate": 1.4804225250339281e-08, "loss": 0.0005, "num_tokens": 60424683.0, "reward": 0.328125, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07182222222222222, "grad_norm": 3.434593576277822, "kl": 0.806884765625, "learning_rate": 1.4199654625568575e-08, "loss": 0.0008, "num_tokens": 60574747.0, "reward": 0.40625, "reward_std": 0.2848431468009949, "rewards/equation_reward_func/mean": 0.40625, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 1019.5625, "completions/mean_terminated_length": 456.0, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.072, "grad_norm": 1.8827754379297368, "kl": 0.40283203125, "learning_rate": 1.360732647858498e-08, "loss": 0.0004, "num_tokens": 60724227.0, "reward": 0.59375, "reward_std": 0.27494096755981445, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07217777777777777, "grad_norm": 2.2772308604110045, "kl": 0.553466796875, "learning_rate": 1.302727156237224e-08, "loss": 0.0006, "num_tokens": 60874355.0, "reward": 0.3515625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.3515625, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 1019.3984375, "completions/mean_terminated_length": 435.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.07235555555555556, "grad_norm": 1.0554081105016955, "kl": 0.506591796875, "learning_rate": 1.2459519992702311e-08, "loss": 0.0005, "num_tokens": 61023910.0, "reward": 0.625, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07253333333333334, "grad_norm": 1.9205465951543388, "kl": 0.3392333984375, "learning_rate": 1.1904101246571874e-08, "loss": 0.0003, "num_tokens": 61174230.0, "reward": 0.3046875, "reward_std": 0.2152385711669922, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 1020.7109375, "completions/mean_terminated_length": 603.0, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.07271111111111112, "grad_norm": 1.6441523651878778, "kl": 0.494140625, "learning_rate": 1.1361044160671629e-08, "loss": 0.003, "num_tokens": 61323953.0, "reward": 0.375, "reward_std": 0.29533424973487854, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07288888888888889, "grad_norm": 1.921407714619121, "kl": 0.5361328125, "learning_rate": 1.0830376929889612e-08, "loss": 0.0005, "num_tokens": 61474081.0, "reward": 0.5234375, "reward_std": 0.31578171253204346, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 1019.7109375, "completions/mean_terminated_length": 475.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.07306666666666667, "grad_norm": 2.277716888097387, "kl": 0.60791015625, "learning_rate": 1.0312127105846947e-08, "loss": 0.0006, "num_tokens": 61623580.0, "reward": 0.515625, "reward_std": 0.1875, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 1018.9375, "completions/mean_terminated_length": 376.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.07324444444444445, "grad_norm": 1.765990689057306, "kl": 0.45849609375, "learning_rate": 9.806321595467598e-09, "loss": 0.0005, "num_tokens": 61772996.0, "reward": 0.34375, "reward_std": 0.182951420545578, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07342222222222222, "grad_norm": 1.3964941123022503, "kl": 0.570556640625, "learning_rate": 9.312986659581301e-09, "loss": 0.0006, "num_tokens": 61923092.0, "reward": 0.421875, "reward_std": 0.16769562661647797, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 1022.671875, "completions/mean_terminated_length": 854.0, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.0736, "grad_norm": 1.4735126535441507, "kl": 0.4610595703125, "learning_rate": 8.832147911560173e-09, "loss": 0.0009, "num_tokens": 62072986.0, "reward": 0.5078125, "reward_std": 0.21808473765850067, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07377777777777778, "grad_norm": 1.951003775396955, "kl": 0.588623046875, "learning_rate": 8.363830315988945e-09, "loss": 0.0006, "num_tokens": 62223018.0, "reward": 0.3671875, "reward_std": 0.15558473765850067, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07395555555555555, "grad_norm": 1.0114132564622214, "kl": 0.417236328125, "learning_rate": 7.908058187368726e-09, "loss": 0.0004, "num_tokens": 62373050.0, "reward": 0.453125, "reward_std": 0.09529343992471695, "rewards/equation_reward_func/mean": 0.453125, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07413333333333333, "grad_norm": 0.6995293201719184, "kl": 0.502197265625, "learning_rate": 7.46485518885462e-09, "loss": 0.0005, "num_tokens": 62523050.0, "reward": 0.375, "reward_std": 0.08539125323295593, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0743111111111111, "grad_norm": 1.6821237961996058, "kl": 0.466064453125, "learning_rate": 7.0342443310273665e-09, "loss": 0.0005, "num_tokens": 62673242.0, "reward": 0.5234375, "reward_std": 0.26783639192581177, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07448888888888888, "grad_norm": 1.6375319968126312, "kl": 0.662841796875, "learning_rate": 6.616247970698319e-09, "loss": 0.0007, "num_tokens": 62823338.0, "reward": 0.4609375, "reward_std": 0.23414260149002075, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07466666666666667, "grad_norm": 1.1643560946737068, "kl": 0.330322265625, "learning_rate": 6.210887809749099e-09, "loss": 0.0003, "num_tokens": 62973562.0, "reward": 0.1640625, "reward_std": 0.13644562661647797, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07484444444444445, "grad_norm": 1.9393633001214698, "kl": 0.408447265625, "learning_rate": 5.8181848940044855e-09, "loss": 0.0004, "num_tokens": 63123706.0, "reward": 0.3828125, "reward_std": 0.17914125323295593, "rewards/equation_reward_func/mean": 0.3828125, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 1020.453125, "completions/mean_terminated_length": 570.0, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.07502222222222223, "grad_norm": 1.208769085905422, "kl": 0.3570556640625, "learning_rate": 5.4381596121399476e-09, "loss": 0.0004, "num_tokens": 63273396.0, "reward": 0.359375, "reward_std": 0.16532793641090393, "rewards/equation_reward_func/mean": 0.359375, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0752, "grad_norm": 0.8767912793294426, "kl": 0.304443359375, "learning_rate": 5.070831694623135e-09, "loss": 0.0003, "num_tokens": 63423572.0, "reward": 0.1875, "reward_std": 0.1379890739917755, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07537777777777778, "grad_norm": 1.795376695227377, "kl": 0.491943359375, "learning_rate": 4.716220212689332e-09, "loss": 0.0005, "num_tokens": 63573668.0, "reward": 0.5859375, "reward_std": 0.21193468570709229, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07555555555555556, "grad_norm": 3.9143052801591565, "kl": 0.66845703125, "learning_rate": 4.374343577351336e-09, "loss": 0.0007, "num_tokens": 63723860.0, "reward": 0.375, "reward_std": 0.12433473765850067, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07573333333333333, "grad_norm": 1.1318328674707612, "kl": 0.385009765625, "learning_rate": 4.045219538443778e-09, "loss": 0.0004, "num_tokens": 63873844.0, "reward": 0.4453125, "reward_std": 0.21193468570709229, "rewards/equation_reward_func/mean": 0.4453125, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07591111111111111, "grad_norm": 0.9661818601632447, "kl": 0.4649658203125, "learning_rate": 3.7288651837012745e-09, "loss": 0.0005, "num_tokens": 64023972.0, "reward": 0.109375, "reward_std": 0.11840169876813889, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07608888888888889, "grad_norm": 1.3961014258805358, "kl": 0.38623046875, "learning_rate": 3.4252969378714134e-09, "loss": 0.0004, "num_tokens": 64174052.0, "reward": 0.3359375, "reward_std": 0.09308473765850067, "rewards/equation_reward_func/mean": 0.3359375, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 1023.8125, "completions/mean_terminated_length": 1000.0, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.07626666666666666, "grad_norm": 1.429160107362864, "kl": 0.4967041015625, "learning_rate": 3.134530561862081e-09, "loss": 0.0005, "num_tokens": 64324044.0, "reward": 0.4921875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 1018.125, "completions/mean_terminated_length": 272.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.07644444444444444, "grad_norm": 1.9294567567292205, "kl": 0.3994140625, "learning_rate": 2.856581151922943e-09, "loss": 0.0004, "num_tokens": 64473388.0, "reward": 0.15625, "reward_std": 0.08539125323295593, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 1021.2265625, "completions/mean_terminated_length": 669.0, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.07662222222222222, "grad_norm": 1.0652663829034763, "kl": 0.49853515625, "learning_rate": 2.5914631388619103e-09, "loss": 0.0005, "num_tokens": 64623161.0, "reward": 0.4140625, "reward_std": 0.11994513869285583, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0768, "grad_norm": 1.464456187980756, "kl": 0.4398193359375, "learning_rate": 2.339190287295678e-09, "loss": 0.0004, "num_tokens": 64773305.0, "reward": 0.2421875, "reward_std": 0.18837818503379822, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07697777777777778, "grad_norm": 2.2970045024048615, "kl": 0.500732421875, "learning_rate": 2.0997756949353297e-09, "loss": 0.0005, "num_tokens": 64923401.0, "reward": 0.5859375, "reward_std": 0.2832997143268585, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07715555555555556, "grad_norm": 1.3629147182881671, "kl": 0.427490234375, "learning_rate": 1.8732317919060715e-09, "loss": 0.0004, "num_tokens": 65073465.0, "reward": 0.3046875, "reward_std": 0.09579972177743912, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07733333333333334, "grad_norm": 57.005398701994565, "kl": 6.148193359375, "learning_rate": 1.6595703401020844e-09, "loss": 0.0061, "num_tokens": 65223785.0, "reward": 0.171875, "reward_std": 0.11443255096673965, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07751111111111111, "grad_norm": 1.4593330176053296, "kl": 0.49072265625, "learning_rate": 1.4588024325756788e-09, "loss": 0.0005, "num_tokens": 65373753.0, "reward": 0.3671875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07768888888888889, "grad_norm": 5572.21205279157, "kl": 209.56298828125, "learning_rate": 1.2709384929615596e-09, "loss": 0.2096, "num_tokens": 65523673.0, "reward": 0.34375, "reward_std": 0.22953036427497864, "rewards/equation_reward_func/mean": 0.34375, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07786666666666667, "grad_norm": 1.8911109055150177, "kl": 0.3935546875, "learning_rate": 1.0959882749354277e-09, "loss": 0.0004, "num_tokens": 65673945.0, "reward": 0.4140625, "reward_std": 0.1753891110420227, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07804444444444444, "grad_norm": 1.4827882448254748, "kl": 0.5185546875, "learning_rate": 9.339608617077165e-10, "loss": 0.0005, "num_tokens": 65823801.0, "reward": 0.5, "reward_std": 0.16109731793403625, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 1019.40625, "completions/mean_terminated_length": 436.0, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.07822222222222222, "grad_norm": 1.27265598732049, "kl": 0.432373046875, "learning_rate": 7.848646655519986e-10, "loss": 0.0004, "num_tokens": 65973309.0, "reward": 0.5703125, "reward_std": 0.20818254351615906, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0784, "grad_norm": 1.2103692638392465, "kl": 0.33837890625, "learning_rate": 6.487074273681114e-10, "loss": 0.0003, "num_tokens": 66123549.0, "reward": 0.1953125, "reward_std": 0.15843652188777924, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07857777777777777, "grad_norm": 1.089228811163516, "kl": 0.360595703125, "learning_rate": 5.254962162804799e-10, "loss": 0.0004, "num_tokens": 66273645.0, "reward": 0.21875, "reward_std": 0.1665782630443573, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 1022.078125, "completions/mean_terminated_length": 778.0, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.07875555555555555, "grad_norm": 0.841479611863863, "kl": 0.33935546875, "learning_rate": 4.152374292708538e-10, "loss": 0.0003, "num_tokens": 66423527.0, "reward": 0.3203125, "reward_std": 0.10253482311964035, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07893333333333333, "grad_norm": 6.083455193885405, "kl": 0.822998046875, "learning_rate": 3.1793679084632375e-10, "loss": 0.0008, "num_tokens": 66573719.0, "reward": 0.2734375, "reward_std": 0.22072336077690125, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 1013.4140625, "completions/mean_terminated_length": 346.5, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.0791111111111111, "grad_norm": 0.963697467301106, "kl": 0.53759765625, "learning_rate": 2.3359935274214204e-10, "loss": 0.0026, "num_tokens": 66722540.0, "reward": 0.6015625, "reward_std": 0.07394562661647797, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0792888888888889, "grad_norm": 1.4017867838041997, "kl": 0.3714599609375, "learning_rate": 1.6222949365926608e-10, "loss": 0.0004, "num_tokens": 66872716.0, "reward": 0.4453125, "reward_std": 0.16503483057022095, "rewards/equation_reward_func/mean": 0.4453125, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07946666666666667, "grad_norm": 1.50099224810612, "kl": 0.334716796875, "learning_rate": 1.0383091903720665e-10, "loss": 0.0003, "num_tokens": 67022844.0, "reward": 0.234375, "reward_std": 0.042695626616477966, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07964444444444445, "grad_norm": 1.2274435541781965, "kl": 0.46044921875, "learning_rate": 5.84066608615985e-11, "loss": 0.0005, "num_tokens": 67172908.0, "reward": 0.5703125, "reward_std": 0.12984731793403625, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 1017.875, "completions/mean_terminated_length": 240.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.07982222222222222, "grad_norm": 2.997293391656188, "kl": 0.51318359375, "learning_rate": 2.595907750671533e-11, "loss": 0.0054, "num_tokens": 67322028.0, "reward": 0.765625, "reward_std": 0.262323796749115, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.08, "grad_norm": 1.0257616178286506, "kl": 0.253173828125, "learning_rate": 6.489853613067531e-12, "loss": 0.0003, "num_tokens": 67472156.0, "reward": 0.046875, "reward_std": 0.10077822208404541, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 450 }, { "epoch": 0.08, "step": 450, "total_flos": 0.0, "train_loss": 0.003569204885145862, "train_runtime": 6050.2879, "train_samples_per_second": 9.52, "train_steps_per_second": 0.074 } ], "logging_steps": 1, "max_steps": 450, "num_input_tokens_seen": 67472156, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }