{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999111111111111, "eval_steps": 500, "global_step": 562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 1020.078125, "completions/mean_terminated_length": 773.0, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 0.0035555555555555557, "grad_norm": 0.2771470571706861, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 149594.0, "reward": 0.0078125, "reward_std": 0.009021097794175148, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0071111111111111115, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.941176470588235e-08, "loss": 0.0, "num_tokens": 299738.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 1014.9609375, "completions/mean_terminated_length": 445.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.010666666666666666, "grad_norm": 0.3910424013646084, "kl": 0.0012340545654296875, "learning_rate": 5.88235294117647e-08, "loss": 0.0, "num_tokens": 448737.0, "reward": 0.01171875, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 1015.1328125, "completions/mean_terminated_length": 456.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.014222222222222223, "grad_norm": 0.7477560337297564, "kl": 0.0012416839599609375, "learning_rate": 8.823529411764706e-08, "loss": 0.0, "num_tokens": 597734.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017777777777777778, "grad_norm": 0.004210061181011468, "kl": 0.0011844635009765625, "learning_rate": 1.176470588235294e-07, "loss": 0.0, "num_tokens": 747806.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021333333333333333, "grad_norm": 0.30463059980407714, "kl": 0.0012011528015136719, "learning_rate": 1.4705882352941175e-07, "loss": 0.0, "num_tokens": 897894.0, "reward": 0.00390625, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024888888888888887, "grad_norm": 0.9478569692121466, "kl": 0.0013608932495117188, "learning_rate": 1.764705882352941e-07, "loss": 0.0, "num_tokens": 1048030.0, "reward": 0.00390625, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 1023.3125, "completions/mean_terminated_length": 936.0, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.028444444444444446, "grad_norm": 0.0033097224749988386, "kl": 0.0013580322265625, "learning_rate": 2.0588235294117645e-07, "loss": 0.0, "num_tokens": 1198082.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.032, "grad_norm": 0.3396430847864516, "kl": 0.0011739730834960938, "learning_rate": 2.352941176470588e-07, "loss": 0.0, "num_tokens": 1348102.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.035555555555555556, "grad_norm": 0.0019085271667908846, "kl": 0.0013241767883300781, "learning_rate": 2.6470588235294114e-07, "loss": 0.0, "num_tokens": 1498154.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03911111111111111, "grad_norm": 0.12196802997866993, "kl": 0.003757476806640625, "learning_rate": 2.941176470588235e-07, "loss": 0.0, "num_tokens": 1648294.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 1022.0859375, "completions/mean_terminated_length": 779.0, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.042666666666666665, "grad_norm": 0.353412956717465, "kl": 0.0011844635009765625, "learning_rate": 3.2352941176470586e-07, "loss": 0.0, "num_tokens": 1798157.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 1015.5, "completions/mean_terminated_length": 661.3333740234375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.04622222222222222, "grad_norm": 0.4177192368926344, "kl": 0.0013208389282226562, "learning_rate": 3.529411764705882e-07, "loss": 0.0019, "num_tokens": 1947113.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 1019.8984375, "completions/mean_terminated_length": 499.0, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.049777777777777775, "grad_norm": 1.030360015389124, "kl": 0.0012826919555664062, "learning_rate": 3.8235294117647053e-07, "loss": 0.0, "num_tokens": 2096668.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05333333333333334, "grad_norm": 0.006130423266424807, "kl": 0.001644134521484375, "learning_rate": 4.117647058823529e-07, "loss": 0.0, "num_tokens": 2246772.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05688888888888889, "grad_norm": 0.47084182350703124, "kl": 0.0013189315795898438, "learning_rate": 4.4117647058823526e-07, "loss": 0.0, "num_tokens": 2396868.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.060444444444444446, "grad_norm": 0.39485183865726337, "kl": 0.0012559890747070312, "learning_rate": 4.705882352941176e-07, "loss": 0.0, "num_tokens": 2546980.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 1019.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.064, "grad_norm": 0.5277276749582551, "kl": 0.0015764236450195312, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 2696456.0, "reward": 0.01171875, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 1023.203125, "completions/mean_terminated_length": 922.0, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.06755555555555555, "grad_norm": 0.6327836514564609, "kl": 0.0015048980712890625, "learning_rate": 4.999958464872182e-07, "loss": 0.0, "num_tokens": 2846446.0, "reward": 0.01171875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 1023.609375, "completions/mean_terminated_length": 974.0, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 0.07111111111111111, "grad_norm": 0.3191632805099472, "kl": 0.0015163421630859375, "learning_rate": 4.999833860868863e-07, "loss": 0.0, "num_tokens": 2996508.0, "reward": 0.00390625, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 1020.78125, "completions/mean_terminated_length": 612.0, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.07466666666666667, "grad_norm": 0.5895570635639454, "kl": 0.00185394287109375, "learning_rate": 4.999626192130396e-07, "loss": 0.0, "num_tokens": 3146148.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 1017.4453125, "completions/mean_terminated_length": 185.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.07822222222222222, "grad_norm": 0.8536716379125333, "kl": 0.002094268798828125, "learning_rate": 4.99933546555722e-07, "loss": -0.0036, "num_tokens": 3295353.0, "reward": 0.0234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 1022.59375, "completions/mean_terminated_length": 844.0, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.08177777777777778, "grad_norm": 0.761179697351255, "kl": 0.0025005340576171875, "learning_rate": 4.998961690809627e-07, "loss": -0.0012, "num_tokens": 3445333.0, "reward": 0.015625, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.08533333333333333, "grad_norm": 0.9164427994280226, "kl": 0.0030078887939453125, "learning_rate": 4.998504880307444e-07, "loss": 0.0, "num_tokens": 3595429.0, "reward": 0.03125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 1021.15625, "completions/mean_terminated_length": 660.0, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.08888888888888889, "grad_norm": 0.49620338336363207, "kl": 0.0037441253662109375, "learning_rate": 4.997965049229614e-07, "loss": 0.0, "num_tokens": 3745113.0, "reward": 0.01171875, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.09244444444444444, "grad_norm": 0.9470188431084309, "kl": 0.0042877197265625, "learning_rate": 4.997342215513703e-07, "loss": 0.0, "num_tokens": 3895201.0, "reward": 0.0234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 1017.71875, "completions/mean_terminated_length": 622.0, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.096, "grad_norm": 0.9027646554963968, "kl": 0.005214691162109375, "learning_rate": 4.99663639985529e-07, "loss": -0.0028, "num_tokens": 4044461.0, "reward": 0.0234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 1016.2734375, "completions/mean_terminated_length": 694.3333740234375, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.09955555555555555, "grad_norm": 1.1212987874003704, "kl": 0.0063610076904296875, "learning_rate": 4.995847625707292e-07, "loss": -0.0012, "num_tokens": 4193588.0, "reward": 0.03515625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.10311111111111111, "grad_norm": 0.5105677644040013, "kl": 0.0079345703125, "learning_rate": 4.994975919279175e-07, "loss": 0.0, "num_tokens": 4343632.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.10666666666666667, "grad_norm": 0.40495448135577117, "kl": 0.010833740234375, "learning_rate": 4.994021309536092e-07, "loss": 0.0, "num_tokens": 4493784.0, "reward": 0.01171875, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 1022.296875, "completions/mean_terminated_length": 806.0, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.11022222222222222, "grad_norm": 1.1342927264328857, "kl": 0.010311126708984375, "learning_rate": 4.992983828197911e-07, "loss": 0.0, "num_tokens": 4643598.0, "reward": 0.0234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.11377777777777778, "grad_norm": 0.7512859872904276, "kl": 0.01190948486328125, "learning_rate": 4.991863509738169e-07, "loss": 0.0, "num_tokens": 4793666.0, "reward": 0.04296875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 1019.96875, "completions/mean_terminated_length": 508.0, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.11733333333333333, "grad_norm": 0.9416577312187854, "kl": 0.02165985107421875, "learning_rate": 4.990660391382923e-07, "loss": -0.0021, "num_tokens": 4943234.0, "reward": 0.046875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.12088888888888889, "grad_norm": 0.7545248940723044, "kl": 0.0414276123046875, "learning_rate": 4.989374513109511e-07, "loss": 0.0, "num_tokens": 5093326.0, "reward": 0.0390625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.12444444444444444, "grad_norm": 0.7218584701517031, "kl": 0.0307769775390625, "learning_rate": 4.988005917645229e-07, "loss": 0.0, "num_tokens": 5243446.0, "reward": 0.03515625, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.128, "grad_norm": 0.8075422097889196, "kl": 0.043731689453125, "learning_rate": 4.986554650465906e-07, "loss": 0.0, "num_tokens": 5393610.0, "reward": 0.02734375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13155555555555556, "grad_norm": 0.6475461270291981, "kl": 0.044952392578125, "learning_rate": 4.985020759794397e-07, "loss": 0.0, "num_tokens": 5543710.0, "reward": 0.03515625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.1351111111111111, "grad_norm": 0.754339390837337, "kl": 0.0526123046875, "learning_rate": 4.983404296598978e-07, "loss": 0.0001, "num_tokens": 5693790.0, "reward": 0.06640625, "reward_std": 0.0973757952451706, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13866666666666666, "grad_norm": 0.7041423598747788, "kl": 0.057220458984375, "learning_rate": 4.981705314591655e-07, "loss": 0.0001, "num_tokens": 5843866.0, "reward": 0.05859375, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.14222222222222222, "grad_norm": 0.649800348452034, "kl": 0.037445068359375, "learning_rate": 4.979923870226372e-07, "loss": 0.0, "num_tokens": 5994018.0, "reward": 0.0546875, "reward_std": 0.07767495512962341, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 1021.2578125, "completions/mean_terminated_length": 673.0, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.14577777777777778, "grad_norm": 0.9175028834738074, "kl": 0.047149658203125, "learning_rate": 4.978060022697148e-07, "loss": 0.0, "num_tokens": 6143759.0, "reward": 0.1015625, "reward_std": 0.12323048710823059, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.29262590408325195, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 1021.46875, "completions/mean_terminated_length": 700.0, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.14933333333333335, "grad_norm": 0.8957971400137922, "kl": 0.052337646484375, "learning_rate": 4.976113833936098e-07, "loss": 0.0039, "num_tokens": 6293559.0, "reward": 0.1015625, "reward_std": 0.13169725239276886, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.078125, "rewards/format_reward_func/std": 0.2694226801395416, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15288888888888888, "grad_norm": 2.0122883915276466, "kl": 0.170257568359375, "learning_rate": 4.974085368611381e-07, "loss": 0.0002, "num_tokens": 6443715.0, "reward": 0.0703125, "reward_std": 0.09803006052970886, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.0703125, "rewards/format_reward_func/std": 0.2566775679588318, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15644444444444444, "grad_norm": 7.0766541269315875, "kl": 0.728851318359375, "learning_rate": 4.971974694125051e-07, "loss": 0.0007, "num_tokens": 6593715.0, "reward": 0.140625, "reward_std": 0.1269671469926834, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.1640625, "rewards/format_reward_func/std": 0.371787428855896, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16, "grad_norm": 4.872646650485278, "kl": 0.322509765625, "learning_rate": 4.969781880610813e-07, "loss": 0.0003, "num_tokens": 6743827.0, "reward": 0.1640625, "reward_std": 0.19241680204868317, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.1796875, "rewards/format_reward_func/std": 0.3854354918003082, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16355555555555557, "grad_norm": 1.0088881954713835, "kl": 0.070465087890625, "learning_rate": 4.967507000931702e-07, "loss": 0.0001, "num_tokens": 6893915.0, "reward": 0.1953125, "reward_std": 0.1971469223499298, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.1671111111111111, "grad_norm": 3.180876744240654, "kl": 0.444976806640625, "learning_rate": 4.965150130677651e-07, "loss": 0.0004, "num_tokens": 7043971.0, "reward": 0.19921875, "reward_std": 0.2139805108308792, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.41502299904823303, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17066666666666666, "grad_norm": 1.2194553812166182, "kl": 0.196197509765625, "learning_rate": 4.962711348162987e-07, "loss": 0.0002, "num_tokens": 7194079.0, "reward": 0.1640625, "reward_std": 0.16161686182022095, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.171875, "rewards/format_reward_func/std": 0.3787541687488556, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17422222222222222, "grad_norm": 1.3058480703859512, "kl": 0.093231201171875, "learning_rate": 4.960190734423824e-07, "loss": 0.0001, "num_tokens": 7344127.0, "reward": 0.2109375, "reward_std": 0.2336813509464264, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.434714138507843, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17777777777777778, "grad_norm": 0.8822642087267891, "kl": 0.10333251953125, "learning_rate": 4.957588373215373e-07, "loss": 0.0001, "num_tokens": 7494299.0, "reward": 0.14453125, "reward_std": 0.15854540467262268, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18133333333333335, "grad_norm": 5.708085997498834, "kl": 0.7578125, "learning_rate": 4.954904351009156e-07, "loss": 0.0008, "num_tokens": 7644343.0, "reward": 0.2421875, "reward_std": 0.24566304683685303, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.4513758420944214, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 1020.109375, "completions/mean_terminated_length": 526.0, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.18488888888888888, "grad_norm": 1.323493532471514, "kl": 0.12872314453125, "learning_rate": 4.952138756990142e-07, "loss": 0.0001, "num_tokens": 7793957.0, "reward": 0.19921875, "reward_std": 0.19263195991516113, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.265625, "rewards/format_reward_func/std": 0.44340085983276367, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18844444444444444, "grad_norm": 1.4983556320698812, "kl": 0.248046875, "learning_rate": 4.949291683053768e-07, "loss": 0.0002, "num_tokens": 7943977.0, "reward": 0.2109375, "reward_std": 0.22092357277870178, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.434714138507843, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.192, "grad_norm": 2.4977361409936805, "kl": 0.43695068359375, "learning_rate": 4.946363223802901e-07, "loss": 0.0004, "num_tokens": 8094081.0, "reward": 0.2734375, "reward_std": 0.211696058511734, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.3203125, "rewards/format_reward_func/std": 0.4684300124645233, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 1022.2265625, "completions/mean_terminated_length": 797.0, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.19555555555555557, "grad_norm": 1.6221417605008561, "kl": 0.24462890625, "learning_rate": 4.943353476544681e-07, "loss": -0.0012, "num_tokens": 8243906.0, "reward": 0.23046875, "reward_std": 0.19329938292503357, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.3046875, "rewards/format_reward_func/std": 0.46208351850509644, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.1991111111111111, "grad_norm": 2.6760159527432683, "kl": 0.376220703125, "learning_rate": 4.940262541287302e-07, "loss": 0.0004, "num_tokens": 8393978.0, "reward": 0.21875, "reward_std": 0.18384575843811035, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.4513758420944214, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20266666666666666, "grad_norm": 22.524324011242694, "kl": 2.3280029296875, "learning_rate": 4.937090520736671e-07, "loss": 0.0023, "num_tokens": 8544074.0, "reward": 0.1953125, "reward_std": 0.22807088494300842, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.2734375, "rewards/format_reward_func/std": 0.447474867105484, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20622222222222222, "grad_norm": 6.300577551599378, "kl": 0.8486328125, "learning_rate": 4.933837520293017e-07, "loss": 0.0008, "num_tokens": 8694182.0, "reward": 0.22265625, "reward_std": 0.20167279243469238, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.45867621898651123, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20977777777777779, "grad_norm": 28.006158731494125, "kl": 0.86181640625, "learning_rate": 4.930503648047367e-07, "loss": 0.0009, "num_tokens": 8844278.0, "reward": 0.2734375, "reward_std": 0.2949552536010742, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.3515625, "rewards/format_reward_func/std": 0.4793342351913452, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21333333333333335, "grad_norm": 9.33584965819257, "kl": 0.4361572265625, "learning_rate": 4.927089014777972e-07, "loss": 0.0004, "num_tokens": 8994330.0, "reward": 0.28125, "reward_std": 0.24258936941623688, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.3671875, "rewards/format_reward_func/std": 0.4839322865009308, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21688888888888888, "grad_norm": 6.303532871380138, "kl": 0.6859130859375, "learning_rate": 4.923593733946614e-07, "loss": 0.0007, "num_tokens": 9144434.0, "reward": 0.296875, "reward_std": 0.21597611904144287, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.421875, "rewards/format_reward_func/std": 0.4957992732524872, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22044444444444444, "grad_norm": 8.57852289803169, "kl": 1.74853515625, "learning_rate": 4.920017921694841e-07, "loss": 0.0018, "num_tokens": 9294478.0, "reward": 0.2890625, "reward_std": 0.20475518703460693, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.3671875, "rewards/format_reward_func/std": 0.4839322865009308, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.224, "grad_norm": 3.8660166772097195, "kl": 0.3358154296875, "learning_rate": 4.91636169684011e-07, "loss": 0.0003, "num_tokens": 9444610.0, "reward": 0.2734375, "reward_std": 0.23609855771064758, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.3828125, "rewards/format_reward_func/std": 0.4879830479621887, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22755555555555557, "grad_norm": 12.93870306614523, "kl": 1.1669921875, "learning_rate": 4.912625180871833e-07, "loss": 0.0012, "num_tokens": 9594650.0, "reward": 0.25390625, "reward_std": 0.23785054683685303, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4860251843929291, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2311111111111111, "grad_norm": 7.511898616426601, "kl": 0.55322265625, "learning_rate": 4.908808497947346e-07, "loss": 0.0006, "num_tokens": 9744694.0, "reward": 0.33984375, "reward_std": 0.27659574151039124, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.4453125, "rewards/format_reward_func/std": 0.4989531338214874, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23466666666666666, "grad_norm": 1.8438447972930012, "kl": 0.2886962890625, "learning_rate": 4.904911774887779e-07, "loss": 0.0003, "num_tokens": 9894774.0, "reward": 0.3515625, "reward_std": 0.24986067414283752, "rewards/equation_reward_func/mean": 0.2890625, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.4140625, "rewards/format_reward_func/std": 0.49449479579925537, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 1021.21875, "completions/mean_terminated_length": 668.0, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.23822222222222222, "grad_norm": 1.4441757047110426, "kl": 0.230224609375, "learning_rate": 4.900935141173842e-07, "loss": 0.0016, "num_tokens": 10044558.0, "reward": 0.234375, "reward_std": 0.21178939938545227, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.3359375, "rewards/format_reward_func/std": 0.47417303919792175, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24177777777777779, "grad_norm": 1.6691952805088401, "kl": 0.329833984375, "learning_rate": 4.896878728941531e-07, "loss": 0.0003, "num_tokens": 10194698.0, "reward": 0.203125, "reward_std": 0.20396818220615387, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.3046875, "rewards/format_reward_func/std": 0.46208351850509644, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24533333333333332, "grad_norm": 13.720929458457615, "kl": 3.0303955078125, "learning_rate": 4.892742672977722e-07, "loss": 0.003, "num_tokens": 10344786.0, "reward": 0.3203125, "reward_std": 0.25942516326904297, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.421875, "rewards/format_reward_func/std": 0.4957992732524872, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24888888888888888, "grad_norm": 2.4164360870318733, "kl": 0.5142822265625, "learning_rate": 4.888527110715709e-07, "loss": 0.0005, "num_tokens": 10494850.0, "reward": 0.29296875, "reward_std": 0.2201562374830246, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4860251843929291, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 1020.3125, "completions/mean_terminated_length": 552.0, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.25244444444444447, "grad_norm": 1.7293321306646259, "kl": 0.32080078125, "learning_rate": 4.884232182230623e-07, "loss": -0.003, "num_tokens": 10644490.0, "reward": 0.234375, "reward_std": 0.23798327147960663, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.328125, "rewards/format_reward_func/std": 0.4713755249977112, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.256, "grad_norm": 1.115711151199567, "kl": 0.373046875, "learning_rate": 4.879858030234789e-07, "loss": 0.0004, "num_tokens": 10794594.0, "reward": 0.28125, "reward_std": 0.17779618501663208, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.47682511806488037, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25955555555555554, "grad_norm": 24.35412756965178, "kl": 2.08587646484375, "learning_rate": 4.875404800072976e-07, "loss": 0.0021, "num_tokens": 10944666.0, "reward": 0.2109375, "reward_std": 0.2127828449010849, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4653336703777313, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26311111111111113, "grad_norm": 1554.2762073710478, "kl": 112.7879638671875, "learning_rate": 4.870872639717572e-07, "loss": 0.1126, "num_tokens": 11094806.0, "reward": 0.17578125, "reward_std": 0.2021140456199646, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.4513758420944214, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26666666666666666, "grad_norm": 1.1410728341145782, "kl": 0.2967529296875, "learning_rate": 4.866261699763664e-07, "loss": 0.0003, "num_tokens": 11244874.0, "reward": 0.24609375, "reward_std": 0.2407177835702896, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.3046875, "rewards/format_reward_func/std": 0.46208351850509644, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2702222222222222, "grad_norm": 6.430383302085185, "kl": 1.0128173828125, "learning_rate": 4.861572133424035e-07, "loss": 0.001, "num_tokens": 11394958.0, "reward": 0.28515625, "reward_std": 0.2505149245262146, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4860251843929291, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2737777777777778, "grad_norm": 7.0576042073325, "kl": 1.8004150390625, "learning_rate": 4.856804096524078e-07, "loss": 0.0018, "num_tokens": 11545110.0, "reward": 0.22265625, "reward_std": 0.16942934691905975, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.45867621898651123, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2773333333333333, "grad_norm": 56.11950857705648, "kl": 7.319580078125, "learning_rate": 4.851957747496606e-07, "loss": 0.0073, "num_tokens": 11695202.0, "reward": 0.1953125, "reward_std": 0.19627749919891357, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.2421875, "rewards/format_reward_func/std": 0.4300905168056488, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2808888888888889, "grad_norm": 1.1894403579564377, "kl": 0.2705078125, "learning_rate": 4.847033247376605e-07, "loss": 0.0003, "num_tokens": 11845266.0, "reward": 0.16796875, "reward_std": 0.20112939178943634, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.2421875, "rewards/format_reward_func/std": 0.4300905168056488, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28444444444444444, "grad_norm": 15.58649087673573, "kl": 5.33154296875, "learning_rate": 4.842030759795866e-07, "loss": 0.0053, "num_tokens": 11995326.0, "reward": 0.20703125, "reward_std": 0.19561228156089783, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.2890625, "rewards/format_reward_func/std": 0.45510825514793396, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.288, "grad_norm": 1.2211524541098628, "kl": 0.345703125, "learning_rate": 4.836950450977558e-07, "loss": 0.0003, "num_tokens": 12145330.0, "reward": 0.28515625, "reward_std": 0.2513952851295471, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.47682511806488037, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29155555555555557, "grad_norm": 11.595035432688796, "kl": 0.9814453125, "learning_rate": 4.831792489730703e-07, "loss": 0.001, "num_tokens": 12295462.0, "reward": 0.171875, "reward_std": 0.16823168098926544, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2951111111111111, "grad_norm": 1.1924550929402917, "kl": 0.3919677734375, "learning_rate": 4.826557047444563e-07, "loss": 0.0004, "num_tokens": 12445498.0, "reward": 0.14453125, "reward_std": 0.18077430129051208, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2986666666666667, "grad_norm": 21.230632420638482, "kl": 3.0062255859375, "learning_rate": 4.821244298082951e-07, "loss": 0.003, "num_tokens": 12595542.0, "reward": 0.22265625, "reward_std": 0.22652746737003326, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.2734375, "rewards/format_reward_func/std": 0.447474867105484, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3022222222222222, "grad_norm": 1.0369057223186056, "kl": 0.2333984375, "learning_rate": 4.815854418178445e-07, "loss": 0.0002, "num_tokens": 12745614.0, "reward": 0.19140625, "reward_std": 0.20221619307994843, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.2578125, "rewards/format_reward_func/std": 0.43914902210235596, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30577777777777776, "grad_norm": 1.0278344273119406, "kl": 0.1983642578125, "learning_rate": 4.810387586826527e-07, "loss": 0.0002, "num_tokens": 12895698.0, "reward": 0.2734375, "reward_std": 0.2464390993118286, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.34375, "rewards/format_reward_func/std": 0.47682511806488037, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30933333333333335, "grad_norm": 9.38166565142255, "kl": 1.9534912109375, "learning_rate": 4.804843985679626e-07, "loss": 0.002, "num_tokens": 13045762.0, "reward": 0.171875, "reward_std": 0.2152000367641449, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.2109375, "rewards/format_reward_func/std": 0.4095771610736847, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3128888888888889, "grad_norm": 1.1768601924784283, "kl": 0.3209228515625, "learning_rate": 4.799223798941089e-07, "loss": 0.0003, "num_tokens": 13195926.0, "reward": 0.16015625, "reward_std": 0.18319149315357208, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.41502299904823303, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3164444444444444, "grad_norm": 0.6415947722993769, "kl": 0.1815185546875, "learning_rate": 4.793527213359058e-07, "loss": 0.0002, "num_tokens": 13346058.0, "reward": 0.1015625, "reward_std": 0.1392945945262909, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3320184051990509, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32, "grad_norm": 1.1209935354079488, "kl": 0.1925048828125, "learning_rate": 4.787754418220257e-07, "loss": 0.0002, "num_tokens": 13496054.0, "reward": 0.171875, "reward_std": 0.21861067414283752, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.2265625, "rewards/format_reward_func/std": 0.4202519655227661, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32355555555555554, "grad_norm": 1.1350649820782601, "kl": 0.2437744140625, "learning_rate": 4.781905605343716e-07, "loss": 0.0002, "num_tokens": 13646078.0, "reward": 0.16796875, "reward_std": 0.1717531979084015, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.234375, "rewards/format_reward_func/std": 0.42527204751968384, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32711111111111113, "grad_norm": 0.8036131940570752, "kl": 0.2301025390625, "learning_rate": 4.775980969074385e-07, "loss": 0.0002, "num_tokens": 13796150.0, "reward": 0.17578125, "reward_std": 0.1618429571390152, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.203125, "rewards/format_reward_func/std": 0.40390563011169434, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33066666666666666, "grad_norm": 2.5669796312022144, "kl": 0.4324951171875, "learning_rate": 4.769980706276687e-07, "loss": 0.0004, "num_tokens": 13946262.0, "reward": 0.203125, "reward_std": 0.2259906381368637, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.2578125, "rewards/format_reward_func/std": 0.43914902210235596, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3342222222222222, "grad_norm": 1.5566319365843342, "kl": 0.2623291015625, "learning_rate": 4.7639050163279646e-07, "loss": 0.0003, "num_tokens": 14096386.0, "reward": 0.14453125, "reward_std": 0.1585453897714615, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.1796875, "rewards/format_reward_func/std": 0.3854354918003082, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3377777777777778, "grad_norm": 0.9827466599698991, "kl": 0.2091064453125, "learning_rate": 4.757754101111867e-07, "loss": 0.0002, "num_tokens": 14246454.0, "reward": 0.2109375, "reward_std": 0.2073678970336914, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.2734375, "rewards/format_reward_func/std": 0.447474867105484, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3413333333333333, "grad_norm": 0.8474200819506911, "kl": 0.1885986328125, "learning_rate": 4.751528165011633e-07, "loss": 0.0002, "num_tokens": 14396482.0, "reward": 0.1953125, "reward_std": 0.19494709372520447, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.434714138507843, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3448888888888889, "grad_norm": 0.8935139225334848, "kl": 0.19091796875, "learning_rate": 4.7452274149033036e-07, "loss": 0.0002, "num_tokens": 14546506.0, "reward": 0.22265625, "reward_std": 0.1896713674068451, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.2734375, "rewards/format_reward_func/std": 0.447474867105484, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.34844444444444445, "grad_norm": 0.8908823710522052, "kl": 0.1903076171875, "learning_rate": 4.738852060148848e-07, "loss": 0.0002, "num_tokens": 14696570.0, "reward": 0.16015625, "reward_std": 0.17767438292503357, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.352, "grad_norm": 0.8940407168561494, "kl": 0.15765380859375, "learning_rate": 4.7324023125892067e-07, "loss": 0.0002, "num_tokens": 14846678.0, "reward": 0.13671875, "reward_std": 0.16569268703460693, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.2109375, "rewards/format_reward_func/std": 0.4095771610736847, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35555555555555557, "grad_norm": 1.0372146373302493, "kl": 0.2196044921875, "learning_rate": 4.7258783865372496e-07, "loss": 0.0002, "num_tokens": 14996674.0, "reward": 0.296875, "reward_std": 0.29111427068710327, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.3515625, "rewards/format_reward_func/std": 0.4793342351913452, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3591111111111111, "grad_norm": 3.8175886041475904, "kl": 0.525634765625, "learning_rate": 4.719280498770659e-07, "loss": 0.0005, "num_tokens": 15146770.0, "reward": 0.265625, "reward_std": 0.1947406530380249, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.45867621898651123, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3626666666666667, "grad_norm": 1.7543264789310198, "kl": 0.295166015625, "learning_rate": 4.712608868524726e-07, "loss": 0.0003, "num_tokens": 15296818.0, "reward": 0.1796875, "reward_std": 0.18956050276756287, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.28125, "rewards/format_reward_func/std": 0.4513758420944214, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3662222222222222, "grad_norm": 1.0227525620642117, "kl": 0.20050048828125, "learning_rate": 4.70586371748506e-07, "loss": 0.0002, "num_tokens": 15446870.0, "reward": 0.2578125, "reward_std": 0.19561007618904114, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.45867621898651123, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36977777777777776, "grad_norm": 0.9640594800874314, "kl": 0.2506103515625, "learning_rate": 4.699045269780232e-07, "loss": 0.0003, "num_tokens": 15596974.0, "reward": 0.25390625, "reward_std": 0.21409572660923004, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.3203125, "rewards/format_reward_func/std": 0.4684300124645233, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37333333333333335, "grad_norm": 0.9924489289475691, "kl": 0.281982421875, "learning_rate": 4.692153751974318e-07, "loss": 0.0003, "num_tokens": 15747106.0, "reward": 0.2421875, "reward_std": 0.1833132803440094, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.328125, "rewards/format_reward_func/std": 0.4713755249977112, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3768888888888889, "grad_norm": 1.1958906934430737, "kl": 0.2791748046875, "learning_rate": 4.685189393059377e-07, "loss": 0.0003, "num_tokens": 15897182.0, "reward": 0.28515625, "reward_std": 0.2538124918937683, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.453125, "rewards/format_reward_func/std": 0.4997538626194, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3804444444444444, "grad_norm": 4.660458310479511, "kl": 0.49755859375, "learning_rate": 4.6781524244478374e-07, "loss": 0.0005, "num_tokens": 16047302.0, "reward": 0.31640625, "reward_std": 0.23313136398792267, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.5009832978248596, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.384, "grad_norm": 1.2324512257183518, "kl": 0.3973388671875, "learning_rate": 4.6710430799648143e-07, "loss": 0.0004, "num_tokens": 16197302.0, "reward": 0.3125, "reward_std": 0.19484493136405945, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.4765625, "rewards/format_reward_func/std": 0.5014128684997559, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38755555555555554, "grad_norm": 1.2549179396404038, "kl": 0.2296142578125, "learning_rate": 4.663861595840332e-07, "loss": 0.0002, "num_tokens": 16347306.0, "reward": 0.390625, "reward_std": 0.24787381291389465, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.4930621087551117, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39111111111111113, "grad_norm": 1.078204553596969, "kl": 0.3565673828125, "learning_rate": 4.6566082107014795e-07, "loss": 0.0004, "num_tokens": 16497418.0, "reward": 0.27734375, "reward_std": 0.22730353474617004, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.49802759289741516, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39466666666666667, "grad_norm": 13.095496410143678, "kl": 1.576416015625, "learning_rate": 4.649283165564479e-07, "loss": 0.0016, "num_tokens": 16647462.0, "reward": 0.30859375, "reward_std": 0.19462978839874268, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.4921875, "rewards/format_reward_func/std": 0.5019033551216125, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3982222222222222, "grad_norm": 1.1570771083116458, "kl": 0.3052978515625, "learning_rate": 4.6418867038266807e-07, "loss": 0.0003, "num_tokens": 16797554.0, "reward": 0.3046875, "reward_std": 0.227077454328537, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.5009832978248596, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4017777777777778, "grad_norm": 1.2112330269604161, "kl": 0.267578125, "learning_rate": 4.6344190712584713e-07, "loss": 0.0003, "num_tokens": 16947698.0, "reward": 0.3671875, "reward_std": 0.2886773347854614, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.5546875, "rewards/format_reward_func/std": 0.4989531338214874, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4053333333333333, "grad_norm": 1.0237828322135445, "kl": 0.248291015625, "learning_rate": 4.6268805159951086e-07, "loss": 0.0002, "num_tokens": 17097782.0, "reward": 0.33984375, "reward_std": 0.22488634288311005, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4997538626194, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4088888888888889, "grad_norm": 3.3025894103980704, "kl": 0.5584716796875, "learning_rate": 4.619271288528478e-07, "loss": 0.0006, "num_tokens": 17247882.0, "reward": 0.27734375, "reward_std": 0.2406156361103058, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.453125, "rewards/format_reward_func/std": 0.4997538626194, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41244444444444445, "grad_norm": 4.1382469339470225, "kl": 0.8095703125, "learning_rate": 4.611591641698768e-07, "loss": 0.0008, "num_tokens": 17397994.0, "reward": 0.28125, "reward_std": 0.1947515904903412, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.5009832978248596, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.416, "grad_norm": 15.502900105571765, "kl": 3.479736328125, "learning_rate": 4.6038418306860695e-07, "loss": 0.0035, "num_tokens": 17548122.0, "reward": 0.33203125, "reward_std": 0.2296164482831955, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.49802759289741516, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41955555555555557, "grad_norm": 1.2758988778808227, "kl": 0.275390625, "learning_rate": 4.596022113001894e-07, "loss": 0.0003, "num_tokens": 17698266.0, "reward": 0.375, "reward_std": 0.203322634100914, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.49802759289741516, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4231111111111111, "grad_norm": 1.1770866594193163, "kl": 0.3016357421875, "learning_rate": 4.58813274848062e-07, "loss": 0.0003, "num_tokens": 17848314.0, "reward": 0.37890625, "reward_std": 0.23390743136405945, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.49802759289741516, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4266666666666667, "grad_norm": 2.4545856054451156, "kl": 0.7242431640625, "learning_rate": 4.5801739992708604e-07, "loss": 0.0007, "num_tokens": 17998410.0, "reward": 0.31640625, "reward_std": 0.208717942237854, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.5078125, "rewards/format_reward_func/std": 0.5019033551216125, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43022222222222223, "grad_norm": 1.4734406735109156, "kl": 0.3726806640625, "learning_rate": 4.572146129826746e-07, "loss": 0.0004, "num_tokens": 18148458.0, "reward": 0.46875, "reward_std": 0.23896577954292297, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.6953125, "rewards/format_reward_func/std": 0.46208351850509644, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43377777777777776, "grad_norm": 5.758478512827765, "kl": 1.009033203125, "learning_rate": 4.5640494068991454e-07, "loss": 0.001, "num_tokens": 18298598.0, "reward": 0.32421875, "reward_std": 0.20069028437137604, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.5234375, "rewards/format_reward_func/std": 0.5014128684997559, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43733333333333335, "grad_norm": 1.1248450015614464, "kl": 0.288330078125, "learning_rate": 4.555884099526793e-07, "loss": 0.0003, "num_tokens": 18448610.0, "reward": 0.40234375, "reward_std": 0.20794187486171722, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.6171875, "rewards/format_reward_func/std": 0.4879830479621887, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4408888888888889, "grad_norm": 1.117208300016219, "kl": 0.271728515625, "learning_rate": 4.547650479027361e-07, "loss": 0.0003, "num_tokens": 18598694.0, "reward": 0.41015625, "reward_std": 0.20739847421646118, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.609375, "rewards/format_reward_func/std": 0.4898075461387634, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4444444444444444, "grad_norm": 1.212716720763158, "kl": 0.28369140625, "learning_rate": 4.53934881898843e-07, "loss": 0.0003, "num_tokens": 18748774.0, "reward": 0.3828125, "reward_std": 0.2037726789712906, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.448, "grad_norm": 1.4912885010455994, "kl": 0.28466796875, "learning_rate": 4.5309793952584095e-07, "loss": 0.0003, "num_tokens": 18898894.0, "reward": 0.30078125, "reward_std": 0.2230125367641449, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.4921875, "rewards/format_reward_func/std": 0.5019033551216125, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45155555555555554, "grad_norm": 1.6085493649140328, "kl": 0.369873046875, "learning_rate": 4.5225424859373684e-07, "loss": 0.0004, "num_tokens": 19048970.0, "reward": 0.35546875, "reward_std": 0.21158519387245178, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45511111111111113, "grad_norm": 8.1136624800545, "kl": 1.5572509765625, "learning_rate": 4.514038371367791e-07, "loss": 0.0016, "num_tokens": 19199078.0, "reward": 0.42578125, "reward_std": 0.19231687486171722, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.6953125, "rewards/format_reward_func/std": 0.46208351850509644, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45866666666666667, "grad_norm": 1.4312492558258578, "kl": 0.3763427734375, "learning_rate": 4.5054673341252657e-07, "loss": 0.0004, "num_tokens": 19349186.0, "reward": 0.34765625, "reward_std": 0.1864890456199646, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.5859375, "rewards/format_reward_func/std": 0.49449479579925537, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4622222222222222, "grad_norm": 2.962105238864389, "kl": 0.706787109375, "learning_rate": 4.496829659009095e-07, "loss": 0.0007, "num_tokens": 19499346.0, "reward": 0.34765625, "reward_std": 0.18802587687969208, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.4930621087551117, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4657777777777778, "grad_norm": 354.1491510027472, "kl": 56.3818359375, "learning_rate": 4.488125633032831e-07, "loss": 0.0565, "num_tokens": 19649510.0, "reward": 0.33984375, "reward_std": 0.21212857961654663, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.578125, "rewards/format_reward_func/std": 0.4957992732524872, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4693333333333333, "grad_norm": 174.51099175867512, "kl": 9.54150390625, "learning_rate": 4.479355545414738e-07, "loss": 0.0096, "num_tokens": 19799590.0, "reward": 0.41796875, "reward_std": 0.19462977349758148, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.6640625, "rewards/format_reward_func/std": 0.47417303919792175, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4728888888888889, "grad_norm": 1.397870689490664, "kl": 0.516845703125, "learning_rate": 4.470519687568185e-07, "loss": 0.0005, "num_tokens": 19949690.0, "reward": 0.38671875, "reward_std": 0.18506528437137604, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.609375, "rewards/format_reward_func/std": 0.4898075461387634, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47644444444444445, "grad_norm": 1.0641630797189836, "kl": 0.30322265625, "learning_rate": 4.4616183530919604e-07, "loss": 0.0003, "num_tokens": 20099654.0, "reward": 0.4609375, "reward_std": 0.17670938372612, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.734375, "rewards/format_reward_func/std": 0.44340085983276367, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48, "grad_norm": 6.158216503528497, "kl": 1.213623046875, "learning_rate": 4.452651837760515e-07, "loss": 0.0012, "num_tokens": 20249794.0, "reward": 0.42578125, "reward_std": 0.17373128235340118, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.47682511806488037, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48355555555555557, "grad_norm": 1.3444116770940566, "kl": 0.523681640625, "learning_rate": 4.443620439514138e-07, "loss": 0.0005, "num_tokens": 20399882.0, "reward": 0.3984375, "reward_std": 0.16162778437137604, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.6953125, "rewards/format_reward_func/std": 0.46208351850509644, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4871111111111111, "grad_norm": 1.531233557438578, "kl": 0.55908203125, "learning_rate": 4.4345244584490535e-07, "loss": 0.0006, "num_tokens": 20550006.0, "reward": 0.48046875, "reward_std": 0.21245458722114563, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.6796875, "rewards/format_reward_func/std": 0.4684300124645233, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49066666666666664, "grad_norm": 7.99434662228326, "kl": 1.595458984375, "learning_rate": 4.4253641968074505e-07, "loss": 0.0016, "num_tokens": 20700002.0, "reward": 0.43359375, "reward_std": 0.20639410614967346, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.671875, "rewards/format_reward_func/std": 0.4713755249977112, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49422222222222223, "grad_norm": 3.051331037407785, "kl": 0.939697265625, "learning_rate": 4.41613995896744e-07, "loss": 0.0009, "num_tokens": 20850102.0, "reward": 0.3671875, "reward_std": 0.19056487083435059, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.640625, "rewards/format_reward_func/std": 0.481702595949173, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49777777777777776, "grad_norm": 4.814397537365208, "kl": 1.9078369140625, "learning_rate": 4.40685205143294e-07, "loss": 0.0019, "num_tokens": 21000234.0, "reward": 0.375, "reward_std": 0.1520632803440094, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.6171875, "rewards/format_reward_func/std": 0.4879830479621887, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5013333333333333, "grad_norm": 3.2517027439890924, "kl": 0.801025390625, "learning_rate": 4.3975007828234914e-07, "loss": 0.0008, "num_tokens": 21150326.0, "reward": 0.43359375, "reward_std": 0.2021140456199646, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.703125, "rewards/format_reward_func/std": 0.45867621898651123, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5048888888888889, "grad_norm": 1.663987879451297, "kl": 0.525634765625, "learning_rate": 4.3880864638640035e-07, "loss": 0.0005, "num_tokens": 21300382.0, "reward": 0.4609375, "reward_std": 0.17912657558918, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.7421875, "rewards/format_reward_func/std": 0.43914902210235596, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5084444444444445, "grad_norm": 1.6351203503838305, "kl": 0.519287109375, "learning_rate": 4.37860940737443e-07, "loss": 0.0005, "num_tokens": 21450566.0, "reward": 0.26171875, "reward_std": 0.21883678436279297, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.5078125, "rewards/format_reward_func/std": 0.5019033551216125, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.512, "grad_norm": 1.366258610631795, "kl": 0.574462890625, "learning_rate": 4.3690699282593723e-07, "loss": 0.0006, "num_tokens": 21600694.0, "reward": 0.4140625, "reward_std": 0.22181487083435059, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.671875, "rewards/format_reward_func/std": 0.4713755249977112, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5155555555555555, "grad_norm": 1.2979528663337039, "kl": 0.371826171875, "learning_rate": 4.3594683434976186e-07, "loss": 0.0004, "num_tokens": 21750818.0, "reward": 0.41015625, "reward_std": 0.23919188976287842, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.6328125, "rewards/format_reward_func/std": 0.4839322865009308, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5191111111111111, "grad_norm": 13.971745146487256, "kl": 2.259521484375, "learning_rate": 4.3498049721316087e-07, "loss": 0.0023, "num_tokens": 21900886.0, "reward": 0.48828125, "reward_std": 0.18275237083435059, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.6953125, "rewards/format_reward_func/std": 0.46208351850509644, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5226666666666666, "grad_norm": 1.5151109592881105, "kl": 0.50927734375, "learning_rate": 4.340080135256835e-07, "loss": 0.0005, "num_tokens": 22050994.0, "reward": 0.375, "reward_std": 0.1720726490020752, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.609375, "rewards/format_reward_func/std": 0.4898075461387634, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5262222222222223, "grad_norm": 3.565613250452234, "kl": 1.1435546875, "learning_rate": 4.3302941560111716e-07, "loss": 0.0011, "num_tokens": 22201058.0, "reward": 0.44140625, "reward_std": 0.1594257354736328, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4653336703777313, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5297777777777778, "grad_norm": 1.1082031850469054, "kl": 0.389892578125, "learning_rate": 4.3204473595641367e-07, "loss": 0.0004, "num_tokens": 22351074.0, "reward": 0.5, "reward_std": 0.17966999113559723, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.765625, "rewards/format_reward_func/std": 0.42527204751968384, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5333333333333333, "grad_norm": 29.666704145228632, "kl": 5.103271484375, "learning_rate": 4.3105400731060896e-07, "loss": 0.0051, "num_tokens": 22501194.0, "reward": 0.44140625, "reward_std": 0.161842942237854, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.434714138507843, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5368888888888889, "grad_norm": 72.53647797594527, "kl": 2.019775390625, "learning_rate": 4.300572625837359e-07, "loss": 0.002, "num_tokens": 22651314.0, "reward": 0.3828125, "reward_std": 0.20705929398536682, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.7109375, "rewards/format_reward_func/std": 0.45510825514793396, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5404444444444444, "grad_norm": 1.1753775278093777, "kl": 0.44921875, "learning_rate": 4.2905453489573007e-07, "loss": 0.0004, "num_tokens": 22801386.0, "reward": 0.41015625, "reward_std": 0.15327188372612, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.7265625, "rewards/format_reward_func/std": 0.447474867105484, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.544, "grad_norm": 1.8103430830421683, "kl": 0.861328125, "learning_rate": 4.280458575653296e-07, "loss": 0.0009, "num_tokens": 22951478.0, "reward": 0.4375, "reward_std": 0.16887937486171722, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.7265625, "rewards/format_reward_func/std": 0.447474867105484, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5475555555555556, "grad_norm": 92.17653023102632, "kl": 29.57275390625, "learning_rate": 4.2703126410896815e-07, "loss": 0.0296, "num_tokens": 23101526.0, "reward": 0.421875, "reward_std": 0.2033226490020752, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.7265625, "rewards/format_reward_func/std": 0.447474867105484, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5511111111111111, "grad_norm": 1.8601363913983529, "kl": 0.711669921875, "learning_rate": 4.2601078823966065e-07, "loss": 0.0007, "num_tokens": 23251622.0, "reward": 0.3984375, "reward_std": 0.23224879801273346, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.671875, "rewards/format_reward_func/std": 0.4713755249977112, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5546666666666666, "grad_norm": 1.9655716755995285, "kl": 0.5400390625, "learning_rate": 4.249844638658837e-07, "loss": 0.0005, "num_tokens": 23401694.0, "reward": 0.4140625, "reward_std": 0.1894671618938446, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.71875, "rewards/format_reward_func/std": 0.4513758420944214, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5582222222222222, "grad_norm": 1.2512804150325183, "kl": 0.381103515625, "learning_rate": 4.2395232509044856e-07, "loss": 0.0004, "num_tokens": 23551754.0, "reward": 0.4140625, "reward_std": 0.15502388775348663, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.78125, "rewards/format_reward_func/std": 0.41502299904823303, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5617777777777778, "grad_norm": 4.6864178808609065, "kl": 1.0859375, "learning_rate": 4.229144062093679e-07, "loss": 0.0011, "num_tokens": 23701886.0, "reward": 0.40234375, "reward_std": 0.17801132798194885, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.6796875, "rewards/format_reward_func/std": 0.4684300124645233, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5653333333333334, "grad_norm": 1.2185679157403768, "kl": 0.41064453125, "learning_rate": 4.218707417107166e-07, "loss": 0.0004, "num_tokens": 23852050.0, "reward": 0.37109375, "reward_std": 0.1713140904903412, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.6796875, "rewards/format_reward_func/std": 0.4684300124645233, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5688888888888889, "grad_norm": 1.248032189193752, "kl": 0.443359375, "learning_rate": 4.208213662734852e-07, "loss": 0.0004, "num_tokens": 24002034.0, "reward": 0.48828125, "reward_std": 0.14753741025924683, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.859375, "rewards/format_reward_func/std": 0.3490002751350403, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5724444444444444, "grad_norm": 1.0325417882527244, "kl": 0.435546875, "learning_rate": 4.197663147664281e-07, "loss": 0.0004, "num_tokens": 24152122.0, "reward": 0.44140625, "reward_std": 0.1556890904903412, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.828125, "rewards/format_reward_func/std": 0.3787541687488556, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.576, "grad_norm": 1.1687197449448241, "kl": 0.60693359375, "learning_rate": 4.187056222469046e-07, "loss": 0.0006, "num_tokens": 24302178.0, "reward": 0.5, "reward_std": 0.18526950478553772, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.8515625, "rewards/format_reward_func/std": 0.356930136680603, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5795555555555556, "grad_norm": 0.8560481833926494, "kl": 0.405029296875, "learning_rate": 4.1763932395971433e-07, "loss": 0.0004, "num_tokens": 24452246.0, "reward": 0.51953125, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5831111111111111, "grad_norm": 0.9227050375160734, "kl": 0.361572265625, "learning_rate": 4.1656745533592565e-07, "loss": 0.0004, "num_tokens": 24602314.0, "reward": 0.51953125, "reward_std": 0.112550750374794, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5866666666666667, "grad_norm": 2.9673521250500965, "kl": 0.835693359375, "learning_rate": 4.1549005199169887e-07, "loss": 0.0008, "num_tokens": 24752402.0, "reward": 0.48828125, "reward_std": 0.125758558511734, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.3320184051990509, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5902222222222222, "grad_norm": 59803.04638668722, "kl": 5376.424560546875, "learning_rate": 4.1440714972710245e-07, "loss": 5.3755, "num_tokens": 24902430.0, "reward": 0.50390625, "reward_std": 0.11067695170640945, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.3320184051990509, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5937777777777777, "grad_norm": 0.9872798550394157, "kl": 0.467041015625, "learning_rate": 4.1331878452492366e-07, "loss": 0.0005, "num_tokens": 25052534.0, "reward": 0.53125, "reward_std": 0.135988250374794, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5973333333333334, "grad_norm": 0.9968829039320042, "kl": 0.5625, "learning_rate": 4.122249925494726e-07, "loss": 0.0006, "num_tokens": 25202638.0, "reward": 0.546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6008888888888889, "grad_norm": 1.407994078507014, "kl": 0.69482421875, "learning_rate": 4.111258101453809e-07, "loss": 0.0007, "num_tokens": 25352698.0, "reward": 0.47265625, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.8828125, "rewards/format_reward_func/std": 0.322907418012619, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6044444444444445, "grad_norm": 0.9550324441837219, "kl": 0.371337890625, "learning_rate": 4.10021273836394e-07, "loss": 0.0004, "num_tokens": 25502766.0, "reward": 0.47265625, "reward_std": 0.09077189117670059, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.608, "grad_norm": 0.9530508768607797, "kl": 0.538818359375, "learning_rate": 4.0891142032415717e-07, "loss": 0.0005, "num_tokens": 25652830.0, "reward": 0.4765625, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6115555555555555, "grad_norm": 1.3515658288339731, "kl": 0.4130859375, "learning_rate": 4.0779628648699647e-07, "loss": 0.0004, "num_tokens": 25802866.0, "reward": 0.46484375, "reward_std": 0.11960469186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.8359375, "rewards/format_reward_func/std": 0.371787428855896, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6151111111111112, "grad_norm": 5.062381594275897, "kl": 1.2841796875, "learning_rate": 4.066759093786931e-07, "loss": 0.0013, "num_tokens": 25952918.0, "reward": 0.5703125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6186666666666667, "grad_norm": 1.884658172401711, "kl": 0.83984375, "learning_rate": 4.055503262272521e-07, "loss": 0.0008, "num_tokens": 26103022.0, "reward": 0.5, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6222222222222222, "grad_norm": 0.899477885337483, "kl": 0.36328125, "learning_rate": 4.044195744336656e-07, "loss": 0.0004, "num_tokens": 26253170.0, "reward": 0.50390625, "reward_std": 0.1130007952451706, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.8828125, "rewards/format_reward_func/std": 0.322907418012619, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6257777777777778, "grad_norm": 1.1287658852126066, "kl": 0.556884765625, "learning_rate": 4.0328369157066975e-07, "loss": 0.0006, "num_tokens": 26403294.0, "reward": 0.5, "reward_std": 0.10277109593153, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6293333333333333, "grad_norm": 0.9554300863560559, "kl": 0.439453125, "learning_rate": 4.021427153814965e-07, "loss": 0.0004, "num_tokens": 26553438.0, "reward": 0.49609375, "reward_std": 0.0859375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6328888888888888, "grad_norm": 1.8696085163129879, "kl": 0.6962890625, "learning_rate": 4.009966837786194e-07, "loss": 0.0007, "num_tokens": 26703546.0, "reward": 0.45703125, "reward_std": 0.12014809250831604, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.8515625, "rewards/format_reward_func/std": 0.356930136680603, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6364444444444445, "grad_norm": 0.8445236924167039, "kl": 0.44580078125, "learning_rate": 3.9984563484249355e-07, "loss": 0.0004, "num_tokens": 26853626.0, "reward": 0.46484375, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.64, "grad_norm": 8.514060274236515, "kl": 1.341064453125, "learning_rate": 3.98689606820291e-07, "loss": 0.0013, "num_tokens": 27003610.0, "reward": 0.52734375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6435555555555555, "grad_norm": 1.0918541430694348, "kl": 0.39501953125, "learning_rate": 3.975286381246288e-07, "loss": 0.0004, "num_tokens": 27153798.0, "reward": 0.4609375, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6471111111111111, "grad_norm": 0.6639080655090452, "kl": 0.39892578125, "learning_rate": 3.963627673322936e-07, "loss": 0.0004, "num_tokens": 27303818.0, "reward": 0.55078125, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6506666666666666, "grad_norm": 0.8008104294625259, "kl": 0.34619140625, "learning_rate": 3.951920331829592e-07, "loss": 0.0003, "num_tokens": 27453838.0, "reward": 0.53125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6542222222222223, "grad_norm": 2.4570583505386767, "kl": 0.77587890625, "learning_rate": 3.9401647457789977e-07, "loss": 0.0008, "num_tokens": 27603914.0, "reward": 0.49609375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9140625, "rewards/format_reward_func/std": 0.2813730239868164, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6577777777777778, "grad_norm": 0.7694633251007069, "kl": 0.36328125, "learning_rate": 3.9283613057869683e-07, "loss": 0.0004, "num_tokens": 27754050.0, "reward": 0.5, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6613333333333333, "grad_norm": 49.68125956240344, "kl": 6.21240234375, "learning_rate": 3.9165104040594144e-07, "loss": 0.0062, "num_tokens": 27904122.0, "reward": 0.48828125, "reward_std": 0.11541798710823059, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.3320184051990509, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6648888888888889, "grad_norm": 1.597010810734867, "kl": 0.74072265625, "learning_rate": 3.9046124343793104e-07, "loss": 0.0007, "num_tokens": 28054230.0, "reward": 0.50390625, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6684444444444444, "grad_norm": 0.8478903871951797, "kl": 0.490966796875, "learning_rate": 3.8926677920936093e-07, "loss": 0.0005, "num_tokens": 28204330.0, "reward": 0.53125, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.672, "grad_norm": 0.7803466603833829, "kl": 0.361328125, "learning_rate": 3.880676874100106e-07, "loss": 0.0004, "num_tokens": 28354446.0, "reward": 0.56640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6755555555555556, "grad_norm": 0.6667441813180046, "kl": 0.370361328125, "learning_rate": 3.868640078834251e-07, "loss": 0.0004, "num_tokens": 28504506.0, "reward": 0.53515625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6791111111111111, "grad_norm": 0.9717479349571527, "kl": 0.352783203125, "learning_rate": 3.856557806255907e-07, "loss": 0.0004, "num_tokens": 28654702.0, "reward": 0.50390625, "reward_std": 0.11058359593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9140625, "rewards/format_reward_func/std": 0.2813730239868164, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6826666666666666, "grad_norm": 0.6531105470033437, "kl": 0.4169921875, "learning_rate": 3.844430457836064e-07, "loss": 0.0004, "num_tokens": 28804750.0, "reward": 0.53125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6862222222222222, "grad_norm": 0.7581030751348999, "kl": 0.4326171875, "learning_rate": 3.8322584365434934e-07, "loss": 0.0004, "num_tokens": 28954890.0, "reward": 0.53515625, "reward_std": 0.08548745512962341, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9140625, "rewards/format_reward_func/std": 0.2813730239868164, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6897777777777778, "grad_norm": 0.8123280487022461, "kl": 0.378173828125, "learning_rate": 3.8200421468313646e-07, "loss": 0.0004, "num_tokens": 29104990.0, "reward": 0.515625, "reward_std": 0.07767495512962341, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6933333333333334, "grad_norm": 0.9557167275680298, "kl": 0.458251953125, "learning_rate": 3.807781994623802e-07, "loss": 0.0005, "num_tokens": 29255106.0, "reward": 0.4765625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.8984375, "rewards/format_reward_func/std": 0.3032590448856354, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6968888888888889, "grad_norm": 20.914129417208414, "kl": 2.05908203125, "learning_rate": 3.7954783873023946e-07, "loss": 0.0021, "num_tokens": 29405238.0, "reward": 0.51171875, "reward_std": 0.08416798710823059, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7004444444444444, "grad_norm": 0.8290581326894807, "kl": 0.460693359375, "learning_rate": 3.7831317336926674e-07, "loss": 0.0005, "num_tokens": 29555306.0, "reward": 0.53125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.704, "grad_norm": 0.7450940079039334, "kl": 0.443115234375, "learning_rate": 3.7707424440504863e-07, "loss": 0.0004, "num_tokens": 29705410.0, "reward": 0.48046875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7075555555555556, "grad_norm": 0.7205252163706303, "kl": 0.453125, "learning_rate": 3.758310930048436e-07, "loss": 0.0005, "num_tokens": 29855442.0, "reward": 0.53125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7111111111111111, "grad_norm": 1.0214742583016907, "kl": 0.45263671875, "learning_rate": 3.7458376047621356e-07, "loss": 0.0005, "num_tokens": 30005478.0, "reward": 0.55078125, "reward_std": 0.12443909049034119, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7146666666666667, "grad_norm": 4.352162912273971, "kl": 1.650634765625, "learning_rate": 3.733322882656511e-07, "loss": 0.0016, "num_tokens": 30155550.0, "reward": 0.515625, "reward_std": 0.08427885174751282, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7182222222222222, "grad_norm": 0.8642171883878098, "kl": 0.646728515625, "learning_rate": 3.7207671795720296e-07, "loss": 0.0006, "num_tokens": 30305614.0, "reward": 0.53125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7217777777777777, "grad_norm": 0.9770611026209789, "kl": 0.48193359375, "learning_rate": 3.7081709127108767e-07, "loss": 0.0005, "num_tokens": 30455722.0, "reward": 0.50390625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7253333333333334, "grad_norm": 0.9670302361571421, "kl": 0.5126953125, "learning_rate": 3.695534500623096e-07, "loss": 0.0005, "num_tokens": 30605826.0, "reward": 0.48828125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7288888888888889, "grad_norm": 3.23663467525944, "kl": 0.73828125, "learning_rate": 3.68285836319268e-07, "loss": 0.0007, "num_tokens": 30755898.0, "reward": 0.49609375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7324444444444445, "grad_norm": 6.934307670841274, "kl": 0.99365234375, "learning_rate": 3.6701429216236204e-07, "loss": 0.001, "num_tokens": 30905906.0, "reward": 0.5703125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.736, "grad_norm": 0.8933825229974401, "kl": 0.4169921875, "learning_rate": 3.657388598425908e-07, "loss": 0.0004, "num_tokens": 31055970.0, "reward": 0.5390625, "reward_std": 0.10892495512962341, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7395555555555555, "grad_norm": 0.7876520940886738, "kl": 0.554931640625, "learning_rate": 3.644595817401501e-07, "loss": 0.0006, "num_tokens": 31206030.0, "reward": 0.5390625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7431111111111111, "grad_norm": 0.30620401262295277, "kl": 0.384521484375, "learning_rate": 3.631765003630233e-07, "loss": 0.0004, "num_tokens": 31356098.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7466666666666667, "grad_norm": 0.565341568697795, "kl": 0.37255859375, "learning_rate": 3.6188965834556964e-07, "loss": 0.0004, "num_tokens": 31506174.0, "reward": 0.54296875, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7502222222222222, "grad_norm": 0.7183038048403314, "kl": 0.530029296875, "learning_rate": 3.605990984471073e-07, "loss": 0.0005, "num_tokens": 31656230.0, "reward": 0.53515625, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7537777777777778, "grad_norm": 0.9592998793501326, "kl": 0.6806640625, "learning_rate": 3.5930486355049254e-07, "loss": 0.0007, "num_tokens": 31806246.0, "reward": 0.5625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7573333333333333, "grad_norm": 0.5967010195878546, "kl": 0.388671875, "learning_rate": 3.580069966606949e-07, "loss": 0.0004, "num_tokens": 31956354.0, "reward": 0.52734375, "reward_std": 0.045216359198093414, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7608888888888888, "grad_norm": 0.5984534897529947, "kl": 0.3974609375, "learning_rate": 3.5670554090336804e-07, "loss": 0.0004, "num_tokens": 32106454.0, "reward": 0.515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7644444444444445, "grad_norm": 1.527700749087867, "kl": 0.652587890625, "learning_rate": 3.55400539523417e-07, "loss": 0.0007, "num_tokens": 32256526.0, "reward": 0.53125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.768, "grad_norm": 0.7168370595424849, "kl": 0.469970703125, "learning_rate": 3.5409203588356096e-07, "loss": 0.0005, "num_tokens": 32406606.0, "reward": 0.546875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7715555555555556, "grad_norm": 0.6937218726790086, "kl": 0.48388671875, "learning_rate": 3.527800734628927e-07, "loss": 0.0005, "num_tokens": 32556662.0, "reward": 0.55859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7751111111111111, "grad_norm": 0.5625106257141403, "kl": 0.36279296875, "learning_rate": 3.5146469585543386e-07, "loss": 0.0004, "num_tokens": 32706774.0, "reward": 0.546875, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7786666666666666, "grad_norm": 1.8811641758087445, "kl": 0.4970703125, "learning_rate": 3.501459467686859e-07, "loss": 0.0005, "num_tokens": 32856802.0, "reward": 0.5390625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7822222222222223, "grad_norm": 0.717878846553723, "kl": 0.44091796875, "learning_rate": 3.4882387002217837e-07, "loss": 0.0004, "num_tokens": 33006962.0, "reward": 0.51171875, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7857777777777778, "grad_norm": 4.888546951880972, "kl": 1.05029296875, "learning_rate": 3.474985095460127e-07, "loss": 0.0011, "num_tokens": 33157078.0, "reward": 0.5625, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7893333333333333, "grad_norm": 0.9668364386663721, "kl": 0.631591796875, "learning_rate": 3.4616990937940207e-07, "loss": 0.0006, "num_tokens": 33307122.0, "reward": 0.515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7928888888888889, "grad_norm": 0.5721228641007269, "kl": 0.40625, "learning_rate": 3.448381136692089e-07, "loss": 0.0004, "num_tokens": 33457150.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7964444444444444, "grad_norm": 0.7641757061343398, "kl": 0.369384765625, "learning_rate": 3.435031666684771e-07, "loss": 0.0004, "num_tokens": 33607274.0, "reward": 0.48828125, "reward_std": 0.06986245512962341, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8, "grad_norm": 0.676066907329912, "kl": 0.421630859375, "learning_rate": 3.421651127349622e-07, "loss": 0.0004, "num_tokens": 33757358.0, "reward": 0.53125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8035555555555556, "grad_norm": 0.3458307366778419, "kl": 0.41552734375, "learning_rate": 3.4082399632965696e-07, "loss": 0.0004, "num_tokens": 33907474.0, "reward": 0.51953125, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8071111111111111, "grad_norm": 28.831014334944417, "kl": 3.250244140625, "learning_rate": 3.394798620153147e-07, "loss": 0.0033, "num_tokens": 34057594.0, "reward": 0.54296875, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8106666666666666, "grad_norm": 0.6851911505727085, "kl": 0.3564453125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0004, "num_tokens": 34207678.0, "reward": 0.49609375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8142222222222222, "grad_norm": 4.524751208833203, "kl": 0.817626953125, "learning_rate": 3.367827184104437e-07, "loss": 0.0008, "num_tokens": 34357638.0, "reward": 0.5625, "reward_std": 0.05357225611805916, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8177777777777778, "grad_norm": 0.7657245453002164, "kl": 0.416748046875, "learning_rate": 3.354297987408784e-07, "loss": 0.0004, "num_tokens": 34507658.0, "reward": 0.5234375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8213333333333334, "grad_norm": 0.6307999515422514, "kl": 0.386962890625, "learning_rate": 3.340740404012251e-07, "loss": 0.0004, "num_tokens": 34657738.0, "reward": 0.56640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8248888888888889, "grad_norm": 0.7615234790182357, "kl": 0.5302734375, "learning_rate": 3.3271548844076034e-07, "loss": 0.0005, "num_tokens": 34807830.0, "reward": 0.5, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8284444444444444, "grad_norm": 45.22177711175558, "kl": 7.133056640625, "learning_rate": 3.313541880015877e-07, "loss": 0.0071, "num_tokens": 34957886.0, "reward": 0.49609375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.832, "grad_norm": 0.4728352153390914, "kl": 0.421142578125, "learning_rate": 3.299901843171374e-07, "loss": 0.0004, "num_tokens": 35108018.0, "reward": 0.51953125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8355555555555556, "grad_norm": 0.7556394805634985, "kl": 0.410888671875, "learning_rate": 3.2862352271066324e-07, "loss": 0.0004, "num_tokens": 35258154.0, "reward": 0.51171875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8391111111111111, "grad_norm": 60.219344008615124, "kl": 9.630859375, "learning_rate": 3.272542485937368e-07, "loss": 0.0097, "num_tokens": 35408250.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8426666666666667, "grad_norm": 0.6258122056705749, "kl": 0.3935546875, "learning_rate": 3.2588240746473866e-07, "loss": 0.0004, "num_tokens": 35558338.0, "reward": 0.546875, "reward_std": 0.04400775954127312, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8462222222222222, "grad_norm": 0.7556560670668717, "kl": 0.403564453125, "learning_rate": 3.245080449073459e-07, "loss": 0.0004, "num_tokens": 35708434.0, "reward": 0.5078125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8497777777777777, "grad_norm": 0.6981235833583292, "kl": 0.39892578125, "learning_rate": 3.231312065890183e-07, "loss": 0.0004, "num_tokens": 35858494.0, "reward": 0.51953125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8533333333333334, "grad_norm": 0.7057474719839335, "kl": 0.416748046875, "learning_rate": 3.217519382594801e-07, "loss": 0.0004, "num_tokens": 36008622.0, "reward": 0.5390625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8568888888888889, "grad_norm": 0.6774457062153861, "kl": 0.3857421875, "learning_rate": 3.203702857492005e-07, "loss": 0.0004, "num_tokens": 36158658.0, "reward": 0.56640625, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8604444444444445, "grad_norm": 0.8579385643539741, "kl": 0.399658203125, "learning_rate": 3.189862949678704e-07, "loss": 0.0004, "num_tokens": 36308694.0, "reward": 0.48046875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.864, "grad_norm": 0.8918776174630627, "kl": 0.478515625, "learning_rate": 3.1760001190287695e-07, "loss": 0.0005, "num_tokens": 36458806.0, "reward": 0.50390625, "reward_std": 0.0859375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8675555555555555, "grad_norm": 14.470212904907461, "kl": 1.18115234375, "learning_rate": 3.162114826177756e-07, "loss": 0.0012, "num_tokens": 36608858.0, "reward": 0.5, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8711111111111111, "grad_norm": 0.8767487112614263, "kl": 0.48486328125, "learning_rate": 3.148207532507595e-07, "loss": 0.0005, "num_tokens": 36758898.0, "reward": 0.5546875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8746666666666667, "grad_norm": 0.7666837173815899, "kl": 0.53466796875, "learning_rate": 3.134278700131262e-07, "loss": 0.0005, "num_tokens": 36909062.0, "reward": 0.51953125, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8782222222222222, "grad_norm": 0.4660959690863843, "kl": 0.361572265625, "learning_rate": 3.1203287918774224e-07, "loss": 0.0004, "num_tokens": 37059178.0, "reward": 0.50390625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8817777777777778, "grad_norm": 0.7433620210188535, "kl": 0.40185546875, "learning_rate": 3.106358271275056e-07, "loss": 0.0004, "num_tokens": 37209262.0, "reward": 0.484375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8853333333333333, "grad_norm": 1.3225278900078785, "kl": 0.643310546875, "learning_rate": 3.0923676025380483e-07, "loss": 0.0006, "num_tokens": 37359390.0, "reward": 0.52734375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8888888888888888, "grad_norm": 1.3565201947424719, "kl": 0.63818359375, "learning_rate": 3.078357250549772e-07, "loss": 0.0006, "num_tokens": 37509438.0, "reward": 0.5625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8924444444444445, "grad_norm": 0.7985946426687209, "kl": 0.533203125, "learning_rate": 3.064327680847635e-07, "loss": 0.0005, "num_tokens": 37659486.0, "reward": 0.51171875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.896, "grad_norm": 0.7239646058609255, "kl": 0.391357421875, "learning_rate": 3.0502793596076136e-07, "loss": 0.0004, "num_tokens": 37809642.0, "reward": 0.546875, "reward_std": 0.07393828779459, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8995555555555556, "grad_norm": 0.7771799102205033, "kl": 0.45751953125, "learning_rate": 3.0362127536287636e-07, "loss": 0.0005, "num_tokens": 37959778.0, "reward": 0.48828125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9031111111111111, "grad_norm": 1.2830628968694793, "kl": 0.66552734375, "learning_rate": 3.022128330317705e-07, "loss": 0.0007, "num_tokens": 38109842.0, "reward": 0.56640625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9066666666666666, "grad_norm": 0.5669445746630175, "kl": 0.4755859375, "learning_rate": 3.0080265576730977e-07, "loss": 0.0005, "num_tokens": 38260030.0, "reward": 0.48046875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9102222222222223, "grad_norm": 0.6356374575865582, "kl": 0.4111328125, "learning_rate": 2.993907904270084e-07, "loss": 0.0004, "num_tokens": 38410070.0, "reward": 0.49609375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9137777777777778, "grad_norm": 5.301041165177461, "kl": 1.978759765625, "learning_rate": 2.979772839244723e-07, "loss": 0.002, "num_tokens": 38560230.0, "reward": 0.53125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9173333333333333, "grad_norm": 0.6352697465433857, "kl": 0.384033203125, "learning_rate": 2.965621832278401e-07, "loss": 0.0004, "num_tokens": 38710354.0, "reward": 0.52734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9208888888888889, "grad_norm": 0.325841215358069, "kl": 0.458740234375, "learning_rate": 2.951455353582224e-07, "loss": 0.0005, "num_tokens": 38860410.0, "reward": 0.53125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9244444444444444, "grad_norm": 0.6229004603021032, "kl": 0.416748046875, "learning_rate": 2.937273873881396e-07, "loss": 0.0004, "num_tokens": 39010430.0, "reward": 0.53125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.928, "grad_norm": 1.3942727444158145, "kl": 0.628662109375, "learning_rate": 2.9230778643995724e-07, "loss": 0.0006, "num_tokens": 39160466.0, "reward": 0.515625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9315555555555556, "grad_norm": 0.736925497405656, "kl": 0.414794921875, "learning_rate": 2.90886779684321e-07, "loss": 0.0004, "num_tokens": 39310574.0, "reward": 0.53515625, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9351111111111111, "grad_norm": 1.3153915670345286, "kl": 0.77880859375, "learning_rate": 2.894644143385885e-07, "loss": 0.0008, "num_tokens": 39460734.0, "reward": 0.51953125, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9386666666666666, "grad_norm": 0.3666627577855282, "kl": 0.4052734375, "learning_rate": 2.8804073766526095e-07, "loss": 0.0004, "num_tokens": 39610814.0, "reward": 0.54296875, "reward_std": 0.020570259541273117, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9422222222222222, "grad_norm": 2.316729417862906, "kl": 1.9755859375, "learning_rate": 2.866157969704125e-07, "loss": 0.002, "num_tokens": 39760914.0, "reward": 0.53125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9457777777777778, "grad_norm": 0.5361641805790162, "kl": 0.3984375, "learning_rate": 2.851896396021181e-07, "loss": 0.0004, "num_tokens": 39911018.0, "reward": 0.53515625, "reward_std": 0.030584799125790596, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9493333333333334, "grad_norm": 2.578024238948283, "kl": 0.693115234375, "learning_rate": 2.837623129488808e-07, "loss": 0.0007, "num_tokens": 40061098.0, "reward": 0.51953125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9528888888888889, "grad_norm": 0.5179523249592263, "kl": 0.380859375, "learning_rate": 2.823338644380566e-07, "loss": 0.0004, "num_tokens": 40211290.0, "reward": 0.50390625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9564444444444444, "grad_norm": 0.6716890754838255, "kl": 0.4169921875, "learning_rate": 2.809043415342784e-07, "loss": 0.0004, "num_tokens": 40361402.0, "reward": 0.546875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.96, "grad_norm": 1.8696840139920259, "kl": 0.512451171875, "learning_rate": 2.794737917378797e-07, "loss": 0.0005, "num_tokens": 40511494.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9635555555555556, "grad_norm": 0.5775124536896724, "kl": 0.4365234375, "learning_rate": 2.780422625833153e-07, "loss": 0.0004, "num_tokens": 40661614.0, "reward": 0.53125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9671111111111111, "grad_norm": 0.5174571918726184, "kl": 0.40380859375, "learning_rate": 2.766098016375823e-07, "loss": 0.0004, "num_tokens": 40811670.0, "reward": 0.53125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9706666666666667, "grad_norm": 10.856193948863917, "kl": 2.42578125, "learning_rate": 2.751764564986396e-07, "loss": 0.0024, "num_tokens": 40961810.0, "reward": 0.53125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9742222222222222, "grad_norm": 0.5252541651736331, "kl": 0.414306640625, "learning_rate": 2.737422747938259e-07, "loss": 0.0004, "num_tokens": 41111894.0, "reward": 0.54296875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9777777777777777, "grad_norm": 0.584952826070172, "kl": 0.364501953125, "learning_rate": 2.723073041782776e-07, "loss": 0.0004, "num_tokens": 41262010.0, "reward": 0.55078125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9813333333333333, "grad_norm": 0.6583135007881525, "kl": 0.427001953125, "learning_rate": 2.708715923333451e-07, "loss": 0.0004, "num_tokens": 41412086.0, "reward": 0.56640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9848888888888889, "grad_norm": 0.4967388969256047, "kl": 0.44140625, "learning_rate": 2.6943518696500835e-07, "loss": 0.0004, "num_tokens": 41562150.0, "reward": 0.51953125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9884444444444445, "grad_norm": 0.5621455003149533, "kl": 0.484619140625, "learning_rate": 2.6799813580229174e-07, "loss": 0.0005, "num_tokens": 41712282.0, "reward": 0.5625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.992, "grad_norm": 0.4746005261751424, "kl": 0.3916015625, "learning_rate": 2.6656048659567834e-07, "loss": 0.0004, "num_tokens": 41862370.0, "reward": 0.5546875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9955555555555555, "grad_norm": 0.535734063401937, "kl": 0.441162109375, "learning_rate": 2.65122287115523e-07, "loss": 0.0004, "num_tokens": 42012482.0, "reward": 0.55078125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9991111111111111, "grad_norm": 0.07429460937077817, "kl": 0.3955078125, "learning_rate": 2.63683585150465e-07, "loss": 0.0004, "num_tokens": 42162534.0, "reward": 0.53125, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0035555555555555, "grad_norm": 0.8470947965625311, "kl": 0.468505859375, "learning_rate": 2.622444285058404e-07, "loss": 0.0005, "num_tokens": 42312626.0, "reward": 0.49609375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.007111111111111, "grad_norm": 0.7246290147592683, "kl": 0.474853515625, "learning_rate": 2.6080486500209347e-07, "loss": 0.0005, "num_tokens": 42462734.0, "reward": 0.5234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0106666666666666, "grad_norm": 0.7970433352687321, "kl": 0.417724609375, "learning_rate": 2.5936494247318733e-07, "loss": 0.0004, "num_tokens": 42612814.0, "reward": 0.5, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0142222222222221, "grad_norm": 0.5963898764489264, "kl": 0.490478515625, "learning_rate": 2.5792470876501517e-07, "loss": 0.0005, "num_tokens": 42762926.0, "reward": 0.4921875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 1023.1171875, "completions/mean_terminated_length": 911.0, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 1.0177777777777777, "grad_norm": 0.7142379802123127, "kl": 0.412841796875, "learning_rate": 2.5648421173380974e-07, "loss": -0.0009, "num_tokens": 42912905.0, "reward": 0.51171875, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0213333333333334, "grad_norm": 0.6099036684005931, "kl": 0.409912109375, "learning_rate": 2.550434992445538e-07, "loss": 0.0004, "num_tokens": 43062965.0, "reward": 0.5390625, "reward_std": 0.053028859198093414, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.024888888888889, "grad_norm": 0.4520069380770881, "kl": 0.437255859375, "learning_rate": 2.536026191693893e-07, "loss": 0.0004, "num_tokens": 43213025.0, "reward": 0.5390625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0284444444444445, "grad_norm": 0.6319811742197129, "kl": 0.369873046875, "learning_rate": 2.521616193860266e-07, "loss": 0.0004, "num_tokens": 43363137.0, "reward": 0.50390625, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.032, "grad_norm": 2.6591058310513374, "kl": 0.842041015625, "learning_rate": 2.507205477761539e-07, "loss": 0.0008, "num_tokens": 43513221.0, "reward": 0.515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0355555555555556, "grad_norm": 0.5420344680148037, "kl": 0.37548828125, "learning_rate": 2.4927945222384613e-07, "loss": 0.0004, "num_tokens": 43663345.0, "reward": 0.52734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.039111111111111, "grad_norm": 0.5473848935681827, "kl": 0.51171875, "learning_rate": 2.4783838061397334e-07, "loss": 0.0005, "num_tokens": 43813417.0, "reward": 0.515625, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0426666666666666, "grad_norm": 0.7008917634695417, "kl": 0.634765625, "learning_rate": 2.4639738083061073e-07, "loss": 0.0006, "num_tokens": 43963469.0, "reward": 0.51171875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0462222222222222, "grad_norm": 0.3295873056459611, "kl": 0.4091796875, "learning_rate": 2.4495650075544613e-07, "loss": 0.0004, "num_tokens": 44113521.0, "reward": 0.53125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0497777777777777, "grad_norm": 0.5414055832707978, "kl": 0.39501953125, "learning_rate": 2.435157882661903e-07, "loss": 0.0004, "num_tokens": 44263553.0, "reward": 0.54296875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0533333333333332, "grad_norm": 0.4682537399126061, "kl": 0.397705078125, "learning_rate": 2.420752912349848e-07, "loss": 0.0004, "num_tokens": 44413705.0, "reward": 0.53515625, "reward_std": 0.045216359198093414, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.056888888888889, "grad_norm": 0.922022274288094, "kl": 0.557861328125, "learning_rate": 2.4063505752681265e-07, "loss": 0.0006, "num_tokens": 44563845.0, "reward": 0.46875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0604444444444445, "grad_norm": 0.2921901386740173, "kl": 0.396484375, "learning_rate": 2.3919513499790646e-07, "loss": 0.0004, "num_tokens": 44713933.0, "reward": 0.5625, "reward_std": 0.018042195588350296, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.064, "grad_norm": 0.45593720122261916, "kl": 0.45068359375, "learning_rate": 2.3775557149415953e-07, "loss": 0.0005, "num_tokens": 44863933.0, "reward": 0.5625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0675555555555556, "grad_norm": 0.37302671430025486, "kl": 0.4716796875, "learning_rate": 2.3631641484953493e-07, "loss": 0.0005, "num_tokens": 45014049.0, "reward": 0.5390625, "reward_std": 0.009021097794175148, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0711111111111111, "grad_norm": 0.39290892679689327, "kl": 0.373046875, "learning_rate": 2.3487771288447703e-07, "loss": 0.0004, "num_tokens": 45164101.0, "reward": 0.53515625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0746666666666667, "grad_norm": 0.7056356953892552, "kl": 0.4267578125, "learning_rate": 2.3343951340432158e-07, "loss": 0.0004, "num_tokens": 45314165.0, "reward": 0.51953125, "reward_std": 0.06744526326656342, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0782222222222222, "grad_norm": 0.5027562199597438, "kl": 0.489501953125, "learning_rate": 2.3200186419770823e-07, "loss": 0.0005, "num_tokens": 45464281.0, "reward": 0.51953125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0817777777777777, "grad_norm": 0.2975548497811407, "kl": 0.385498046875, "learning_rate": 2.3056481303499163e-07, "loss": 0.0004, "num_tokens": 45614357.0, "reward": 0.5390625, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0853333333333333, "grad_norm": 0.43699072382589177, "kl": 0.414306640625, "learning_rate": 2.291284076666549e-07, "loss": 0.0004, "num_tokens": 45764381.0, "reward": 0.5390625, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0888888888888888, "grad_norm": 0.7264220978249349, "kl": 0.537109375, "learning_rate": 2.2769269582172236e-07, "loss": 0.0005, "num_tokens": 45914405.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0924444444444443, "grad_norm": 0.8246989232799073, "kl": 0.415771484375, "learning_rate": 2.262577252061741e-07, "loss": 0.0004, "num_tokens": 46064497.0, "reward": 0.56640625, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.096, "grad_norm": 1.5117707972901218, "kl": 0.685791015625, "learning_rate": 2.2482354350136043e-07, "loss": 0.0007, "num_tokens": 46214529.0, "reward": 0.55078125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0995555555555556, "grad_norm": 0.4384748657076267, "kl": 0.436279296875, "learning_rate": 2.2339019836241768e-07, "loss": 0.0004, "num_tokens": 46364633.0, "reward": 0.51953125, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1031111111111112, "grad_norm": 0.5367303888744025, "kl": 0.478515625, "learning_rate": 2.219577374166847e-07, "loss": 0.0005, "num_tokens": 46514697.0, "reward": 0.5078125, "reward_std": 0.03839729726314545, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1066666666666667, "grad_norm": 0.5858380888629433, "kl": 0.43408203125, "learning_rate": 2.2052620826212031e-07, "loss": 0.0004, "num_tokens": 46664821.0, "reward": 0.51953125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1102222222222222, "grad_norm": 0.641916377352643, "kl": 0.55615234375, "learning_rate": 2.1909565846572158e-07, "loss": 0.0006, "num_tokens": 46814953.0, "reward": 0.48828125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1137777777777778, "grad_norm": 0.33090250971908225, "kl": 0.41796875, "learning_rate": 2.1766613556194344e-07, "loss": 0.0004, "num_tokens": 46964997.0, "reward": 0.578125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1173333333333333, "grad_norm": 15.61725314528695, "kl": 3.2099609375, "learning_rate": 2.1623768705111914e-07, "loss": 0.0032, "num_tokens": 47115081.0, "reward": 0.51171875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1208888888888888, "grad_norm": 0.7107097215021569, "kl": 0.523681640625, "learning_rate": 2.1481036039788185e-07, "loss": 0.0005, "num_tokens": 47265145.0, "reward": 0.57421875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1244444444444444, "grad_norm": 0.5636256684100517, "kl": 0.508544921875, "learning_rate": 2.133842030295875e-07, "loss": 0.0005, "num_tokens": 47415337.0, "reward": 0.51171875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1280000000000001, "grad_norm": 0.5116667035209906, "kl": 0.435791015625, "learning_rate": 2.1195926233473905e-07, "loss": 0.0004, "num_tokens": 47565405.0, "reward": 0.55859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1315555555555556, "grad_norm": 0.6779613408018799, "kl": 0.4248046875, "learning_rate": 2.105355856614115e-07, "loss": 0.0004, "num_tokens": 47715461.0, "reward": 0.5546875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1351111111111112, "grad_norm": 0.3921549457589845, "kl": 0.412353515625, "learning_rate": 2.0911322031567907e-07, "loss": 0.0004, "num_tokens": 47865549.0, "reward": 0.53125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1386666666666667, "grad_norm": 0.4958321729986366, "kl": 0.4521484375, "learning_rate": 2.076922135600427e-07, "loss": 0.0005, "num_tokens": 48015629.0, "reward": 0.5390625, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1422222222222222, "grad_norm": 0.7423549988863264, "kl": 0.406005859375, "learning_rate": 2.0627261261186048e-07, "loss": 0.0004, "num_tokens": 48165705.0, "reward": 0.55078125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1457777777777778, "grad_norm": 0.22087517892930575, "kl": 0.41162109375, "learning_rate": 2.0485446464177752e-07, "loss": 0.0004, "num_tokens": 48315833.0, "reward": 0.54296875, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1493333333333333, "grad_norm": 0.9688657800607915, "kl": 0.743408203125, "learning_rate": 2.034378167721599e-07, "loss": 0.0007, "num_tokens": 48465977.0, "reward": 0.5234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1528888888888889, "grad_norm": 0.5069073201298848, "kl": 0.3955078125, "learning_rate": 2.0202271607552766e-07, "loss": 0.0004, "num_tokens": 48616049.0, "reward": 0.51953125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1564444444444444, "grad_norm": 0.30676846934679053, "kl": 0.37060546875, "learning_rate": 2.006092095729916e-07, "loss": 0.0004, "num_tokens": 48766117.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.16, "grad_norm": 8.64784262784224, "kl": 1.320556640625, "learning_rate": 1.9919734423269018e-07, "loss": 0.0013, "num_tokens": 48916213.0, "reward": 0.53125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1635555555555555, "grad_norm": 0.5177358301467603, "kl": 0.4267578125, "learning_rate": 1.9778716696822948e-07, "loss": 0.0004, "num_tokens": 49066345.0, "reward": 0.546875, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1671111111111112, "grad_norm": 0.29160214419010005, "kl": 0.4140625, "learning_rate": 1.9637872463712362e-07, "loss": 0.0004, "num_tokens": 49216417.0, "reward": 0.53125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1706666666666667, "grad_norm": 0.37306669591193975, "kl": 0.389404296875, "learning_rate": 1.9497206403923864e-07, "loss": 0.0004, "num_tokens": 49366501.0, "reward": 0.546875, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1742222222222223, "grad_norm": 0.3457412118045277, "kl": 0.439697265625, "learning_rate": 1.9356723191523646e-07, "loss": 0.0004, "num_tokens": 49516501.0, "reward": 0.515625, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1777777777777778, "grad_norm": 0.49964396837602626, "kl": 0.417236328125, "learning_rate": 1.921642749450228e-07, "loss": 0.0004, "num_tokens": 49666537.0, "reward": 0.609375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1813333333333333, "grad_norm": 0.4051353842023595, "kl": 0.473388671875, "learning_rate": 1.9076323974619512e-07, "loss": 0.0005, "num_tokens": 49816629.0, "reward": 0.5703125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1848888888888889, "grad_norm": 5.378834991563153, "kl": 1.024658203125, "learning_rate": 1.8936417287249446e-07, "loss": 0.001, "num_tokens": 49966761.0, "reward": 0.5390625, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1884444444444444, "grad_norm": 0.6720029560481326, "kl": 0.527099609375, "learning_rate": 1.8796712081225774e-07, "loss": 0.0005, "num_tokens": 50116905.0, "reward": 0.55859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 1022.515625, "completions/mean_terminated_length": 834.0, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 1.192, "grad_norm": 0.6578900739907899, "kl": 0.492431640625, "learning_rate": 1.8657212998687388e-07, "loss": 0.0005, "num_tokens": 50266775.0, "reward": 0.56640625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1955555555555555, "grad_norm": 0.5860458852325504, "kl": 0.45849609375, "learning_rate": 1.8517924674924046e-07, "loss": 0.0005, "num_tokens": 50416907.0, "reward": 0.515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.199111111111111, "grad_norm": 0.5764488924096699, "kl": 0.3935546875, "learning_rate": 1.8378851738222439e-07, "loss": 0.0004, "num_tokens": 50566947.0, "reward": 0.5625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2026666666666666, "grad_norm": 0.3101655260933977, "kl": 0.466552734375, "learning_rate": 1.82399988097123e-07, "loss": 0.0005, "num_tokens": 50717059.0, "reward": 0.54296875, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2062222222222223, "grad_norm": 0.25806921540657135, "kl": 0.427001953125, "learning_rate": 1.8101370503212962e-07, "loss": 0.0004, "num_tokens": 50867127.0, "reward": 0.54296875, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2097777777777778, "grad_norm": 0.38683952780832426, "kl": 0.398193359375, "learning_rate": 1.7962971425079946e-07, "loss": 0.0004, "num_tokens": 51017191.0, "reward": 0.55859375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2133333333333334, "grad_norm": 0.43762529890937657, "kl": 0.40673828125, "learning_rate": 1.7824806174051994e-07, "loss": 0.0004, "num_tokens": 51167303.0, "reward": 0.55078125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.216888888888889, "grad_norm": 0.4947585618325522, "kl": 0.500732421875, "learning_rate": 1.7686879341098172e-07, "loss": 0.0005, "num_tokens": 51317335.0, "reward": 0.5546875, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2204444444444444, "grad_norm": 0.29648752247386784, "kl": 0.454345703125, "learning_rate": 1.7549195509265407e-07, "loss": 0.0005, "num_tokens": 51467455.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.224, "grad_norm": 0.3522267627578455, "kl": 0.41064453125, "learning_rate": 1.7411759253526137e-07, "loss": 0.0004, "num_tokens": 51617551.0, "reward": 0.5546875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2275555555555555, "grad_norm": 3.809079002025788, "kl": 1.059814453125, "learning_rate": 1.7274575140626315e-07, "loss": 0.0011, "num_tokens": 51767667.0, "reward": 0.55078125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.231111111111111, "grad_norm": 0.20335882404499606, "kl": 0.433837890625, "learning_rate": 1.713764772893368e-07, "loss": 0.0004, "num_tokens": 51917775.0, "reward": 0.55078125, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2346666666666666, "grad_norm": 0.40351519211548675, "kl": 0.44384765625, "learning_rate": 1.7000981568286263e-07, "loss": 0.0004, "num_tokens": 52067895.0, "reward": 0.546875, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2382222222222223, "grad_norm": 0.5942030016843242, "kl": 0.45703125, "learning_rate": 1.6864581199841226e-07, "loss": 0.0005, "num_tokens": 52218003.0, "reward": 0.51171875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2417777777777779, "grad_norm": 0.529993570144883, "kl": 0.404052734375, "learning_rate": 1.6728451155923966e-07, "loss": 0.0004, "num_tokens": 52368071.0, "reward": 0.51953125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2453333333333334, "grad_norm": 0.8041631283434129, "kl": 0.8837890625, "learning_rate": 1.6592595959877493e-07, "loss": 0.0009, "num_tokens": 52518159.0, "reward": 0.546875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.248888888888889, "grad_norm": 1.6872516484862146, "kl": 0.507080078125, "learning_rate": 1.6457020125912158e-07, "loss": 0.0005, "num_tokens": 52668287.0, "reward": 0.51171875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2524444444444445, "grad_norm": 0.6429164573265299, "kl": 0.455810546875, "learning_rate": 1.6321728158955633e-07, "loss": 0.0005, "num_tokens": 52818387.0, "reward": 0.5703125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.256, "grad_norm": 0.6468681508234729, "kl": 0.43701171875, "learning_rate": 1.6186724554503237e-07, "loss": 0.0004, "num_tokens": 52968427.0, "reward": 0.51171875, "reward_std": 0.055230896919965744, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2595555555555555, "grad_norm": 0.44088485589071047, "kl": 0.397216796875, "learning_rate": 1.6052013798468528e-07, "loss": 0.0004, "num_tokens": 53118535.0, "reward": 0.51171875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.263111111111111, "grad_norm": 0.3945491441045777, "kl": 0.38134765625, "learning_rate": 1.5917600367034302e-07, "loss": 0.0004, "num_tokens": 53268763.0, "reward": 0.4921875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2666666666666666, "grad_norm": 0.638915571522292, "kl": 0.408447265625, "learning_rate": 1.578348872650378e-07, "loss": 0.0004, "num_tokens": 53418919.0, "reward": 0.48828125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2702222222222221, "grad_norm": 0.5121027616204278, "kl": 0.429931640625, "learning_rate": 1.564968333315229e-07, "loss": 0.0004, "num_tokens": 53568999.0, "reward": 0.55859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2737777777777777, "grad_norm": 0.5563613028742419, "kl": 0.509521484375, "learning_rate": 1.5516188633079107e-07, "loss": 0.0005, "num_tokens": 53719115.0, "reward": 0.51953125, "reward_std": 0.04147969186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2773333333333334, "grad_norm": 0.5962361306068441, "kl": 0.387939453125, "learning_rate": 1.5383009062059794e-07, "loss": 0.0004, "num_tokens": 53869339.0, "reward": 0.52734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.280888888888889, "grad_norm": 0.8520264003008496, "kl": 0.6318359375, "learning_rate": 1.525014904539873e-07, "loss": 0.0006, "num_tokens": 54019455.0, "reward": 0.53125, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2844444444444445, "grad_norm": 0.4853809422457624, "kl": 0.45458984375, "learning_rate": 1.5117612997782158e-07, "loss": 0.0005, "num_tokens": 54169571.0, "reward": 0.52734375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.288, "grad_norm": 0.3747330328198201, "kl": 0.42333984375, "learning_rate": 1.49854053231314e-07, "loss": 0.0004, "num_tokens": 54319603.0, "reward": 0.546875, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2915555555555556, "grad_norm": 0.5492601505442783, "kl": 0.47265625, "learning_rate": 1.4853530414456612e-07, "loss": 0.0005, "num_tokens": 54469607.0, "reward": 0.52734375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.295111111111111, "grad_norm": 0.33780973836896694, "kl": 0.455810546875, "learning_rate": 1.4721992653710718e-07, "loss": 0.0005, "num_tokens": 54619703.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2986666666666666, "grad_norm": 0.5018740934387684, "kl": 0.458251953125, "learning_rate": 1.45907964116439e-07, "loss": 0.0005, "num_tokens": 54769755.0, "reward": 0.5703125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3022222222222222, "grad_norm": 0.48892083551158816, "kl": 0.4052734375, "learning_rate": 1.4459946047658305e-07, "loss": 0.0004, "num_tokens": 54919835.0, "reward": 0.5234375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3057777777777777, "grad_norm": 0.35313257655187524, "kl": 0.4345703125, "learning_rate": 1.4329445909663194e-07, "loss": 0.0004, "num_tokens": 55069915.0, "reward": 0.53125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3093333333333335, "grad_norm": 1.3640081478037656, "kl": 0.619384765625, "learning_rate": 1.4199300333930515e-07, "loss": 0.0006, "num_tokens": 55219987.0, "reward": 0.49609375, "reward_std": 0.04620979726314545, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3128888888888888, "grad_norm": 0.39606833878919273, "kl": 0.41650390625, "learning_rate": 1.4069513644950744e-07, "loss": 0.0004, "num_tokens": 55370151.0, "reward": 0.4921875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 1021.8046875, "completions/mean_terminated_length": 743.0, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 1.3164444444444445, "grad_norm": 0.44470406838575705, "kl": 0.40087890625, "learning_rate": 1.394009015528927e-07, "loss": 0.0004, "num_tokens": 55520026.0, "reward": 0.52734375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.32, "grad_norm": 0.3552220431069685, "kl": 0.37841796875, "learning_rate": 1.3811034165443036e-07, "loss": 0.0004, "num_tokens": 55670030.0, "reward": 0.53515625, "reward_std": 0.025854695588350296, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3235555555555556, "grad_norm": 0.44156036308113156, "kl": 0.417724609375, "learning_rate": 1.3682349963697676e-07, "loss": 0.0004, "num_tokens": 55820110.0, "reward": 0.48828125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3271111111111111, "grad_norm": 0.8049133934652881, "kl": 0.50244140625, "learning_rate": 1.3554041825985e-07, "loss": 0.0005, "num_tokens": 55970202.0, "reward": 0.5, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3306666666666667, "grad_norm": 0.4029732452394675, "kl": 0.392333984375, "learning_rate": 1.3426114015740915e-07, "loss": 0.0004, "num_tokens": 56120286.0, "reward": 0.5703125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3342222222222222, "grad_norm": 0.6289438946544794, "kl": 0.447021484375, "learning_rate": 1.3298570783763805e-07, "loss": 0.0004, "num_tokens": 56270414.0, "reward": 0.57421875, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3377777777777777, "grad_norm": 0.5979204566593044, "kl": 0.445556640625, "learning_rate": 1.31714163680732e-07, "loss": 0.0004, "num_tokens": 56420570.0, "reward": 0.51171875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3413333333333333, "grad_norm": 0.5602574601449087, "kl": 0.440185546875, "learning_rate": 1.3044654993769044e-07, "loss": 0.0004, "num_tokens": 56570618.0, "reward": 0.5859375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3448888888888888, "grad_norm": 0.584595173061934, "kl": 0.387451171875, "learning_rate": 1.2918290872891236e-07, "loss": 0.0004, "num_tokens": 56720726.0, "reward": 0.55078125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3484444444444446, "grad_norm": 0.38195153284699324, "kl": 0.42041015625, "learning_rate": 1.2792328204279712e-07, "loss": 0.0004, "num_tokens": 56870738.0, "reward": 0.5234375, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3519999999999999, "grad_norm": 0.534164470884044, "kl": 0.380126953125, "learning_rate": 1.2666771173434892e-07, "loss": 0.0004, "num_tokens": 57020866.0, "reward": 0.484375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3555555555555556, "grad_norm": 0.31327674285986346, "kl": 0.404296875, "learning_rate": 1.2541623952378655e-07, "loss": 0.0004, "num_tokens": 57170962.0, "reward": 0.5, "reward_std": 0.022772299125790596, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3591111111111112, "grad_norm": 0.6333355146035587, "kl": 0.42822265625, "learning_rate": 1.2416890699515636e-07, "loss": 0.0004, "num_tokens": 57320938.0, "reward": 0.5703125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3626666666666667, "grad_norm": 0.8455919752181283, "kl": 0.582275390625, "learning_rate": 1.2292575559495143e-07, "loss": 0.0006, "num_tokens": 57470974.0, "reward": 0.5, "reward_std": 0.037403859198093414, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3662222222222222, "grad_norm": 0.5563433958155662, "kl": 0.4375, "learning_rate": 1.216868266307333e-07, "loss": 0.0004, "num_tokens": 57621114.0, "reward": 0.5234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3697777777777778, "grad_norm": 0.42654455439415234, "kl": 0.437744140625, "learning_rate": 1.2045216126976054e-07, "loss": 0.0004, "num_tokens": 57771166.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3733333333333333, "grad_norm": 0.574456938538095, "kl": 0.464111328125, "learning_rate": 1.1922180053761985e-07, "loss": 0.0005, "num_tokens": 57921230.0, "reward": 0.5390625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3768888888888888, "grad_norm": 0.7083015972044223, "kl": 0.5546875, "learning_rate": 1.1799578531686355e-07, "loss": 0.0006, "num_tokens": 58071282.0, "reward": 0.5390625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3804444444444444, "grad_norm": 0.8083602759346432, "kl": 0.543212890625, "learning_rate": 1.1677415634565066e-07, "loss": 0.0005, "num_tokens": 58221330.0, "reward": 0.5625, "reward_std": 0.07206448912620544, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.384, "grad_norm": 0.8332269118489771, "kl": 0.646240234375, "learning_rate": 1.1555695421639369e-07, "loss": 0.0006, "num_tokens": 58371338.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3875555555555557, "grad_norm": 0.6373859717105615, "kl": 0.530029296875, "learning_rate": 1.1434421937440927e-07, "loss": 0.0005, "num_tokens": 58521394.0, "reward": 0.51953125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3911111111111112, "grad_norm": 0.5222272117027472, "kl": 0.438232421875, "learning_rate": 1.1313599211657493e-07, "loss": 0.0004, "num_tokens": 58671582.0, "reward": 0.50390625, "reward_std": 0.03619525954127312, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3946666666666667, "grad_norm": 0.44409467166709055, "kl": 0.444091796875, "learning_rate": 1.1193231258998933e-07, "loss": 0.0004, "num_tokens": 58821646.0, "reward": 0.56640625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3982222222222223, "grad_norm": 0.5442250998097384, "kl": 0.53125, "learning_rate": 1.1073322079063913e-07, "loss": 0.0005, "num_tokens": 58971662.0, "reward": 0.53515625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4017777777777778, "grad_norm": 1.553408988777784, "kl": 0.532958984375, "learning_rate": 1.0953875656206896e-07, "loss": 0.0005, "num_tokens": 59121722.0, "reward": 0.52734375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4053333333333333, "grad_norm": 0.5740208836758302, "kl": 0.438232421875, "learning_rate": 1.083489595940586e-07, "loss": 0.0004, "num_tokens": 59271834.0, "reward": 0.52734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4088888888888889, "grad_norm": 0.4151009063244617, "kl": 0.42919921875, "learning_rate": 1.0716386942130312e-07, "loss": 0.0004, "num_tokens": 59421910.0, "reward": 0.5234375, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4124444444444444, "grad_norm": 0.7311720397309218, "kl": 0.43115234375, "learning_rate": 1.0598352542210021e-07, "loss": 0.0004, "num_tokens": 59571914.0, "reward": 0.53515625, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.416, "grad_norm": 0.6992515497367471, "kl": 0.637939453125, "learning_rate": 1.0480796681704077e-07, "loss": 0.0006, "num_tokens": 59722026.0, "reward": 0.53125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4195555555555557, "grad_norm": 0.5217670799751064, "kl": 0.43408203125, "learning_rate": 1.0363723266770649e-07, "loss": 0.0004, "num_tokens": 59872062.0, "reward": 0.51171875, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.423111111111111, "grad_norm": 6.850730801381474, "kl": 1.605712890625, "learning_rate": 1.0247136187537123e-07, "loss": 0.0016, "num_tokens": 60022170.0, "reward": 0.5078125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4266666666666667, "grad_norm": 0.5426555800683942, "kl": 0.46826171875, "learning_rate": 1.0131039317970907e-07, "loss": 0.0005, "num_tokens": 60172198.0, "reward": 0.58203125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4302222222222223, "grad_norm": 0.4359928548595561, "kl": 0.419921875, "learning_rate": 1.0015436515750636e-07, "loss": 0.0004, "num_tokens": 60322274.0, "reward": 0.5703125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4337777777777778, "grad_norm": 0.5019244396354806, "kl": 0.3994140625, "learning_rate": 9.900331622138063e-08, "loss": 0.0004, "num_tokens": 60472338.0, "reward": 0.57421875, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4373333333333334, "grad_norm": 0.6518642054974103, "kl": 0.466552734375, "learning_rate": 9.785728461850346e-08, "loss": 0.0005, "num_tokens": 60622438.0, "reward": 0.5390625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4408888888888889, "grad_norm": 0.6183842691534489, "kl": 0.401123046875, "learning_rate": 9.671630842933027e-08, "loss": 0.0004, "num_tokens": 60772530.0, "reward": 0.60546875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4444444444444444, "grad_norm": 0.5614152227928939, "kl": 0.42724609375, "learning_rate": 9.558042556633439e-08, "loss": 0.0004, "num_tokens": 60922622.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.448, "grad_norm": 0.7515661137862419, "kl": 0.38232421875, "learning_rate": 9.44496737727479e-08, "loss": 0.0004, "num_tokens": 61072702.0, "reward": 0.5390625, "reward_std": 0.07525776326656342, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4515555555555555, "grad_norm": 0.6888661898947002, "kl": 0.446044921875, "learning_rate": 9.332409062130686e-08, "loss": 0.0004, "num_tokens": 61222818.0, "reward": 0.5546875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.455111111111111, "grad_norm": 5.768513710763025, "kl": 1.973388671875, "learning_rate": 9.220371351300352e-08, "loss": 0.002, "num_tokens": 61372914.0, "reward": 0.59375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 1022.25, "completions/mean_terminated_length": 800.0, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 1.4586666666666668, "grad_norm": 0.39703873761876407, "kl": 0.51416015625, "learning_rate": 9.10885796758428e-08, "loss": 0.0005, "num_tokens": 61522750.0, "reward": 0.5078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.462222222222222, "grad_norm": 0.4105299696973317, "kl": 0.400634765625, "learning_rate": 8.997872616360603e-08, "loss": 0.0004, "num_tokens": 61672774.0, "reward": 0.5078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4657777777777778, "grad_norm": 0.814786116863996, "kl": 0.501953125, "learning_rate": 8.887418985461903e-08, "loss": 0.0005, "num_tokens": 61822806.0, "reward": 0.484375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4693333333333334, "grad_norm": 0.6543891528432907, "kl": 0.448974609375, "learning_rate": 8.777500745052743e-08, "loss": 0.0004, "num_tokens": 61972982.0, "reward": 0.484375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.472888888888889, "grad_norm": 0.3103412060077848, "kl": 0.407958984375, "learning_rate": 8.668121547507634e-08, "loss": 0.0004, "num_tokens": 62123038.0, "reward": 0.53125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4764444444444444, "grad_norm": 0.5962802113040242, "kl": 0.478515625, "learning_rate": 8.559285027289753e-08, "loss": 0.0005, "num_tokens": 62273026.0, "reward": 0.59375, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.48, "grad_norm": 0.6264823134539922, "kl": 0.448486328125, "learning_rate": 8.450994800830111e-08, "loss": 0.0004, "num_tokens": 62423086.0, "reward": 0.53125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4835555555555555, "grad_norm": 0.8450656426538271, "kl": 0.41796875, "learning_rate": 8.343254466407435e-08, "loss": 0.0004, "num_tokens": 62573242.0, "reward": 0.5234375, "reward_std": 0.08427885919809341, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.487111111111111, "grad_norm": 0.4851276707023854, "kl": 0.388427734375, "learning_rate": 8.236067604028562e-08, "loss": 0.0004, "num_tokens": 62723310.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4906666666666666, "grad_norm": 0.529824828272533, "kl": 0.39306640625, "learning_rate": 8.129437775309533e-08, "loss": 0.0004, "num_tokens": 62873314.0, "reward": 0.53125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4942222222222221, "grad_norm": 0.7362472960586969, "kl": 0.423583984375, "learning_rate": 8.023368523357182e-08, "loss": 0.0004, "num_tokens": 63023462.0, "reward": 0.51953125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4977777777777779, "grad_norm": 0.7821251913385924, "kl": 0.479736328125, "learning_rate": 7.917863372651476e-08, "loss": 0.0005, "num_tokens": 63173582.0, "reward": 0.51171875, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5013333333333332, "grad_norm": 0.29499660137723543, "kl": 0.4189453125, "learning_rate": 7.812925828928332e-08, "loss": 0.0004, "num_tokens": 63323694.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.504888888888889, "grad_norm": 3.151628702237811, "kl": 1.814453125, "learning_rate": 7.708559379063204e-08, "loss": 0.0018, "num_tokens": 63473758.0, "reward": 0.5390625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5084444444444445, "grad_norm": 0.42289374767757515, "kl": 0.399658203125, "learning_rate": 7.604767490955138e-08, "loss": 0.0004, "num_tokens": 63623866.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.512, "grad_norm": 0.810138453876871, "kl": 0.49365234375, "learning_rate": 7.501553613411626e-08, "loss": 0.0005, "num_tokens": 63774038.0, "reward": 0.5234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5155555555555555, "grad_norm": 0.5491563744374505, "kl": 0.431396484375, "learning_rate": 7.398921176033928e-08, "loss": 0.0004, "num_tokens": 63924126.0, "reward": 0.50390625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.519111111111111, "grad_norm": 0.7576014460271948, "kl": 0.43359375, "learning_rate": 7.296873589103184e-08, "loss": 0.0004, "num_tokens": 64074174.0, "reward": 0.546875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5226666666666666, "grad_norm": 0.5650904936953062, "kl": 0.400634765625, "learning_rate": 7.195414243467029e-08, "loss": 0.0004, "num_tokens": 64224258.0, "reward": 0.51171875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5262222222222221, "grad_norm": 1.7191187286350949, "kl": 0.529541015625, "learning_rate": 7.094546510426994e-08, "loss": 0.0005, "num_tokens": 64374342.0, "reward": 0.55859375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.529777777777778, "grad_norm": 0.6477386570332615, "kl": 0.448974609375, "learning_rate": 6.994273741626405e-08, "loss": 0.0004, "num_tokens": 64524470.0, "reward": 0.59375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5333333333333332, "grad_norm": 0.49484841194714935, "kl": 0.42822265625, "learning_rate": 6.8945992689391e-08, "loss": 0.0004, "num_tokens": 64674642.0, "reward": 0.57421875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.536888888888889, "grad_norm": 0.5641664834282943, "kl": 0.511962890625, "learning_rate": 6.795526404358628e-08, "loss": 0.0005, "num_tokens": 64824714.0, "reward": 0.55859375, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5404444444444443, "grad_norm": 0.7774048704942202, "kl": 0.4755859375, "learning_rate": 6.697058439888283e-08, "loss": 0.0005, "num_tokens": 64974826.0, "reward": 0.52734375, "reward_std": 0.08307026326656342, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.544, "grad_norm": 0.5632886984383448, "kl": 0.44482421875, "learning_rate": 6.599198647431642e-08, "loss": 0.0004, "num_tokens": 65125010.0, "reward": 0.5390625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5475555555555556, "grad_norm": 0.6453682372916957, "kl": 0.515869140625, "learning_rate": 6.501950278683907e-08, "loss": 0.0005, "num_tokens": 65275094.0, "reward": 0.54296875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.551111111111111, "grad_norm": 0.5003006401865986, "kl": 0.449462890625, "learning_rate": 6.405316565023805e-08, "loss": 0.0004, "num_tokens": 65425174.0, "reward": 0.55078125, "reward_std": 0.03619525954127312, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5546666666666666, "grad_norm": 0.468183524036864, "kl": 0.427490234375, "learning_rate": 6.309300717406274e-08, "loss": 0.0004, "num_tokens": 65575170.0, "reward": 0.578125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5582222222222222, "grad_norm": 0.4013690840146761, "kl": 0.392333984375, "learning_rate": 6.213905926255697e-08, "loss": 0.0004, "num_tokens": 65725174.0, "reward": 0.5625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.561777777777778, "grad_norm": 0.5052845789767202, "kl": 0.395263671875, "learning_rate": 6.119135361359965e-08, "loss": 0.0004, "num_tokens": 65875286.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5653333333333332, "grad_norm": 0.7242539146561551, "kl": 0.483642578125, "learning_rate": 6.024992171765089e-08, "loss": 0.0005, "num_tokens": 66025386.0, "reward": 0.5859375, "reward_std": 0.07866839319467545, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.568888888888889, "grad_norm": 0.3684742193914073, "kl": 0.396728515625, "learning_rate": 5.9314794856705983e-08, "loss": 0.0004, "num_tokens": 66175466.0, "reward": 0.546875, "reward_std": 0.037403859198093414, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5724444444444443, "grad_norm": 0.49890493402893316, "kl": 0.42822265625, "learning_rate": 5.8386004103255975e-08, "loss": 0.0004, "num_tokens": 66325550.0, "reward": 0.53515625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.576, "grad_norm": 0.4438983691416006, "kl": 0.461181640625, "learning_rate": 5.7463580319254853e-08, "loss": 0.0005, "num_tokens": 66475646.0, "reward": 0.51171875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5795555555555556, "grad_norm": 0.48341809431444255, "kl": 0.456298828125, "learning_rate": 5.6547554155094626e-08, "loss": 0.0005, "num_tokens": 66625682.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5831111111111111, "grad_norm": 0.7056142221033674, "kl": 0.470703125, "learning_rate": 5.563795604858615e-08, "loss": 0.0005, "num_tokens": 66775798.0, "reward": 0.52734375, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5866666666666667, "grad_norm": 6.779742188034057, "kl": 0.804931640625, "learning_rate": 5.473481622394849e-08, "loss": 0.0008, "num_tokens": 66925906.0, "reward": 0.546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5902222222222222, "grad_norm": 1.40942249947706, "kl": 0.533203125, "learning_rate": 5.3838164690803935e-08, "loss": 0.0005, "num_tokens": 67075970.0, "reward": 0.51171875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5937777777777777, "grad_norm": 0.5709869106367856, "kl": 0.514892578125, "learning_rate": 5.294803124318145e-08, "loss": 0.0005, "num_tokens": 67226142.0, "reward": 0.5390625, "reward_std": 0.047418396919965744, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5973333333333333, "grad_norm": 0.7428039346413163, "kl": 0.47802734375, "learning_rate": 5.20644454585262e-08, "loss": 0.0005, "num_tokens": 67376314.0, "reward": 0.51171875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.600888888888889, "grad_norm": 0.47757983506186286, "kl": 0.44580078125, "learning_rate": 5.1187436696716906e-08, "loss": 0.0004, "num_tokens": 67526314.0, "reward": 0.52734375, "reward_std": 0.030584799125790596, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6044444444444443, "grad_norm": 0.6183437552202213, "kl": 0.5078125, "learning_rate": 5.0317034099090524e-08, "loss": 0.0005, "num_tokens": 67676354.0, "reward": 0.52734375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.608, "grad_norm": 0.584260562502066, "kl": 0.486083984375, "learning_rate": 4.9453266587473423e-08, "loss": 0.0005, "num_tokens": 67826470.0, "reward": 0.5546875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6115555555555554, "grad_norm": 0.536997119691752, "kl": 0.43310546875, "learning_rate": 4.859616286322094e-08, "loss": 0.0004, "num_tokens": 67976526.0, "reward": 0.5546875, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6151111111111112, "grad_norm": 0.4285196702276041, "kl": 0.413818359375, "learning_rate": 4.774575140626316e-08, "loss": 0.0004, "num_tokens": 68126606.0, "reward": 0.55859375, "reward_std": 0.025854695588350296, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6186666666666667, "grad_norm": 0.6432882809805262, "kl": 0.40478515625, "learning_rate": 4.6902060474159036e-08, "loss": 0.0004, "num_tokens": 68276658.0, "reward": 0.59375, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6222222222222222, "grad_norm": 0.670105632966217, "kl": 0.43212890625, "learning_rate": 4.6065118101157016e-08, "loss": 0.0004, "num_tokens": 68426750.0, "reward": 0.5078125, "reward_std": 0.06204995512962341, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6257777777777778, "grad_norm": 0.7554410323140208, "kl": 0.504150390625, "learning_rate": 4.5234952097263965e-08, "loss": 0.0005, "num_tokens": 68576790.0, "reward": 0.5625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6293333333333333, "grad_norm": 0.5383642917248337, "kl": 0.489013671875, "learning_rate": 4.4411590047320617e-08, "loss": 0.0005, "num_tokens": 68726862.0, "reward": 0.5625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6328888888888888, "grad_norm": 0.6764051889067699, "kl": 0.48876953125, "learning_rate": 4.359505931008553e-08, "loss": 0.0005, "num_tokens": 68877026.0, "reward": 0.5546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6364444444444444, "grad_norm": 0.608366482588541, "kl": 0.46923828125, "learning_rate": 4.278538701732534e-08, "loss": 0.0005, "num_tokens": 69027150.0, "reward": 0.546875, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6400000000000001, "grad_norm": 0.6650025403400517, "kl": 0.48193359375, "learning_rate": 4.198260007291399e-08, "loss": 0.0005, "num_tokens": 69177238.0, "reward": 0.5, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6435555555555554, "grad_norm": 0.4721462056014319, "kl": 0.51953125, "learning_rate": 4.118672515193794e-08, "loss": 0.0005, "num_tokens": 69327298.0, "reward": 0.5390625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6471111111111112, "grad_norm": 0.6621926942392276, "kl": 0.495361328125, "learning_rate": 4.039778869981064e-08, "loss": 0.0005, "num_tokens": 69477314.0, "reward": 0.546875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6506666666666665, "grad_norm": 0.7308447153300471, "kl": 0.541748046875, "learning_rate": 3.961581693139307e-08, "loss": 0.0005, "num_tokens": 69627398.0, "reward": 0.55859375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6542222222222223, "grad_norm": 0.847646129412192, "kl": 0.5849609375, "learning_rate": 3.884083583012318e-08, "loss": 0.0006, "num_tokens": 69777462.0, "reward": 0.58203125, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6577777777777778, "grad_norm": 0.5867478692263943, "kl": 0.453857421875, "learning_rate": 3.807287114715216e-08, "loss": 0.0005, "num_tokens": 69927534.0, "reward": 0.546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6613333333333333, "grad_norm": 0.5569296548464157, "kl": 0.53515625, "learning_rate": 3.731194840048915e-08, "loss": 0.0005, "num_tokens": 70077530.0, "reward": 0.58203125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6648888888888889, "grad_norm": 0.47294248502288805, "kl": 0.427490234375, "learning_rate": 3.655809287415284e-08, "loss": 0.0004, "num_tokens": 70227594.0, "reward": 0.5234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6684444444444444, "grad_norm": 0.4574981255397005, "kl": 0.4287109375, "learning_rate": 3.581132961733191e-08, "loss": 0.0004, "num_tokens": 70377714.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6720000000000002, "grad_norm": 0.40519803603997345, "kl": 0.4169921875, "learning_rate": 3.5071683443552045e-08, "loss": 0.0004, "num_tokens": 70527870.0, "reward": 0.52734375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6755555555555555, "grad_norm": 0.4840050767135489, "kl": 0.47607421875, "learning_rate": 3.433917892985208e-08, "loss": 0.0005, "num_tokens": 70677974.0, "reward": 0.4921875, "reward_std": 0.028382759541273117, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6791111111111112, "grad_norm": 0.5689276035825345, "kl": 0.447509765625, "learning_rate": 3.3613840415966764e-08, "loss": 0.0004, "num_tokens": 70828062.0, "reward": 0.5078125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6826666666666665, "grad_norm": 0.3906743686263679, "kl": 0.39892578125, "learning_rate": 3.2895692003518575e-08, "loss": 0.0004, "num_tokens": 70978234.0, "reward": 0.48828125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6862222222222223, "grad_norm": 0.6319095516033488, "kl": 0.4541015625, "learning_rate": 3.218475755521621e-08, "loss": 0.0005, "num_tokens": 71128366.0, "reward": 0.55859375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6897777777777778, "grad_norm": 0.6527920446086392, "kl": 0.404541015625, "learning_rate": 3.1481060694062365e-08, "loss": 0.0004, "num_tokens": 71278446.0, "reward": 0.53515625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6933333333333334, "grad_norm": 0.609657629006971, "kl": 0.41162109375, "learning_rate": 3.078462480256819e-08, "loss": 0.0004, "num_tokens": 71428610.0, "reward": 0.53125, "reward_std": 0.058313291519880295, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.696888888888889, "grad_norm": 0.8910468177123286, "kl": 0.513427734375, "learning_rate": 3.0095473021976794e-08, "loss": 0.0005, "num_tokens": 71578734.0, "reward": 0.5078125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7004444444444444, "grad_norm": 0.8425501635040524, "kl": 0.51416015625, "learning_rate": 2.9413628251493934e-08, "loss": 0.0005, "num_tokens": 71728878.0, "reward": 0.55078125, "reward_std": 0.09077189117670059, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.704, "grad_norm": 0.6418930440563599, "kl": 0.430908203125, "learning_rate": 2.8739113147527417e-08, "loss": 0.0004, "num_tokens": 71878958.0, "reward": 0.5703125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7075555555555555, "grad_norm": 1.0250618122706754, "kl": 0.630859375, "learning_rate": 2.8071950122934036e-08, "loss": 0.0006, "num_tokens": 72029110.0, "reward": 0.53515625, "reward_std": 0.07085589319467545, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7111111111111112, "grad_norm": 0.6070267863410984, "kl": 0.4892578125, "learning_rate": 2.7412161346275052e-08, "loss": 0.0005, "num_tokens": 72179178.0, "reward": 0.53515625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7146666666666666, "grad_norm": 0.41436971147539053, "kl": 0.473388671875, "learning_rate": 2.675976874107935e-08, "loss": 0.0005, "num_tokens": 72329226.0, "reward": 0.51171875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7182222222222223, "grad_norm": 0.7994525127067181, "kl": 0.623046875, "learning_rate": 2.611479398511518e-08, "loss": 0.0006, "num_tokens": 72479326.0, "reward": 0.51953125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7217777777777776, "grad_norm": 3.591555794980261, "kl": 0.92333984375, "learning_rate": 2.5477258509669614e-08, "loss": 0.0009, "num_tokens": 72629430.0, "reward": 0.55859375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7253333333333334, "grad_norm": 0.5523457619977477, "kl": 0.407958984375, "learning_rate": 2.4847183498836714e-08, "loss": 0.0004, "num_tokens": 72779562.0, "reward": 0.55078125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.728888888888889, "grad_norm": 0.5337513777324062, "kl": 0.489013671875, "learning_rate": 2.4224589888813263e-08, "loss": 0.0005, "num_tokens": 72929630.0, "reward": 0.49609375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7324444444444445, "grad_norm": 3.336975111696118, "kl": 0.9912109375, "learning_rate": 2.3609498367203467e-08, "loss": 0.001, "num_tokens": 73079714.0, "reward": 0.49609375, "reward_std": 0.06744526326656342, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.736, "grad_norm": 0.37339089559378696, "kl": 0.4267578125, "learning_rate": 2.300192937233128e-08, "loss": 0.0004, "num_tokens": 73229834.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7395555555555555, "grad_norm": 0.5184832733671315, "kl": 0.509033203125, "learning_rate": 2.240190309256143e-08, "loss": 0.0005, "num_tokens": 73379898.0, "reward": 0.5, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.743111111111111, "grad_norm": 0.937777108795673, "kl": 0.7412109375, "learning_rate": 2.1809439465628382e-08, "loss": 0.0007, "num_tokens": 73530006.0, "reward": 0.5625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7466666666666666, "grad_norm": 0.44792777397212485, "kl": 0.432373046875, "learning_rate": 2.122455817797428e-08, "loss": 0.0004, "num_tokens": 73680138.0, "reward": 0.50390625, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7502222222222223, "grad_norm": 1.8165365556826047, "kl": 1.04248046875, "learning_rate": 2.0647278664094188e-08, "loss": 0.001, "num_tokens": 73830170.0, "reward": 0.53125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7537777777777777, "grad_norm": 0.5746953789475809, "kl": 0.496337890625, "learning_rate": 2.007762010589098e-08, "loss": 0.0005, "num_tokens": 73980250.0, "reward": 0.515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7573333333333334, "grad_norm": 34.458999462219055, "kl": 9.58984375, "learning_rate": 1.9515601432037317e-08, "loss": 0.0097, "num_tokens": 74130306.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7608888888888887, "grad_norm": 1.1445300736653963, "kl": 0.626953125, "learning_rate": 1.8961241317347333e-08, "loss": 0.0006, "num_tokens": 74280438.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7644444444444445, "grad_norm": 0.530745335656726, "kl": 0.537109375, "learning_rate": 1.8414558182155456e-08, "loss": 0.0005, "num_tokens": 74430474.0, "reward": 0.50390625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.768, "grad_norm": 4.325748345041621, "kl": 0.811767578125, "learning_rate": 1.787557019170488e-08, "loss": 0.0008, "num_tokens": 74580578.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7715555555555556, "grad_norm": 0.6871408255583382, "kl": 0.51953125, "learning_rate": 1.734429525554365e-08, "loss": 0.0005, "num_tokens": 74730626.0, "reward": 0.4921875, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.775111111111111, "grad_norm": 0.5276217136164556, "kl": 0.505859375, "learning_rate": 1.6820751026929674e-08, "loss": 0.0005, "num_tokens": 74880714.0, "reward": 0.5, "reward_std": 0.04642495512962341, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7786666666666666, "grad_norm": 0.6364230686117014, "kl": 0.4501953125, "learning_rate": 1.6304954902244095e-08, "loss": 0.0005, "num_tokens": 75030838.0, "reward": 0.52734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7822222222222224, "grad_norm": 0.45223361174699134, "kl": 0.44482421875, "learning_rate": 1.5796924020413327e-08, "loss": 0.0004, "num_tokens": 75180934.0, "reward": 0.578125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7857777777777777, "grad_norm": 3.248583973869253, "kl": 0.61669921875, "learning_rate": 1.529667526233941e-08, "loss": 0.0006, "num_tokens": 75330966.0, "reward": 0.57421875, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7893333333333334, "grad_norm": 0.4374604603350181, "kl": 0.434326171875, "learning_rate": 1.4804225250339281e-08, "loss": 0.0004, "num_tokens": 75481026.0, "reward": 0.50390625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7928888888888888, "grad_norm": 3.0992385739650823, "kl": 0.75537109375, "learning_rate": 1.4319590347592254e-08, "loss": 0.0008, "num_tokens": 75631058.0, "reward": 0.546875, "reward_std": 0.06304339319467545, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7964444444444445, "grad_norm": 0.652594553880175, "kl": 0.421142578125, "learning_rate": 1.3842786657596446e-08, "loss": 0.0004, "num_tokens": 75781122.0, "reward": 0.52734375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8, "grad_norm": 0.765818438315625, "kl": 0.432373046875, "learning_rate": 1.3373830023633597e-08, "loss": 0.0004, "num_tokens": 75931254.0, "reward": 0.515625, "reward_std": 0.08956328779459, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9296875, "rewards/format_reward_func/std": 0.2566775679588318, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8035555555555556, "grad_norm": 0.3411460860044164, "kl": 0.439453125, "learning_rate": 1.2912736028242777e-08, "loss": 0.0004, "num_tokens": 76081346.0, "reward": 0.5390625, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8071111111111111, "grad_norm": 0.7582340748491105, "kl": 0.58740234375, "learning_rate": 1.2459519992702311e-08, "loss": 0.0006, "num_tokens": 76231422.0, "reward": 0.515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8106666666666666, "grad_norm": 0.6931442469315353, "kl": 0.43994140625, "learning_rate": 1.2014196976521035e-08, "loss": 0.0004, "num_tokens": 76381442.0, "reward": 0.578125, "reward_std": 0.07206448912620544, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8142222222222222, "grad_norm": 0.6242469027810024, "kl": 0.418701171875, "learning_rate": 1.1576781776937634e-08, "loss": 0.0004, "num_tokens": 76531578.0, "reward": 0.53125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8177777777777777, "grad_norm": 0.5319993004285838, "kl": 0.42333984375, "learning_rate": 1.1147288928429116e-08, "loss": 0.0004, "num_tokens": 76681742.0, "reward": 0.53125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8213333333333335, "grad_norm": 4.892495144239733, "kl": 1.937744140625, "learning_rate": 1.0725732702227735e-08, "loss": 0.0019, "num_tokens": 76831858.0, "reward": 0.515625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8248888888888888, "grad_norm": 1.0196354843303994, "kl": 0.447998046875, "learning_rate": 1.0312127105846947e-08, "loss": 0.0004, "num_tokens": 76981942.0, "reward": 0.5546875, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8284444444444445, "grad_norm": 0.7205299980402574, "kl": 0.420654296875, "learning_rate": 9.906485882615695e-09, "loss": 0.0004, "num_tokens": 77132030.0, "reward": 0.51953125, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8319999999999999, "grad_norm": 4.0685262934583095, "kl": 1.60107421875, "learning_rate": 9.50882251122212e-09, "loss": 0.0016, "num_tokens": 77282094.0, "reward": 0.52734375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8355555555555556, "grad_norm": 0.627281356606571, "kl": 0.442626953125, "learning_rate": 9.119150205265324e-09, "loss": 0.0004, "num_tokens": 77432230.0, "reward": 0.51953125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8391111111111111, "grad_norm": 0.3858806846831556, "kl": 0.45751953125, "learning_rate": 8.737481912816592e-09, "loss": 0.0005, "num_tokens": 77582310.0, "reward": 0.58984375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8426666666666667, "grad_norm": 0.23991137840603588, "kl": 0.4140625, "learning_rate": 8.363830315988945e-09, "loss": 0.0004, "num_tokens": 77732286.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8462222222222222, "grad_norm": 0.4740890347365351, "kl": 0.447998046875, "learning_rate": 7.99820783051583e-09, "loss": 0.0004, "num_tokens": 77882334.0, "reward": 0.56640625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8497777777777777, "grad_norm": 0.5700442254866067, "kl": 0.484375, "learning_rate": 7.640626605338624e-09, "loss": 0.0005, "num_tokens": 78032414.0, "reward": 0.515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8533333333333335, "grad_norm": 0.41681350867652445, "kl": 0.423095703125, "learning_rate": 7.291098522202776e-09, "loss": 0.0004, "num_tokens": 78182498.0, "reward": 0.546875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8568888888888888, "grad_norm": 0.7337724383393545, "kl": 0.47021484375, "learning_rate": 6.949635195263259e-09, "loss": 0.0005, "num_tokens": 78332678.0, "reward": 0.546875, "reward_std": 0.058313291519880295, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8604444444444446, "grad_norm": 0.4627777960335074, "kl": 0.410888671875, "learning_rate": 6.616247970698319e-09, "loss": 0.0004, "num_tokens": 78482726.0, "reward": 0.58203125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8639999999999999, "grad_norm": 0.9327871091407389, "kl": 0.59765625, "learning_rate": 6.290947926332835e-09, "loss": 0.0006, "num_tokens": 78632766.0, "reward": 0.5234375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8675555555555556, "grad_norm": 0.5684611750891795, "kl": 0.43798828125, "learning_rate": 5.97374587126992e-09, "loss": 0.0004, "num_tokens": 78782810.0, "reward": 0.5390625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.871111111111111, "grad_norm": 0.41112080183218475, "kl": 0.45068359375, "learning_rate": 5.664652345531845e-09, "loss": 0.0005, "num_tokens": 78932894.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8746666666666667, "grad_norm": 0.6226854601113873, "kl": 0.4345703125, "learning_rate": 5.363677619709933e-09, "loss": 0.0004, "num_tokens": 79082994.0, "reward": 0.54296875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8782222222222222, "grad_norm": 0.40926220755087916, "kl": 0.460205078125, "learning_rate": 5.070831694623135e-09, "loss": 0.0005, "num_tokens": 79233062.0, "reward": 0.55078125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8817777777777778, "grad_norm": 0.3715644209620836, "kl": 0.443359375, "learning_rate": 4.786124300985822e-09, "loss": 0.0004, "num_tokens": 79383142.0, "reward": 0.5859375, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8853333333333333, "grad_norm": 0.41303999956467335, "kl": 0.425048828125, "learning_rate": 4.509564899084328e-09, "loss": 0.0004, "num_tokens": 79533302.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8888888888888888, "grad_norm": 0.34711935286318774, "kl": 0.502685546875, "learning_rate": 4.241162678462806e-09, "loss": 0.0005, "num_tokens": 79683382.0, "reward": 0.5, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8924444444444446, "grad_norm": 1.749104726801777, "kl": 0.57666015625, "learning_rate": 3.9809265576176146e-09, "loss": 0.0006, "num_tokens": 79833494.0, "reward": 0.5546875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.896, "grad_norm": 0.317239752869522, "kl": 0.460693359375, "learning_rate": 3.7288651837012745e-09, "loss": 0.0005, "num_tokens": 79983606.0, "reward": 0.5234375, "reward_std": 0.009021097794175148, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8995555555555557, "grad_norm": 0.5981545675620618, "kl": 0.4501953125, "learning_rate": 3.4849869322348126e-09, "loss": 0.0005, "num_tokens": 80133666.0, "reward": 0.5234375, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.903111111111111, "grad_norm": 1.8987553387497587, "kl": 0.595703125, "learning_rate": 3.249299906829761e-09, "loss": 0.0006, "num_tokens": 80283778.0, "reward": 0.5234375, "reward_std": 0.037403859198093414, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9066666666666667, "grad_norm": 1.3482962026459895, "kl": 0.652587890625, "learning_rate": 3.0218119389186502e-09, "loss": 0.0007, "num_tokens": 80433874.0, "reward": 0.57421875, "reward_std": 0.04620979726314545, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9102222222222223, "grad_norm": 0.29399766813544437, "kl": 0.393798828125, "learning_rate": 2.8025305874949945e-09, "loss": 0.0004, "num_tokens": 80583958.0, "reward": 0.58203125, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9137777777777778, "grad_norm": 0.5162389083983054, "kl": 0.40966796875, "learning_rate": 2.5914631388619103e-09, "loss": 0.0004, "num_tokens": 80734106.0, "reward": 0.5390625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9173333333333333, "grad_norm": 0.5828591533810914, "kl": 0.423583984375, "learning_rate": 2.388616606390198e-09, "loss": 0.0004, "num_tokens": 80884218.0, "reward": 0.5234375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9208888888888889, "grad_norm": 0.5133961922979738, "kl": 0.511962890625, "learning_rate": 2.193997730285141e-09, "loss": 0.0005, "num_tokens": 81034370.0, "reward": 0.5390625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9244444444444444, "grad_norm": 0.6682775620951583, "kl": 0.51123046875, "learning_rate": 2.0076129773627103e-09, "loss": 0.0005, "num_tokens": 81184450.0, "reward": 0.5, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.928, "grad_norm": 0.58701555865206, "kl": 0.44287109375, "learning_rate": 1.8294685408345167e-09, "loss": 0.0004, "num_tokens": 81334506.0, "reward": 0.5625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9315555555555557, "grad_norm": 0.5226655218446554, "kl": 0.406005859375, "learning_rate": 1.6595703401020844e-09, "loss": 0.0004, "num_tokens": 81484590.0, "reward": 0.55078125, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.935111111111111, "grad_norm": 0.44785902127312094, "kl": 0.45068359375, "learning_rate": 1.497924020560204e-09, "loss": 0.0005, "num_tokens": 81634682.0, "reward": 0.51171875, "reward_std": 0.030584799125790596, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9386666666666668, "grad_norm": 1.2271943789655686, "kl": 0.521728515625, "learning_rate": 1.3445349534093598e-09, "loss": 0.0005, "num_tokens": 81784818.0, "reward": 0.53515625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.942222222222222, "grad_norm": 0.34440149179608015, "kl": 0.428955078125, "learning_rate": 1.199408235477123e-09, "loss": 0.0004, "num_tokens": 81934862.0, "reward": 0.5390625, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9457777777777778, "grad_norm": 0.5299180419650383, "kl": 0.398681640625, "learning_rate": 1.0625486890488978e-09, "loss": 0.0004, "num_tokens": 82084958.0, "reward": 0.5078125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9493333333333334, "grad_norm": 0.7383076800700398, "kl": 0.576171875, "learning_rate": 9.339608617077165e-10, "loss": 0.0006, "num_tokens": 82234994.0, "reward": 0.515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.952888888888889, "grad_norm": 2.413667288202016, "kl": 0.955322265625, "learning_rate": 8.136490261830553e-10, "loss": 0.001, "num_tokens": 82385046.0, "reward": 0.5078125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9564444444444444, "grad_norm": 0.621725268878221, "kl": 0.462890625, "learning_rate": 7.016171802088633e-10, "loss": 0.0005, "num_tokens": 82535218.0, "reward": 0.5390625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.96, "grad_norm": 0.23655919159399819, "kl": 0.40673828125, "learning_rate": 5.978690463908087e-10, "loss": 0.0004, "num_tokens": 82685354.0, "reward": 0.55859375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9635555555555557, "grad_norm": 0.47785457137586707, "kl": 0.448974609375, "learning_rate": 5.024080720824608e-10, "loss": 0.0004, "num_tokens": 82835570.0, "reward": 0.51953125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.967111111111111, "grad_norm": 0.346206288127188, "kl": 0.446533203125, "learning_rate": 4.152374292708538e-10, "loss": 0.0004, "num_tokens": 82985646.0, "reward": 0.51953125, "reward_std": 0.025854695588350296, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9706666666666668, "grad_norm": 0.3832712574599466, "kl": 0.40869140625, "learning_rate": 3.363600144710155e-10, "loss": 0.0004, "num_tokens": 83135678.0, "reward": 0.55078125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.974222222222222, "grad_norm": 0.24281220847577703, "kl": 0.391357421875, "learning_rate": 2.6577844862973877e-10, "loss": 0.0004, "num_tokens": 83285698.0, "reward": 0.52734375, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9777777777777779, "grad_norm": 0.9582617582460151, "kl": 0.546142578125, "learning_rate": 2.0349507703851243e-10, "loss": 0.0005, "num_tokens": 83435778.0, "reward": 0.57421875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9813333333333332, "grad_norm": 0.35323783838354617, "kl": 0.42431640625, "learning_rate": 1.4951196925561127e-10, "loss": 0.0004, "num_tokens": 83585798.0, "reward": 0.54296875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.984888888888889, "grad_norm": 1.4092624453276854, "kl": 0.759521484375, "learning_rate": 1.0383091903720665e-10, "loss": 0.0008, "num_tokens": 83735938.0, "reward": 0.5625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9884444444444445, "grad_norm": 0.7220556700564954, "kl": 0.511474609375, "learning_rate": 6.645344427794186e-11, "loss": 0.0005, "num_tokens": 83886042.0, "reward": 0.53125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.992, "grad_norm": 0.3915357093577265, "kl": 0.454833984375, "learning_rate": 3.738078696036151e-11, "loss": 0.0005, "num_tokens": 84036218.0, "reward": 0.5234375, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9955555555555555, "grad_norm": 0.4708155060402308, "kl": 0.412353515625, "learning_rate": 1.6613913113694423e-11, "loss": 0.0004, "num_tokens": 84186286.0, "reward": 0.578125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.999111111111111, "grad_norm": 0.7953265607446423, "kl": 0.6337890625, "learning_rate": 4.153512781768231e-12, "loss": 0.0006, "num_tokens": 84336314.0, "reward": 0.5078125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 562 }, { "epoch": 1.999111111111111, "step": 562, "total_flos": 0.0, "train_loss": 0.010520504666011201, "train_runtime": 15057.2865, "train_samples_per_second": 1.195, "train_steps_per_second": 0.037 } ], "logging_steps": 1, "max_steps": 562, "num_input_tokens_seen": 84336314, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }