{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05714285714285714, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5208333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 2584.104248046875, "completions/mean_terminated_length": 1497.2608642578125, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.26198074221611023, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0022, "num_tokens": 131153.0, "policy_entropy_avg": 8.125, "reward": 0.3948305547237396, "reward_std": 0.7732391357421875, "rewards/cosine_scaled_reward/mean": -0.062009382992982864, "rewards/cosine_scaled_reward/std": 0.43048128485679626, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.504852294921875, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 2761.666748046875, "completions/mean_terminated_length": 1610.4000244140625, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2314005047082901, "kl": 0.0, "learning_rate": 2e-07, "loss": -0.0045, "num_tokens": 271243.0, "policy_entropy_avg": 8.125, "reward": 0.4077601432800293, "reward_std": 0.8425893187522888, "rewards/cosine_scaled_reward/mean": -0.003428752301260829, "rewards/cosine_scaled_reward/std": 0.4935320317745209, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49822381138801575, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 3343.33349609375, "completions/mean_terminated_length": 1658.666748046875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.19728681445121765, "kl": 0.0006656646728515625, "learning_rate": 4e-07, "loss": 0.0095, "num_tokens": 439577.0, "policy_entropy_avg": 8.125, "reward": -0.15455231070518494, "reward_std": 0.5764515995979309, "rewards/cosine_scaled_reward/mean": -0.17141447961330414, "rewards/cosine_scaled_reward/std": 0.32203689217567444, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3944427967071533, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.39583333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2226.89599609375, "completions/mean_terminated_length": 1337.7586669921875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2950705587863922, "kl": 0.0006043116251627604, "learning_rate": 6e-07, "loss": -0.001, "num_tokens": 553824.0, "policy_entropy_avg": 8.125, "reward": 0.4680083394050598, "reward_std": 0.8357078433036804, "rewards/cosine_scaled_reward/mean": -0.09815327078104019, "rewards/cosine_scaled_reward/std": 0.399366170167923, "rewards/format_reward/mean": 0.6666666865348816, "rewards/format_reward/std": 0.47639307379722595, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 3089.104248046875, "completions/mean_terminated_length": 1887.21435546875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2482743263244629, "kl": 0.000629425048828125, "learning_rate": 8e-07, "loss": 0.0028, "num_tokens": 710213.0, "policy_entropy_avg": 8.125, "reward": -0.06966459006071091, "reward_std": 0.7608852386474609, "rewards/cosine_scaled_reward/mean": -0.20167399942874908, "rewards/cosine_scaled_reward/std": 0.3204644024372101, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.47639307379722595, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7916666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 3421.0, "completions/mean_length": 3119.52099609375, "completions/mean_terminated_length": 1354.5, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24190759658813477, "kl": 0.0006701151529947916, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 868686.0, "policy_entropy_avg": 8.125, "reward": 0.026811789721250534, "reward_std": 0.7506579756736755, "rewards/cosine_scaled_reward/mean": -0.1427767425775528, "rewards/cosine_scaled_reward/std": 0.3361252248287201, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4684174358844757, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 3024.291748046875, "completions/mean_terminated_length": 2362.818359375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.20822857320308685, "kl": 0.0005512237548828125, "learning_rate": 9.989038226169207e-07, "loss": -0.009, "num_tokens": 1021658.0, "policy_entropy_avg": 8.125, "reward": 0.47669005393981934, "reward_std": 0.9081848859786987, "rewards/cosine_scaled_reward/mean": -0.031290601938962936, "rewards/cosine_scaled_reward/std": 0.47983497381210327, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5035336017608643, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 2791.875, "completions/mean_terminated_length": 1582.8421630859375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.23235374689102173, "kl": 0.0005970001220703125, "learning_rate": 9.956206309337066e-07, "loss": -0.0081, "num_tokens": 1163480.0, "policy_entropy_avg": 8.125, "reward": 0.5300650596618652, "reward_std": 0.7924127578735352, "rewards/cosine_scaled_reward/mean": 0.03719766065478325, "rewards/cosine_scaled_reward/std": 0.4377634525299072, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5035336017608643, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 3142.95849609375, "completions/mean_terminated_length": 2071.857177734375, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.21945622563362122, "kl": 0.0006663004557291666, "learning_rate": 9.901664203302124e-07, "loss": 0.002, "num_tokens": 1322934.0, "policy_entropy_avg": 8.125, "reward": 0.09029825031757355, "reward_std": 0.8250617980957031, "rewards/cosine_scaled_reward/mean": -0.1421239972114563, "rewards/cosine_scaled_reward/std": 0.3718816637992859, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48924607038497925, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 2639.791748046875, "completions/mean_terminated_length": 1066.111083984375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2775964140892029, "kl": 0.0005779266357421875, "learning_rate": 9.825677631722435e-07, "loss": -0.0111, "num_tokens": 1457768.0, "policy_entropy_avg": 8.125, "reward": 0.31791985034942627, "reward_std": 0.7219366431236267, "rewards/cosine_scaled_reward/mean": -0.03815798461437225, "rewards/cosine_scaled_reward/std": 0.4010634124279022, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.49420398473739624, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 3260.8125, "completions/mean_terminated_length": 1860.3333740234375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.21065565943717957, "kl": 0.0005486806233723959, "learning_rate": 9.728616793536587e-07, "loss": 0.0145, "num_tokens": 1623041.0, "policy_entropy_avg": 8.135416666666666, "reward": -0.1468753218650818, "reward_std": 0.909512996673584, "rewards/cosine_scaled_reward/mean": -0.18839001655578613, "rewards/cosine_scaled_reward/std": 0.377286821603775, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.4247443675994873, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 2480.791748046875, "completions/mean_terminated_length": 1692.7857666015625, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.33562034368515015, "kl": 0.0005970001220703125, "learning_rate": 9.610954559391704e-07, "loss": 0.0138, "num_tokens": 1750327.0, "policy_entropy_avg": 8.125, "reward": 0.425642192363739, "reward_std": 0.823100745677948, "rewards/cosine_scaled_reward/mean": -0.08819279819726944, "rewards/cosine_scaled_reward/std": 0.44234269857406616, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.49420398473739624, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 2816.14599609375, "completions/mean_terminated_length": 1741.1500244140625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.27906712889671326, "kl": 0.0005658467610677084, "learning_rate": 9.473264167865171e-07, "loss": 0.0016, "num_tokens": 1893782.0, "policy_entropy_avg": 8.125, "reward": 0.24736499786376953, "reward_std": 0.7155070304870605, "rewards/cosine_scaled_reward/mean": -0.09444598108530045, "rewards/cosine_scaled_reward/std": 0.4492030441761017, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5013279914855957, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 2769.0, "completions/mean_terminated_length": 1628.0, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.27333828806877136, "kl": 0.000553131103515625, "learning_rate": 9.316216432703916e-07, "loss": 0.0017, "num_tokens": 2034650.0, "policy_entropy_avg": 8.125, "reward": 0.1486106812953949, "reward_std": 0.8035473227500916, "rewards/cosine_scaled_reward/mean": -0.1336546093225479, "rewards/cosine_scaled_reward/std": 0.3953794538974762, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49822381138801575, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3186.0, "completions/mean_length": 2703.08349609375, "completions/mean_terminated_length": 1234.888916015625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2734379768371582, "kl": 0.0005137125651041666, "learning_rate": 9.140576474687263e-07, "loss": -0.0122, "num_tokens": 2172588.0, "policy_entropy_avg": 8.125, "reward": 0.406665563583374, "reward_std": 0.3276861608028412, "rewards/cosine_scaled_reward/mean": 0.0168545451015234, "rewards/cosine_scaled_reward/std": 0.4574853479862213, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48924607038497925, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9791666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 3571.5, "completions/mean_terminated_length": 2984.0, "completions/min_length": 2984.0, "completions/min_terminated_length": 2984.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.1932428628206253, "kl": 0.0006643931070963541, "learning_rate": 8.9471999940354e-07, "loss": 0.0075, "num_tokens": 2351850.0, "policy_entropy_avg": 8.135416666666666, "reward": -0.3992506265640259, "reward_std": 0.5042399168014526, "rewards/cosine_scaled_reward/mean": -0.22146178781986237, "rewards/cosine_scaled_reward/std": 0.292772501707077, "rewards/format_reward/mean": 0.0416666679084301, "rewards/format_reward/std": 0.20194092392921448, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.39583333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3475.0, "completions/mean_length": 2287.416748046875, "completions/mean_terminated_length": 1437.9310302734375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.37555888295173645, "kl": 0.0006338755289713541, "learning_rate": 8.737029101523929e-07, "loss": -0.0011, "num_tokens": 2469536.0, "policy_entropy_avg": 8.125, "reward": 0.5107091665267944, "reward_std": 0.8238445520401001, "rewards/cosine_scaled_reward/mean": -0.04544559493660927, "rewards/cosine_scaled_reward/std": 0.45671001076698303, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.49420398473739624, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 2911.89599609375, "completions/mean_terminated_length": 1433.2667236328125, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.21075770258903503, "kl": 0.0006434122721354166, "learning_rate": 8.511087728614862e-07, "loss": 0.0029, "num_tokens": 2617089.0, "policy_entropy_avg": 8.135416666666666, "reward": -0.13376453518867493, "reward_std": 0.6403241157531738, "rewards/cosine_scaled_reward/mean": -0.2234683483839035, "rewards/cosine_scaled_reward/std": 0.2743138074874878, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4684174358844757, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3400.0, "completions/mean_length": 2844.5, "completions/mean_terminated_length": 1893.71435546875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24317172169685364, "kl": 0.0006122589111328125, "learning_rate": 8.270476638965461e-07, "loss": -0.0105, "num_tokens": 2762067.0, "policy_entropy_avg": 8.125, "reward": 0.7856847643852234, "reward_std": 0.5978894829750061, "rewards/cosine_scaled_reward/mean": 0.15523308515548706, "rewards/cosine_scaled_reward/std": 0.5373290181159973, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5048523545265198, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47916666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 2482.83349609375, "completions/mean_terminated_length": 1469.760009765625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26188376545906067, "kl": 0.0005286534627278646, "learning_rate": 8.01636806561836e-07, "loss": -0.0042, "num_tokens": 2889757.0, "policy_entropy_avg": 8.125, "reward": 0.5357545614242554, "reward_std": 0.7750095129013062, "rewards/cosine_scaled_reward/mean": -0.03285994753241539, "rewards/cosine_scaled_reward/std": 0.4009867310523987, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.49420398473739624, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 2631.70849609375, "completions/mean_terminated_length": 1178.2105712890625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.31193122267723083, "kl": 0.0006847381591796875, "learning_rate": 7.75e-07, "loss": 0.0002, "num_tokens": 3024185.0, "policy_entropy_avg": 8.125, "reward": 0.18238481879234314, "reward_std": 0.4078831374645233, "rewards/cosine_scaled_reward/mean": -0.11668267101049423, "rewards/cosine_scaled_reward/std": 0.3962862193584442, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49822381138801575, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27083333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3239.0, "completions/mean_length": 1697.2083740234375, "completions/mean_terminated_length": 996.4000244140625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.40179967880249023, "kl": 0.0006052652994791666, "learning_rate": 7.472670160550848e-07, "loss": -0.005, "num_tokens": 3112413.0, "policy_entropy_avg": 8.125, "reward": 0.6634411811828613, "reward_std": 0.5782728791236877, "rewards/cosine_scaled_reward/mean": -0.06244581937789917, "rewards/cosine_scaled_reward/std": 0.4282727539539337, "rewards/format_reward/mean": 0.7916666865348816, "rewards/format_reward/std": 0.41041406989097595, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.39583333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3421.0, "completions/mean_length": 2181.104248046875, "completions/mean_terminated_length": 1261.9654541015625, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.3063512444496155, "kl": 0.0006097157796223959, "learning_rate": 7.185729670371604e-07, "loss": 0.0002, "num_tokens": 3225200.0, "policy_entropy_avg": 8.125, "reward": 0.3287263512611389, "reward_std": 0.8908068537712097, "rewards/cosine_scaled_reward/mean": -0.14731089770793915, "rewards/cosine_scaled_reward/std": 0.42148637771606445, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.48924607038497925, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 2681.229248046875, "completions/mean_terminated_length": 1614.3182373046875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.26260870695114136, "kl": 0.0006230672200520834, "learning_rate": 6.890576474687263e-07, "loss": 0.001, "num_tokens": 3362095.0, "policy_entropy_avg": 8.125, "reward": 0.3754323720932007, "reward_std": 0.7894452810287476, "rewards/cosine_scaled_reward/mean": -0.061340540647506714, "rewards/cosine_scaled_reward/std": 0.4359513223171234, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5052911639213562, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 2590.166748046875, "completions/mean_terminated_length": 1312.3809814453125, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23011414706707, "kl": 0.0007470448811848959, "learning_rate": 6.588648530198504e-07, "loss": 0.0021, "num_tokens": 3494145.0, "policy_entropy_avg": 8.125, "reward": 0.3740345239639282, "reward_std": 0.7695837020874023, "rewards/cosine_scaled_reward/mean": -0.03079296462237835, "rewards/cosine_scaled_reward/std": 0.44012707471847534, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5013279914855957, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 2929.666748046875, "completions/mean_terminated_length": 2088.381103515625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23103339970111847, "kl": 0.0006039937337239584, "learning_rate": 6.281416799501187e-07, "loss": 0.0037, "num_tokens": 3642743.0, "policy_entropy_avg": 8.125, "reward": 0.2619829773902893, "reward_std": 0.6574144959449768, "rewards/cosine_scaled_reward/mean": -0.10793358087539673, "rewards/cosine_scaled_reward/std": 0.4338338077068329, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5048523545265198, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 2908.20849609375, "completions/mean_terminated_length": 1556.625, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2881726324558258, "kl": 0.000667572021484375, "learning_rate": 5.97037808470444e-07, "loss": -0.0, "num_tokens": 3790053.0, "policy_entropy_avg": 8.125, "reward": 0.041869934648275375, "reward_std": 0.7798717021942139, "rewards/cosine_scaled_reward/mean": -0.1560431718826294, "rewards/cosine_scaled_reward/std": 0.29862359166145325, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4833211302757263, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 2831.52099609375, "completions/mean_terminated_length": 1683.0, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.24444623291492462, "kl": 0.0005861918131510416, "learning_rate": 5.657047735161255e-07, "loss": 0.0062, "num_tokens": 3933718.0, "policy_entropy_avg": 8.125, "reward": 0.4484003484249115, "reward_std": 0.8719537258148193, "rewards/cosine_scaled_reward/mean": 0.006576786283403635, "rewards/cosine_scaled_reward/std": 0.4855944514274597, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5013279914855957, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7916666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 3182.02099609375, "completions/mean_terminated_length": 1654.5, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22512345016002655, "kl": 0.0006993611653645834, "learning_rate": 5.342952264838747e-07, "loss": 0.0099, "num_tokens": 4094309.0, "policy_entropy_avg": 8.125, "reward": -0.1955069899559021, "reward_std": 0.668115496635437, "rewards/cosine_scaled_reward/mean": -0.21282805502414703, "rewards/cosine_scaled_reward/std": 0.37752190232276917, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.4247443675994873, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 2794.666748046875, "completions/mean_terminated_length": 1779.8095703125, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22448024153709412, "kl": 0.0006434122721354166, "learning_rate": 5.02962191529556e-07, "loss": 0.0021, "num_tokens": 4236355.0, "policy_entropy_avg": 8.125, "reward": 0.38854461908340454, "reward_std": 0.8984581828117371, "rewards/cosine_scaled_reward/mean": -0.0443347692489624, "rewards/cosine_scaled_reward/std": 0.44917213916778564, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5048523545265198, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7708333333333334, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 3039.39599609375, "completions/mean_terminated_length": 1207.5455322265625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.22645282745361328, "kl": 0.0006421407063802084, "learning_rate": 4.7185832004988133e-07, "loss": 0.009, "num_tokens": 4390118.0, "policy_entropy_avg": 8.125, "reward": -0.09765049070119858, "reward_std": 0.6962664127349854, "rewards/cosine_scaled_reward/mean": -0.1740705966949463, "rewards/cosine_scaled_reward/std": 0.4055609405040741, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4375949800014496, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 3097.125, "completions/mean_terminated_length": 2415.5, "completions/min_length": 1046.0, "completions/min_terminated_length": 1046.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.19842347502708435, "kl": 0.000629425048828125, "learning_rate": 4.4113514698014953e-07, "loss": -0.0137, "num_tokens": 4546544.0, "policy_entropy_avg": 8.125, "reward": 0.6700344085693359, "reward_std": 0.7424625158309937, "rewards/cosine_scaled_reward/mean": 0.10753399133682251, "rewards/cosine_scaled_reward/std": 0.5346410274505615, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5035336017608643, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 3236.604248046875, "completions/mean_terminated_length": 2194.416748046875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19302453100681305, "kl": 0.0005734761555989584, "learning_rate": 4.1094235253127374e-07, "loss": 0.0048, "num_tokens": 4710313.0, "policy_entropy_avg": 8.125, "reward": -0.12012770771980286, "reward_std": 0.6003495454788208, "rewards/cosine_scaled_reward/mean": -0.1957823485136032, "rewards/cosine_scaled_reward/std": 0.28730008006095886, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.4490928649902344, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3326.0, "completions/mean_length": 2303.45849609375, "completions/mean_terminated_length": 1219.923095703125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.273879736661911, "kl": 0.0007279713948567709, "learning_rate": 3.8142703296283953e-07, "loss": 0.0012, "num_tokens": 4828043.0, "policy_entropy_avg": 8.125, "reward": 0.5595396757125854, "reward_std": 0.9288837313652039, "rewards/cosine_scaled_reward/mean": -7.428725803038105e-05, "rewards/cosine_scaled_reward/std": 0.5000401139259338, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.5013279914855957, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7291666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 3022.52099609375, "completions/mean_terminated_length": 1510.84619140625, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.25011301040649414, "kl": 0.0006783803304036459, "learning_rate": 3.5273298394491515e-07, "loss": 0.0054, "num_tokens": 4981746.0, "policy_entropy_avg": 8.125, "reward": 0.1530809849500656, "reward_std": 0.9632071256637573, "rewards/cosine_scaled_reward/mean": -0.07932490855455399, "rewards/cosine_scaled_reward/std": 0.4641749858856201, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4684174358844757, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 3256.479248046875, "completions/mean_terminated_length": 1837.2222900390625, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.21851785480976105, "kl": 0.0007712046305338541, "learning_rate": 3.250000000000001e-07, "loss": 0.0035, "num_tokens": 5146391.0, "policy_entropy_avg": 8.135416666666666, "reward": -0.2397136390209198, "reward_std": 0.3913343846797943, "rewards/cosine_scaled_reward/mean": -0.23504245281219482, "rewards/cosine_scaled_reward/std": 0.17867261171340942, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.4247443675994873, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7708333333333334, "completions/max_length": 3584.0, "completions/max_terminated_length": 3151.0, "completions/mean_length": 3157.70849609375, "completions/mean_terminated_length": 1723.8182373046875, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2363603562116623, "kl": 0.0006186167399088541, "learning_rate": 2.9836319343816397e-07, "loss": 0.0063, "num_tokens": 5306229.0, "policy_entropy_avg": 8.125, "reward": -0.23191750049591064, "reward_std": 0.505261242389679, "rewards/cosine_scaled_reward/mean": -0.24154144525527954, "rewards/cosine_scaled_reward/std": 0.23630201816558838, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4375949800014496, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7916666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 3111.854248046875, "completions/mean_terminated_length": 1317.7000732421875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.19270718097686768, "kl": 0.0006771087646484375, "learning_rate": 2.729523361034538e-07, "loss": -0.0004, "num_tokens": 5464382.0, "policy_entropy_avg": 8.125, "reward": 0.06942842155694962, "reward_std": 0.4463602900505066, "rewards/cosine_scaled_reward/mean": -0.07969469577074051, "rewards/cosine_scaled_reward/std": 0.3691597282886505, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.4247443675994873, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 2799.166748046875, "completions/mean_terminated_length": 1700.4000244140625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2799771726131439, "kl": 0.0005861918131510416, "learning_rate": 2.488912271385139e-07, "loss": -0.0356, "num_tokens": 5606830.0, "policy_entropy_avg": 8.125, "reward": 0.41357025504112244, "reward_std": 0.3624642491340637, "rewards/cosine_scaled_reward/mean": -0.04217575863003731, "rewards/cosine_scaled_reward/std": 0.4245593845844269, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5052911639213562, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 2401.95849609375, "completions/mean_terminated_length": 1005.0, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2853446900844574, "kl": 0.000675201416015625, "learning_rate": 2.2629708984760706e-07, "loss": -0.0033, "num_tokens": 5729678.0, "policy_entropy_avg": 8.125, "reward": 0.2566830515861511, "reward_std": 0.573390781879425, "rewards/cosine_scaled_reward/mean": -0.11059689521789551, "rewards/cosine_scaled_reward/std": 0.43331247568130493, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5048523545265198, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 2863.45849609375, "completions/mean_terminated_length": 1662.5555419921875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.22030523419380188, "kl": 0.0006434122721354166, "learning_rate": 2.0528000059645995e-07, "loss": 0.0144, "num_tokens": 5875488.0, "policy_entropy_avg": 8.125, "reward": 0.050709377974271774, "reward_std": 0.7291332483291626, "rewards/cosine_scaled_reward/mean": -0.18285124003887177, "rewards/cosine_scaled_reward/std": 0.3616393804550171, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49822381138801575, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 2728.5, "completions/mean_terminated_length": 846.4000244140625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.3805939257144928, "kl": 0.0007483164469401041, "learning_rate": 1.8594235253127372e-07, "loss": 0.001, "num_tokens": 6014226.0, "policy_entropy_avg": 8.125, "reward": -0.11857573688030243, "reward_std": 0.35950538516044617, "rewards/cosine_scaled_reward/mean": -0.2158358097076416, "rewards/cosine_scaled_reward/std": 0.18257829546928406, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4684174358844757, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 3001.479248046875, "completions/mean_terminated_length": 1253.916748046875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2410728931427002, "kl": 0.0007006327311197916, "learning_rate": 1.6837835672960831e-07, "loss": 0.0025, "num_tokens": 6167009.0, "policy_entropy_avg": 8.125, "reward": 0.13230201601982117, "reward_std": 0.6667492389678955, "rewards/cosine_scaled_reward/mean": -0.05851660296320915, "rewards/cosine_scaled_reward/std": 0.43021252751350403, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4375949800014496, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2645.104248046875, "completions/mean_terminated_length": 1212.0526123046875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2952722907066345, "kl": 0.0007654825846354166, "learning_rate": 1.5267358321348285e-07, "loss": 0.0014, "num_tokens": 6301996.0, "policy_entropy_avg": 8.135416666666666, "reward": 0.43852299451828003, "reward_std": 0.8475234508514404, "rewards/cosine_scaled_reward/mean": 0.0016132990131154656, "rewards/cosine_scaled_reward/std": 0.5085917711257935, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5013279914855957, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8541666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 3466.39599609375, "completions/mean_terminated_length": 2777.571533203125, "completions/min_length": 1678.0, "completions/min_terminated_length": 1678.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.19519105553627014, "kl": 0.0006815592447916666, "learning_rate": 1.3890454406082956e-07, "loss": 0.0045, "num_tokens": 6477125.0, "policy_entropy_avg": 8.125, "reward": 0.16751746833324432, "reward_std": 0.5252600312232971, "rewards/cosine_scaled_reward/mean": -0.030403709039092064, "rewards/cosine_scaled_reward/std": 0.44781333208084106, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.4247443675994873, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7916666666666666, "completions/max_length": 3584.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 3097.77099609375, "completions/mean_terminated_length": 1250.0999755859375, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662387490272522, "kl": 0.0007890065511067709, "learning_rate": 1.2713832064634125e-07, "loss": 0.006, "num_tokens": 6634194.0, "policy_entropy_avg": 8.125, "reward": -0.2356199026107788, "reward_std": 0.4806956648826599, "rewards/cosine_scaled_reward/mean": -0.22256861627101898, "rewards/cosine_scaled_reward/std": 0.2471582442522049, "rewards/format_reward/mean": 0.2083333283662796, "rewards/format_reward/std": 0.41041409969329834, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47916666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3500.0, "completions/mean_length": 2685.25, "completions/mean_terminated_length": 1858.39990234375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.30878785252571106, "kl": 0.0005480448404947916, "learning_rate": 1.1743223682775649e-07, "loss": 0.0002, "num_tokens": 6770886.0, "policy_entropy_avg": 8.125, "reward": 0.628765344619751, "reward_std": 0.8911368250846863, "rewards/cosine_scaled_reward/mean": 0.04512912034988403, "rewards/cosine_scaled_reward/std": 0.5223999619483948, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.503533661365509, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 2819.6875, "completions/mean_terminated_length": 1138.2000732421875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.25418996810913086, "kl": 0.0007025400797526041, "learning_rate": 1.0983357966978745e-07, "loss": 0.0033, "num_tokens": 6914139.0, "policy_entropy_avg": 8.125, "reward": 0.09342099726200104, "reward_std": 0.7818130850791931, "rewards/cosine_scaled_reward/mean": -0.11972144991159439, "rewards/cosine_scaled_reward/std": 0.401507169008255, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.47639307379722595, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47916666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3209.0, "completions/mean_length": 2395.6875, "completions/mean_terminated_length": 1302.43994140625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.2804883122444153, "kl": 0.0006554921468098959, "learning_rate": 1.0437936906629334e-07, "loss": -0.0017, "num_tokens": 7036680.0, "policy_entropy_avg": 8.125, "reward": 0.4345873296260834, "reward_std": 0.7855587005615234, "rewards/cosine_scaled_reward/mean": -0.06286442279815674, "rewards/cosine_scaled_reward/std": 0.4665209949016571, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.5013279914855957, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 2816.8125, "completions/mean_terminated_length": 1282.4375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23478873074054718, "kl": 0.0006268819173177084, "learning_rate": 1.0109617738307911e-07, "loss": -0.0009, "num_tokens": 7179999.0, "policy_entropy_avg": 8.125, "reward": 0.23419660329818726, "reward_std": 0.5556939840316772, "rewards/cosine_scaled_reward/mean": -0.04897995665669441, "rewards/cosine_scaled_reward/std": 0.39337849617004395, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.47639307379722595, "step": 50 }, { "epoch": 0.05714285714285714, "step": 50, "total_flos": 0.0, "train_loss": 0.00044901110231876373, "train_runtime": 4526.0548, "train_samples_per_second": 0.53, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 7179999, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }