|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.05714285714285714, |
|
"eval_steps": 500, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5208333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3128.0, |
|
"completions/mean_length": 2584.104248046875, |
|
"completions/mean_terminated_length": 1497.2608642578125, |
|
"completions/min_length": 557.0, |
|
"completions/min_terminated_length": 557.0, |
|
"epoch": 0.001142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.26198074221611023, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0022, |
|
"num_tokens": 131153.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.3948305547237396, |
|
"reward_std": 0.7732391357421875, |
|
"rewards/cosine_scaled_reward/mean": -0.062009382992982864, |
|
"rewards/cosine_scaled_reward/std": 0.43048128485679626, |
|
"rewards/format_reward/mean": 0.5208333134651184, |
|
"rewards/format_reward/std": 0.504852294921875, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5833333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3280.0, |
|
"completions/mean_length": 2761.666748046875, |
|
"completions/mean_terminated_length": 1610.4000244140625, |
|
"completions/min_length": 465.0, |
|
"completions/min_terminated_length": 465.0, |
|
"epoch": 0.002285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2314005047082901, |
|
"kl": 0.0, |
|
"learning_rate": 2e-07, |
|
"loss": -0.0045, |
|
"num_tokens": 271243.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.4077601432800293, |
|
"reward_std": 0.8425893187522888, |
|
"rewards/cosine_scaled_reward/mean": -0.003428752301260829, |
|
"rewards/cosine_scaled_reward/std": 0.4935320317745209, |
|
"rewards/format_reward/mean": 0.4166666567325592, |
|
"rewards/format_reward/std": 0.49822381138801575, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2945.0, |
|
"completions/mean_length": 3343.33349609375, |
|
"completions/mean_terminated_length": 1658.666748046875, |
|
"completions/min_length": 490.0, |
|
"completions/min_terminated_length": 490.0, |
|
"epoch": 0.0034285714285714284, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19728681445121765, |
|
"kl": 0.0006656646728515625, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0095, |
|
"num_tokens": 439577.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.15455231070518494, |
|
"reward_std": 0.5764515995979309, |
|
"rewards/cosine_scaled_reward/mean": -0.17141447961330414, |
|
"rewards/cosine_scaled_reward/std": 0.32203689217567444, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.3944427967071533, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.39583333333333337, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3458.0, |
|
"completions/mean_length": 2226.89599609375, |
|
"completions/mean_terminated_length": 1337.7586669921875, |
|
"completions/min_length": 407.0, |
|
"completions/min_terminated_length": 407.0, |
|
"epoch": 0.004571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2950705587863922, |
|
"kl": 0.0006043116251627604, |
|
"learning_rate": 6e-07, |
|
"loss": -0.001, |
|
"num_tokens": 553824.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.4680083394050598, |
|
"reward_std": 0.8357078433036804, |
|
"rewards/cosine_scaled_reward/mean": -0.09815327078104019, |
|
"rewards/cosine_scaled_reward/std": 0.399366170167923, |
|
"rewards/format_reward/mean": 0.6666666865348816, |
|
"rewards/format_reward/std": 0.47639307379722595, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7083333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2603.0, |
|
"completions/mean_length": 3089.104248046875, |
|
"completions/mean_terminated_length": 1887.21435546875, |
|
"completions/min_length": 909.0, |
|
"completions/min_terminated_length": 909.0, |
|
"epoch": 0.005714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2482743263244629, |
|
"kl": 0.000629425048828125, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 710213.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.06966459006071091, |
|
"reward_std": 0.7608852386474609, |
|
"rewards/cosine_scaled_reward/mean": -0.20167399942874908, |
|
"rewards/cosine_scaled_reward/std": 0.3204644024372101, |
|
"rewards/format_reward/mean": 0.3333333432674408, |
|
"rewards/format_reward/std": 0.47639307379722595, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7916666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3421.0, |
|
"completions/mean_length": 3119.52099609375, |
|
"completions/mean_terminated_length": 1354.5, |
|
"completions/min_length": 554.0, |
|
"completions/min_terminated_length": 554.0, |
|
"epoch": 0.006857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.24190759658813477, |
|
"kl": 0.0006701151529947916, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 868686.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.026811789721250534, |
|
"reward_std": 0.7506579756736755, |
|
"rewards/cosine_scaled_reward/mean": -0.1427767425775528, |
|
"rewards/cosine_scaled_reward/std": 0.3361252248287201, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.4684174358844757, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5416666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3457.0, |
|
"completions/mean_length": 3024.291748046875, |
|
"completions/mean_terminated_length": 2362.818359375, |
|
"completions/min_length": 839.0, |
|
"completions/min_terminated_length": 839.0, |
|
"epoch": 0.008, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20822857320308685, |
|
"kl": 0.0005512237548828125, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": -0.009, |
|
"num_tokens": 1021658.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.47669005393981934, |
|
"reward_std": 0.9081848859786987, |
|
"rewards/cosine_scaled_reward/mean": -0.031290601938962936, |
|
"rewards/cosine_scaled_reward/std": 0.47983497381210327, |
|
"rewards/format_reward/mean": 0.5416666865348816, |
|
"rewards/format_reward/std": 0.5035336017608643, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6041666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3568.0, |
|
"completions/mean_length": 2791.875, |
|
"completions/mean_terminated_length": 1582.8421630859375, |
|
"completions/min_length": 327.0, |
|
"completions/min_terminated_length": 327.0, |
|
"epoch": 0.009142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23235374689102173, |
|
"kl": 0.0005970001220703125, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": -0.0081, |
|
"num_tokens": 1163480.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.5300650596618652, |
|
"reward_std": 0.7924127578735352, |
|
"rewards/cosine_scaled_reward/mean": 0.03719766065478325, |
|
"rewards/cosine_scaled_reward/std": 0.4377634525299072, |
|
"rewards/format_reward/mean": 0.4583333432674408, |
|
"rewards/format_reward/std": 0.5035336017608643, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7083333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3494.0, |
|
"completions/mean_length": 3142.95849609375, |
|
"completions/mean_terminated_length": 2071.857177734375, |
|
"completions/min_length": 955.0, |
|
"completions/min_terminated_length": 955.0, |
|
"epoch": 0.010285714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21945622563362122, |
|
"kl": 0.0006663004557291666, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.002, |
|
"num_tokens": 1322934.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.09029825031757355, |
|
"reward_std": 0.8250617980957031, |
|
"rewards/cosine_scaled_reward/mean": -0.1421239972114563, |
|
"rewards/cosine_scaled_reward/std": 0.3718816637992859, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48924607038497925, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3440.0, |
|
"completions/mean_length": 2639.791748046875, |
|
"completions/mean_terminated_length": 1066.111083984375, |
|
"completions/min_length": 329.0, |
|
"completions/min_terminated_length": 329.0, |
|
"epoch": 0.011428571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2775964140892029, |
|
"kl": 0.0005779266357421875, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": -0.0111, |
|
"num_tokens": 1457768.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.31791985034942627, |
|
"reward_std": 0.7219366431236267, |
|
"rewards/cosine_scaled_reward/mean": -0.03815798461437225, |
|
"rewards/cosine_scaled_reward/std": 0.4010634124279022, |
|
"rewards/format_reward/mean": 0.3958333432674408, |
|
"rewards/format_reward/std": 0.49420398473739624, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3528.0, |
|
"completions/mean_length": 3260.8125, |
|
"completions/mean_terminated_length": 1860.3333740234375, |
|
"completions/min_length": 855.0, |
|
"completions/min_terminated_length": 855.0, |
|
"epoch": 0.012571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21065565943717957, |
|
"kl": 0.0005486806233723959, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0145, |
|
"num_tokens": 1623041.0, |
|
"policy_entropy_avg": 8.135416666666666, |
|
"reward": -0.1468753218650818, |
|
"reward_std": 0.909512996673584, |
|
"rewards/cosine_scaled_reward/mean": -0.18839001655578613, |
|
"rewards/cosine_scaled_reward/std": 0.377286821603775, |
|
"rewards/format_reward/mean": 0.2291666716337204, |
|
"rewards/format_reward/std": 0.4247443675994873, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.41666666666666663, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3564.0, |
|
"completions/mean_length": 2480.791748046875, |
|
"completions/mean_terminated_length": 1692.7857666015625, |
|
"completions/min_length": 474.0, |
|
"completions/min_terminated_length": 474.0, |
|
"epoch": 0.013714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.33562034368515015, |
|
"kl": 0.0005970001220703125, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0138, |
|
"num_tokens": 1750327.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.425642192363739, |
|
"reward_std": 0.823100745677948, |
|
"rewards/cosine_scaled_reward/mean": -0.08819279819726944, |
|
"rewards/cosine_scaled_reward/std": 0.44234269857406616, |
|
"rewards/format_reward/mean": 0.6041666865348816, |
|
"rewards/format_reward/std": 0.49420398473739624, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5833333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3538.0, |
|
"completions/mean_length": 2816.14599609375, |
|
"completions/mean_terminated_length": 1741.1500244140625, |
|
"completions/min_length": 452.0, |
|
"completions/min_terminated_length": 452.0, |
|
"epoch": 0.014857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.27906712889671326, |
|
"kl": 0.0005658467610677084, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0016, |
|
"num_tokens": 1893782.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.24736499786376953, |
|
"reward_std": 0.7155070304870605, |
|
"rewards/cosine_scaled_reward/mean": -0.09444598108530045, |
|
"rewards/cosine_scaled_reward/std": 0.4492030441761017, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5833333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3369.0, |
|
"completions/mean_length": 2769.0, |
|
"completions/mean_terminated_length": 1628.0, |
|
"completions/min_length": 555.0, |
|
"completions/min_terminated_length": 555.0, |
|
"epoch": 0.016, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.27333828806877136, |
|
"kl": 0.000553131103515625, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0017, |
|
"num_tokens": 2034650.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.1486106812953949, |
|
"reward_std": 0.8035473227500916, |
|
"rewards/cosine_scaled_reward/mean": -0.1336546093225479, |
|
"rewards/cosine_scaled_reward/std": 0.3953794538974762, |
|
"rewards/format_reward/mean": 0.4166666567325592, |
|
"rewards/format_reward/std": 0.49822381138801575, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3186.0, |
|
"completions/mean_length": 2703.08349609375, |
|
"completions/mean_terminated_length": 1234.888916015625, |
|
"completions/min_length": 405.0, |
|
"completions/min_terminated_length": 405.0, |
|
"epoch": 0.017142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2734379768371582, |
|
"kl": 0.0005137125651041666, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": -0.0122, |
|
"num_tokens": 2172588.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.406665563583374, |
|
"reward_std": 0.3276861608028412, |
|
"rewards/cosine_scaled_reward/mean": 0.0168545451015234, |
|
"rewards/cosine_scaled_reward/std": 0.4574853479862213, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48924607038497925, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.9791666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2984.0, |
|
"completions/mean_length": 3571.5, |
|
"completions/mean_terminated_length": 2984.0, |
|
"completions/min_length": 2984.0, |
|
"completions/min_terminated_length": 2984.0, |
|
"epoch": 0.018285714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1932428628206253, |
|
"kl": 0.0006643931070963541, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0075, |
|
"num_tokens": 2351850.0, |
|
"policy_entropy_avg": 8.135416666666666, |
|
"reward": -0.3992506265640259, |
|
"reward_std": 0.5042399168014526, |
|
"rewards/cosine_scaled_reward/mean": -0.22146178781986237, |
|
"rewards/cosine_scaled_reward/std": 0.292772501707077, |
|
"rewards/format_reward/mean": 0.0416666679084301, |
|
"rewards/format_reward/std": 0.20194092392921448, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.39583333333333337, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3475.0, |
|
"completions/mean_length": 2287.416748046875, |
|
"completions/mean_terminated_length": 1437.9310302734375, |
|
"completions/min_length": 364.0, |
|
"completions/min_terminated_length": 364.0, |
|
"epoch": 0.019428571428571427, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.37555888295173645, |
|
"kl": 0.0006338755289713541, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": -0.0011, |
|
"num_tokens": 2469536.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.5107091665267944, |
|
"reward_std": 0.8238445520401001, |
|
"rewards/cosine_scaled_reward/mean": -0.04544559493660927, |
|
"rewards/cosine_scaled_reward/std": 0.45671001076698303, |
|
"rewards/format_reward/mean": 0.6041666865348816, |
|
"rewards/format_reward/std": 0.49420398473739624, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3116.0, |
|
"completions/mean_length": 2911.89599609375, |
|
"completions/mean_terminated_length": 1433.2667236328125, |
|
"completions/min_length": 608.0, |
|
"completions/min_terminated_length": 608.0, |
|
"epoch": 0.02057142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21075770258903503, |
|
"kl": 0.0006434122721354166, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 2617089.0, |
|
"policy_entropy_avg": 8.135416666666666, |
|
"reward": -0.13376453518867493, |
|
"reward_std": 0.6403241157531738, |
|
"rewards/cosine_scaled_reward/mean": -0.2234683483839035, |
|
"rewards/cosine_scaled_reward/std": 0.2743138074874878, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.4684174358844757, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3400.0, |
|
"completions/mean_length": 2844.5, |
|
"completions/mean_terminated_length": 1893.71435546875, |
|
"completions/min_length": 504.0, |
|
"completions/min_terminated_length": 504.0, |
|
"epoch": 0.021714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.24317172169685364, |
|
"kl": 0.0006122589111328125, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": -0.0105, |
|
"num_tokens": 2762067.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.7856847643852234, |
|
"reward_std": 0.5978894829750061, |
|
"rewards/cosine_scaled_reward/mean": 0.15523308515548706, |
|
"rewards/cosine_scaled_reward/std": 0.5373290181159973, |
|
"rewards/format_reward/mean": 0.4791666567325592, |
|
"rewards/format_reward/std": 0.5048523545265198, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.47916666666666663, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3494.0, |
|
"completions/mean_length": 2482.83349609375, |
|
"completions/mean_terminated_length": 1469.760009765625, |
|
"completions/min_length": 408.0, |
|
"completions/min_terminated_length": 408.0, |
|
"epoch": 0.022857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.26188376545906067, |
|
"kl": 0.0005286534627278646, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": -0.0042, |
|
"num_tokens": 2889757.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.5357545614242554, |
|
"reward_std": 0.7750095129013062, |
|
"rewards/cosine_scaled_reward/mean": -0.03285994753241539, |
|
"rewards/cosine_scaled_reward/std": 0.4009867310523987, |
|
"rewards/format_reward/mean": 0.6041666865348816, |
|
"rewards/format_reward/std": 0.49420398473739624, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6041666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2710.0, |
|
"completions/mean_length": 2631.70849609375, |
|
"completions/mean_terminated_length": 1178.2105712890625, |
|
"completions/min_length": 342.0, |
|
"completions/min_terminated_length": 342.0, |
|
"epoch": 0.024, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.31193122267723083, |
|
"kl": 0.0006847381591796875, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0002, |
|
"num_tokens": 3024185.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.18238481879234314, |
|
"reward_std": 0.4078831374645233, |
|
"rewards/cosine_scaled_reward/mean": -0.11668267101049423, |
|
"rewards/cosine_scaled_reward/std": 0.3962862193584442, |
|
"rewards/format_reward/mean": 0.4166666567325592, |
|
"rewards/format_reward/std": 0.49822381138801575, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.27083333333333337, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3239.0, |
|
"completions/mean_length": 1697.2083740234375, |
|
"completions/mean_terminated_length": 996.4000244140625, |
|
"completions/min_length": 250.0, |
|
"completions/min_terminated_length": 250.0, |
|
"epoch": 0.025142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.40179967880249023, |
|
"kl": 0.0006052652994791666, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": -0.005, |
|
"num_tokens": 3112413.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.6634411811828613, |
|
"reward_std": 0.5782728791236877, |
|
"rewards/cosine_scaled_reward/mean": -0.06244581937789917, |
|
"rewards/cosine_scaled_reward/std": 0.4282727539539337, |
|
"rewards/format_reward/mean": 0.7916666865348816, |
|
"rewards/format_reward/std": 0.41041406989097595, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.39583333333333337, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3421.0, |
|
"completions/mean_length": 2181.104248046875, |
|
"completions/mean_terminated_length": 1261.9654541015625, |
|
"completions/min_length": 595.0, |
|
"completions/min_terminated_length": 595.0, |
|
"epoch": 0.026285714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.3063512444496155, |
|
"kl": 0.0006097157796223959, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0002, |
|
"num_tokens": 3225200.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.3287263512611389, |
|
"reward_std": 0.8908068537712097, |
|
"rewards/cosine_scaled_reward/mean": -0.14731089770793915, |
|
"rewards/cosine_scaled_reward/std": 0.42148637771606445, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48924607038497925, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5416666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3322.0, |
|
"completions/mean_length": 2681.229248046875, |
|
"completions/mean_terminated_length": 1614.3182373046875, |
|
"completions/min_length": 461.0, |
|
"completions/min_terminated_length": 461.0, |
|
"epoch": 0.027428571428571427, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.26260870695114136, |
|
"kl": 0.0006230672200520834, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.001, |
|
"num_tokens": 3362095.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.3754323720932007, |
|
"reward_std": 0.7894452810287476, |
|
"rewards/cosine_scaled_reward/mean": -0.061340540647506714, |
|
"rewards/cosine_scaled_reward/std": 0.4359513223171234, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5052911639213562, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3461.0, |
|
"completions/mean_length": 2590.166748046875, |
|
"completions/mean_terminated_length": 1312.3809814453125, |
|
"completions/min_length": 532.0, |
|
"completions/min_terminated_length": 532.0, |
|
"epoch": 0.02857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23011414706707, |
|
"kl": 0.0007470448811848959, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 3494145.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.3740345239639282, |
|
"reward_std": 0.7695837020874023, |
|
"rewards/cosine_scaled_reward/mean": -0.03079296462237835, |
|
"rewards/cosine_scaled_reward/std": 0.44012707471847534, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3395.0, |
|
"completions/mean_length": 2929.666748046875, |
|
"completions/mean_terminated_length": 2088.381103515625, |
|
"completions/min_length": 879.0, |
|
"completions/min_terminated_length": 879.0, |
|
"epoch": 0.029714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23103339970111847, |
|
"kl": 0.0006039937337239584, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0037, |
|
"num_tokens": 3642743.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.2619829773902893, |
|
"reward_std": 0.6574144959449768, |
|
"rewards/cosine_scaled_reward/mean": -0.10793358087539673, |
|
"rewards/cosine_scaled_reward/std": 0.4338338077068329, |
|
"rewards/format_reward/mean": 0.4791666567325592, |
|
"rewards/format_reward/std": 0.5048523545265198, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6666666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3288.0, |
|
"completions/mean_length": 2908.20849609375, |
|
"completions/mean_terminated_length": 1556.625, |
|
"completions/min_length": 518.0, |
|
"completions/min_terminated_length": 518.0, |
|
"epoch": 0.030857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2881726324558258, |
|
"kl": 0.000667572021484375, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": -0.0, |
|
"num_tokens": 3790053.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.041869934648275375, |
|
"reward_std": 0.7798717021942139, |
|
"rewards/cosine_scaled_reward/mean": -0.1560431718826294, |
|
"rewards/cosine_scaled_reward/std": 0.29862359166145325, |
|
"rewards/format_reward/mean": 0.3541666567325592, |
|
"rewards/format_reward/std": 0.4833211302757263, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6041666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2977.0, |
|
"completions/mean_length": 2831.52099609375, |
|
"completions/mean_terminated_length": 1683.0, |
|
"completions/min_length": 509.0, |
|
"completions/min_terminated_length": 509.0, |
|
"epoch": 0.032, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.24444623291492462, |
|
"kl": 0.0005861918131510416, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0062, |
|
"num_tokens": 3933718.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.4484003484249115, |
|
"reward_std": 0.8719537258148193, |
|
"rewards/cosine_scaled_reward/mean": 0.006576786283403635, |
|
"rewards/cosine_scaled_reward/std": 0.4855944514274597, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7916666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3091.0, |
|
"completions/mean_length": 3182.02099609375, |
|
"completions/mean_terminated_length": 1654.5, |
|
"completions/min_length": 418.0, |
|
"completions/min_terminated_length": 418.0, |
|
"epoch": 0.03314285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22512345016002655, |
|
"kl": 0.0006993611653645834, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0099, |
|
"num_tokens": 4094309.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.1955069899559021, |
|
"reward_std": 0.668115496635437, |
|
"rewards/cosine_scaled_reward/mean": -0.21282805502414703, |
|
"rewards/cosine_scaled_reward/std": 0.37752190232276917, |
|
"rewards/format_reward/mean": 0.2291666716337204, |
|
"rewards/format_reward/std": 0.4247443675994873, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3320.0, |
|
"completions/mean_length": 2794.666748046875, |
|
"completions/mean_terminated_length": 1779.8095703125, |
|
"completions/min_length": 667.0, |
|
"completions/min_terminated_length": 667.0, |
|
"epoch": 0.03428571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22448024153709412, |
|
"kl": 0.0006434122721354166, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 4236355.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.38854461908340454, |
|
"reward_std": 0.8984581828117371, |
|
"rewards/cosine_scaled_reward/mean": -0.0443347692489624, |
|
"rewards/cosine_scaled_reward/std": 0.44917213916778564, |
|
"rewards/format_reward/mean": 0.4791666567325592, |
|
"rewards/format_reward/std": 0.5048523545265198, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7708333333333334, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3570.0, |
|
"completions/mean_length": 3039.39599609375, |
|
"completions/mean_terminated_length": 1207.5455322265625, |
|
"completions/min_length": 281.0, |
|
"completions/min_terminated_length": 281.0, |
|
"epoch": 0.03542857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22645282745361328, |
|
"kl": 0.0006421407063802084, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.009, |
|
"num_tokens": 4390118.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.09765049070119858, |
|
"reward_std": 0.6962664127349854, |
|
"rewards/cosine_scaled_reward/mean": -0.1740705966949463, |
|
"rewards/cosine_scaled_reward/std": 0.4055609405040741, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4375949800014496, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5833333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3517.0, |
|
"completions/mean_length": 3097.125, |
|
"completions/mean_terminated_length": 2415.5, |
|
"completions/min_length": 1046.0, |
|
"completions/min_terminated_length": 1046.0, |
|
"epoch": 0.036571428571428574, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19842347502708435, |
|
"kl": 0.000629425048828125, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": -0.0137, |
|
"num_tokens": 4546544.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.6700344085693359, |
|
"reward_std": 0.7424625158309937, |
|
"rewards/cosine_scaled_reward/mean": 0.10753399133682251, |
|
"rewards/cosine_scaled_reward/std": 0.5346410274505615, |
|
"rewards/format_reward/mean": 0.4583333432674408, |
|
"rewards/format_reward/std": 0.5035336017608643, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3484.0, |
|
"completions/mean_length": 3236.604248046875, |
|
"completions/mean_terminated_length": 2194.416748046875, |
|
"completions/min_length": 1039.0, |
|
"completions/min_terminated_length": 1039.0, |
|
"epoch": 0.037714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19302453100681305, |
|
"kl": 0.0005734761555989584, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0048, |
|
"num_tokens": 4710313.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.12012770771980286, |
|
"reward_std": 0.6003495454788208, |
|
"rewards/cosine_scaled_reward/mean": -0.1957823485136032, |
|
"rewards/cosine_scaled_reward/std": 0.28730008006095886, |
|
"rewards/format_reward/mean": 0.2708333432674408, |
|
"rewards/format_reward/std": 0.4490928649902344, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.45833333333333337, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3326.0, |
|
"completions/mean_length": 2303.45849609375, |
|
"completions/mean_terminated_length": 1219.923095703125, |
|
"completions/min_length": 430.0, |
|
"completions/min_terminated_length": 430.0, |
|
"epoch": 0.038857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.273879736661911, |
|
"kl": 0.0007279713948567709, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 4828043.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.5595396757125854, |
|
"reward_std": 0.9288837313652039, |
|
"rewards/cosine_scaled_reward/mean": -7.428725803038105e-05, |
|
"rewards/cosine_scaled_reward/std": 0.5000401139259338, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7291666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3498.0, |
|
"completions/mean_length": 3022.52099609375, |
|
"completions/mean_terminated_length": 1510.84619140625, |
|
"completions/min_length": 417.0, |
|
"completions/min_terminated_length": 417.0, |
|
"epoch": 0.04, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.25011301040649414, |
|
"kl": 0.0006783803304036459, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0054, |
|
"num_tokens": 4981746.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.1530809849500656, |
|
"reward_std": 0.9632071256637573, |
|
"rewards/cosine_scaled_reward/mean": -0.07932490855455399, |
|
"rewards/cosine_scaled_reward/std": 0.4641749858856201, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.4684174358844757, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3483.0, |
|
"completions/mean_length": 3256.479248046875, |
|
"completions/mean_terminated_length": 1837.2222900390625, |
|
"completions/min_length": 1007.0, |
|
"completions/min_terminated_length": 1007.0, |
|
"epoch": 0.04114285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21851785480976105, |
|
"kl": 0.0007712046305338541, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0035, |
|
"num_tokens": 5146391.0, |
|
"policy_entropy_avg": 8.135416666666666, |
|
"reward": -0.2397136390209198, |
|
"reward_std": 0.3913343846797943, |
|
"rewards/cosine_scaled_reward/mean": -0.23504245281219482, |
|
"rewards/cosine_scaled_reward/std": 0.17867261171340942, |
|
"rewards/format_reward/mean": 0.2291666716337204, |
|
"rewards/format_reward/std": 0.4247443675994873, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7708333333333334, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3151.0, |
|
"completions/mean_length": 3157.70849609375, |
|
"completions/mean_terminated_length": 1723.8182373046875, |
|
"completions/min_length": 749.0, |
|
"completions/min_terminated_length": 749.0, |
|
"epoch": 0.04228571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2363603562116623, |
|
"kl": 0.0006186167399088541, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0063, |
|
"num_tokens": 5306229.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.23191750049591064, |
|
"reward_std": 0.505261242389679, |
|
"rewards/cosine_scaled_reward/mean": -0.24154144525527954, |
|
"rewards/cosine_scaled_reward/std": 0.23630201816558838, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4375949800014496, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7916666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2515.0, |
|
"completions/mean_length": 3111.854248046875, |
|
"completions/mean_terminated_length": 1317.7000732421875, |
|
"completions/min_length": 679.0, |
|
"completions/min_terminated_length": 679.0, |
|
"epoch": 0.04342857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19270718097686768, |
|
"kl": 0.0006771087646484375, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": -0.0004, |
|
"num_tokens": 5464382.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.06942842155694962, |
|
"reward_std": 0.4463602900505066, |
|
"rewards/cosine_scaled_reward/mean": -0.07969469577074051, |
|
"rewards/cosine_scaled_reward/std": 0.3691597282886505, |
|
"rewards/format_reward/mean": 0.2291666716337204, |
|
"rewards/format_reward/std": 0.4247443675994873, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5833333333333333, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3387.0, |
|
"completions/mean_length": 2799.166748046875, |
|
"completions/mean_terminated_length": 1700.4000244140625, |
|
"completions/min_length": 262.0, |
|
"completions/min_terminated_length": 262.0, |
|
"epoch": 0.044571428571428574, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2799771726131439, |
|
"kl": 0.0005861918131510416, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": -0.0356, |
|
"num_tokens": 5606830.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.41357025504112244, |
|
"reward_std": 0.3624642491340637, |
|
"rewards/cosine_scaled_reward/mean": -0.04217575863003731, |
|
"rewards/cosine_scaled_reward/std": 0.4245593845844269, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5052911639213562, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5416666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2945.0, |
|
"completions/mean_length": 2401.95849609375, |
|
"completions/mean_terminated_length": 1005.0, |
|
"completions/min_length": 494.0, |
|
"completions/min_terminated_length": 494.0, |
|
"epoch": 0.045714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2853446900844574, |
|
"kl": 0.000675201416015625, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": -0.0033, |
|
"num_tokens": 5729678.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.2566830515861511, |
|
"reward_std": 0.573390781879425, |
|
"rewards/cosine_scaled_reward/mean": -0.11059689521789551, |
|
"rewards/cosine_scaled_reward/std": 0.43331247568130493, |
|
"rewards/format_reward/mean": 0.4791666567325592, |
|
"rewards/format_reward/std": 0.5048523545265198, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3370.0, |
|
"completions/mean_length": 2863.45849609375, |
|
"completions/mean_terminated_length": 1662.5555419921875, |
|
"completions/min_length": 762.0, |
|
"completions/min_terminated_length": 762.0, |
|
"epoch": 0.046857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22030523419380188, |
|
"kl": 0.0006434122721354166, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0144, |
|
"num_tokens": 5875488.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.050709377974271774, |
|
"reward_std": 0.7291332483291626, |
|
"rewards/cosine_scaled_reward/mean": -0.18285124003887177, |
|
"rewards/cosine_scaled_reward/std": 0.3616393804550171, |
|
"rewards/format_reward/mean": 0.4166666567325592, |
|
"rewards/format_reward/std": 0.49822381138801575, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2370.0, |
|
"completions/mean_length": 2728.5, |
|
"completions/mean_terminated_length": 846.4000244140625, |
|
"completions/min_length": 207.0, |
|
"completions/min_terminated_length": 207.0, |
|
"epoch": 0.048, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.3805939257144928, |
|
"kl": 0.0007483164469401041, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.001, |
|
"num_tokens": 6014226.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.11857573688030243, |
|
"reward_std": 0.35950538516044617, |
|
"rewards/cosine_scaled_reward/mean": -0.2158358097076416, |
|
"rewards/cosine_scaled_reward/std": 0.18257829546928406, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.4684174358844757, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2668.0, |
|
"completions/mean_length": 3001.479248046875, |
|
"completions/mean_terminated_length": 1253.916748046875, |
|
"completions/min_length": 529.0, |
|
"completions/min_terminated_length": 529.0, |
|
"epoch": 0.04914285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2410728931427002, |
|
"kl": 0.0007006327311197916, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 6167009.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.13230201601982117, |
|
"reward_std": 0.6667492389678955, |
|
"rewards/cosine_scaled_reward/mean": -0.05851660296320915, |
|
"rewards/cosine_scaled_reward/std": 0.43021252751350403, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4375949800014496, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6041666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3552.0, |
|
"completions/mean_length": 2645.104248046875, |
|
"completions/mean_terminated_length": 1212.0526123046875, |
|
"completions/min_length": 395.0, |
|
"completions/min_terminated_length": 395.0, |
|
"epoch": 0.05028571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2952722907066345, |
|
"kl": 0.0007654825846354166, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0014, |
|
"num_tokens": 6301996.0, |
|
"policy_entropy_avg": 8.135416666666666, |
|
"reward": 0.43852299451828003, |
|
"reward_std": 0.8475234508514404, |
|
"rewards/cosine_scaled_reward/mean": 0.0016132990131154656, |
|
"rewards/cosine_scaled_reward/std": 0.5085917711257935, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.8541666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3505.0, |
|
"completions/mean_length": 3466.39599609375, |
|
"completions/mean_terminated_length": 2777.571533203125, |
|
"completions/min_length": 1678.0, |
|
"completions/min_terminated_length": 1678.0, |
|
"epoch": 0.05142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19519105553627014, |
|
"kl": 0.0006815592447916666, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0045, |
|
"num_tokens": 6477125.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.16751746833324432, |
|
"reward_std": 0.5252600312232971, |
|
"rewards/cosine_scaled_reward/mean": -0.030403709039092064, |
|
"rewards/cosine_scaled_reward/std": 0.44781333208084106, |
|
"rewards/format_reward/mean": 0.2291666716337204, |
|
"rewards/format_reward/std": 0.4247443675994873, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.7916666666666666, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3388.0, |
|
"completions/mean_length": 3097.77099609375, |
|
"completions/mean_terminated_length": 1250.0999755859375, |
|
"completions/min_length": 605.0, |
|
"completions/min_terminated_length": 605.0, |
|
"epoch": 0.052571428571428575, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2662387490272522, |
|
"kl": 0.0007890065511067709, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.006, |
|
"num_tokens": 6634194.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": -0.2356199026107788, |
|
"reward_std": 0.4806956648826599, |
|
"rewards/cosine_scaled_reward/mean": -0.22256861627101898, |
|
"rewards/cosine_scaled_reward/std": 0.2471582442522049, |
|
"rewards/format_reward/mean": 0.2083333283662796, |
|
"rewards/format_reward/std": 0.41041409969329834, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.47916666666666663, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3500.0, |
|
"completions/mean_length": 2685.25, |
|
"completions/mean_terminated_length": 1858.39990234375, |
|
"completions/min_length": 431.0, |
|
"completions/min_terminated_length": 431.0, |
|
"epoch": 0.053714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.30878785252571106, |
|
"kl": 0.0005480448404947916, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0002, |
|
"num_tokens": 6770886.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.628765344619751, |
|
"reward_std": 0.8911368250846863, |
|
"rewards/cosine_scaled_reward/mean": 0.04512912034988403, |
|
"rewards/cosine_scaled_reward/std": 0.5223999619483948, |
|
"rewards/format_reward/mean": 0.5416666865348816, |
|
"rewards/format_reward/std": 0.503533661365509, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2532.0, |
|
"completions/mean_length": 2819.6875, |
|
"completions/mean_terminated_length": 1138.2000732421875, |
|
"completions/min_length": 705.0, |
|
"completions/min_terminated_length": 705.0, |
|
"epoch": 0.054857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.25418996810913086, |
|
"kl": 0.0007025400797526041, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 6914139.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.09342099726200104, |
|
"reward_std": 0.7818130850791931, |
|
"rewards/cosine_scaled_reward/mean": -0.11972144991159439, |
|
"rewards/cosine_scaled_reward/std": 0.401507169008255, |
|
"rewards/format_reward/mean": 0.3333333432674408, |
|
"rewards/format_reward/std": 0.47639307379722595, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.47916666666666663, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 3209.0, |
|
"completions/mean_length": 2395.6875, |
|
"completions/mean_terminated_length": 1302.43994140625, |
|
"completions/min_length": 326.0, |
|
"completions/min_terminated_length": 326.0, |
|
"epoch": 0.056, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2804883122444153, |
|
"kl": 0.0006554921468098959, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": -0.0017, |
|
"num_tokens": 7036680.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.4345873296260834, |
|
"reward_std": 0.7855587005615234, |
|
"rewards/cosine_scaled_reward/mean": -0.06286442279815674, |
|
"rewards/cosine_scaled_reward/std": 0.4665209949016571, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5013279914855957, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6666666666666667, |
|
"completions/max_length": 3584.0, |
|
"completions/max_terminated_length": 2765.0, |
|
"completions/mean_length": 2816.8125, |
|
"completions/mean_terminated_length": 1282.4375, |
|
"completions/min_length": 370.0, |
|
"completions/min_terminated_length": 370.0, |
|
"epoch": 0.05714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23478873074054718, |
|
"kl": 0.0006268819173177084, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": -0.0009, |
|
"num_tokens": 7179999.0, |
|
"policy_entropy_avg": 8.125, |
|
"reward": 0.23419660329818726, |
|
"reward_std": 0.5556939840316772, |
|
"rewards/cosine_scaled_reward/mean": -0.04897995665669441, |
|
"rewards/cosine_scaled_reward/std": 0.39337849617004395, |
|
"rewards/format_reward/mean": 0.3333333432674408, |
|
"rewards/format_reward/std": 0.47639307379722595, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05714285714285714, |
|
"step": 50, |
|
"total_flos": 0.0, |
|
"train_loss": 0.00044901110231876373, |
|
"train_runtime": 4526.0548, |
|
"train_samples_per_second": 0.53, |
|
"train_steps_per_second": 0.011 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 7179999, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|