|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.11428571428571428, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1734.0, |
|
"completions/mean_length": 1702.03125, |
|
"completions/mean_terminated_length": 993.6190795898438, |
|
"completions/min_length": 483.0, |
|
"completions/min_terminated_length": 483.0, |
|
"epoch": 0.001142857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2005368024110794, |
|
"learning_rate": 0.0, |
|
"loss": 0.0427, |
|
"num_tokens": 118418.0, |
|
"reward": 0.17899775505065918, |
|
"reward_std": 0.7650213241577148, |
|
"rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
|
"rewards/cosine_scaled_reward/std": 0.37953105568885803, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1894.0, |
|
"completions/mean_length": 1738.90625, |
|
"completions/mean_terminated_length": 949.0, |
|
"completions/min_length": 435.0, |
|
"completions/min_terminated_length": 435.0, |
|
"epoch": 0.002285714285714286, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19502800703048706, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0561, |
|
"num_tokens": 239748.0, |
|
"reward": 0.3848632574081421, |
|
"reward_std": 0.9111153483390808, |
|
"rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
|
"rewards/cosine_scaled_reward/std": 0.4492928683757782, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.9375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 861.0, |
|
"completions/mean_length": 1963.75, |
|
"completions/mean_terminated_length": 700.0, |
|
"completions/min_length": 498.0, |
|
"completions/min_terminated_length": 498.0, |
|
"epoch": 0.0034285714285714284, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2347632497549057, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0465, |
|
"num_tokens": 375900.0, |
|
"reward": -0.33020514249801636, |
|
"reward_std": 0.3351452350616455, |
|
"rewards/cosine_scaled_reward/mean": -0.1963525414466858, |
|
"rewards/cosine_scaled_reward/std": 0.16515092551708221, |
|
"rewards/format_reward/mean": 0.0625, |
|
"rewards/format_reward/std": 0.24397502839565277, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1923.0, |
|
"completions/mean_length": 1518.109375, |
|
"completions/mean_terminated_length": 988.21875, |
|
"completions/min_length": 447.0, |
|
"completions/min_terminated_length": 447.0, |
|
"epoch": 0.004571428571428572, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21861855685710907, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0509, |
|
"num_tokens": 482867.0, |
|
"reward": 0.2307693362236023, |
|
"reward_std": 0.7756893038749695, |
|
"rewards/cosine_scaled_reward/mean": -0.15024033188819885, |
|
"rewards/cosine_scaled_reward/std": 0.32144343852996826, |
|
"rewards/format_reward/mean": 0.53125, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1849.0, |
|
"completions/mean_length": 1964.09375, |
|
"completions/mean_terminated_length": 1280.857177734375, |
|
"completions/min_length": 531.0, |
|
"completions/min_terminated_length": 531.0, |
|
"epoch": 0.005714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23443199694156647, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0273, |
|
"num_tokens": 619385.0, |
|
"reward": -0.36384251713752747, |
|
"reward_std": 0.4326132535934448, |
|
"rewards/cosine_scaled_reward/mean": -0.24442125856876373, |
|
"rewards/cosine_scaled_reward/std": 0.22642402350902557, |
|
"rewards/format_reward/mean": 0.125, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1733.0, |
|
"completions/mean_length": 1865.78125, |
|
"completions/mean_terminated_length": 881.7999877929688, |
|
"completions/min_length": 520.0, |
|
"completions/min_terminated_length": 520.0, |
|
"epoch": 0.006857142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2527252733707428, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0473, |
|
"num_tokens": 750443.0, |
|
"reward": -0.36761316657066345, |
|
"reward_std": 0.4643300175666809, |
|
"rewards/cosine_scaled_reward/mean": -0.2697440981864929, |
|
"rewards/cosine_scaled_reward/std": 0.1977701485157013, |
|
"rewards/format_reward/mean": 0.171875, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2000.0, |
|
"completions/mean_length": 1941.34375, |
|
"completions/mean_terminated_length": 1365.4000244140625, |
|
"completions/min_length": 607.0, |
|
"completions/min_terminated_length": 607.0, |
|
"epoch": 0.008, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21566396951675415, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0344, |
|
"num_tokens": 885097.0, |
|
"reward": -0.08318325877189636, |
|
"reward_std": 0.5441455841064453, |
|
"rewards/cosine_scaled_reward/mean": -0.150966614484787, |
|
"rewards/cosine_scaled_reward/std": 0.3548375070095062, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.25, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1943.0, |
|
"completions/mean_length": 1715.359375, |
|
"completions/mean_terminated_length": 717.4375, |
|
"completions/min_length": 311.0, |
|
"completions/min_terminated_length": 311.0, |
|
"epoch": 0.009142857142857144, |
|
"epsilon_high_adjusted": 0.25, |
|
"epsilon_low_adjusted": 0.25, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19708961248397827, |
|
"learning_rate": 7e-07, |
|
"loss": -0.0031, |
|
"num_tokens": 1005296.0, |
|
"reward": 0.1628682017326355, |
|
"reward_std": 0.6152325868606567, |
|
"rewards/cosine_scaled_reward/mean": -0.09044088423252106, |
|
"rewards/cosine_scaled_reward/std": 0.45745164155960083, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1989.0, |
|
"completions/mean_length": 1971.171875, |
|
"completions/mean_terminated_length": 1433.375, |
|
"completions/min_length": 578.0, |
|
"completions/min_terminated_length": 578.0, |
|
"epoch": 0.010285714285714285, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2231767773628235, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0363, |
|
"num_tokens": 1142907.0, |
|
"reward": -0.22906573116779327, |
|
"reward_std": 0.5889308452606201, |
|
"rewards/cosine_scaled_reward/mean": -0.20828285813331604, |
|
"rewards/cosine_scaled_reward/std": 0.2633083164691925, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1642.0, |
|
"completions/mean_length": 1645.703125, |
|
"completions/mean_terminated_length": 877.6818237304688, |
|
"completions/min_length": 316.0, |
|
"completions/min_terminated_length": 316.0, |
|
"epoch": 0.011428571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19458407163619995, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0565, |
|
"num_tokens": 1259064.0, |
|
"reward": 0.11773137003183365, |
|
"reward_std": 0.6405072212219238, |
|
"rewards/cosine_scaled_reward/mean": -0.12082181870937347, |
|
"rewards/cosine_scaled_reward/std": 0.33084097504615784, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1271.0, |
|
"completions/mean_length": 1966.6875, |
|
"completions/mean_terminated_length": 1007.2000122070312, |
|
"completions/min_length": 789.0, |
|
"completions/min_terminated_length": 789.0, |
|
"epoch": 0.012571428571428572, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2074805498123169, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0544, |
|
"num_tokens": 1396604.0, |
|
"reward": -0.38875678181648254, |
|
"reward_std": 0.4678027033805847, |
|
"rewards/cosine_scaled_reward/mean": -0.24906589090824127, |
|
"rewards/cosine_scaled_reward/std": 0.22343340516090393, |
|
"rewards/format_reward/mean": 0.109375, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1913.0, |
|
"completions/mean_length": 1694.546875, |
|
"completions/mean_terminated_length": 1105.4583740234375, |
|
"completions/min_length": 300.0, |
|
"completions/min_terminated_length": 300.0, |
|
"epoch": 0.013714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21040339767932892, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0575, |
|
"num_tokens": 1515999.0, |
|
"reward": 0.16435137391090393, |
|
"reward_std": 0.7284502983093262, |
|
"rewards/cosine_scaled_reward/mean": -0.13657432794570923, |
|
"rewards/cosine_scaled_reward/std": 0.40020695328712463, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0625, |
|
"completions/clipped_ratio": 0.671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1980.0, |
|
"completions/mean_length": 1735.546875, |
|
"completions/mean_terminated_length": 1095.761962890625, |
|
"completions/min_length": 591.0, |
|
"completions/min_terminated_length": 591.0, |
|
"epoch": 0.014857142857142857, |
|
"epsilon_high_adjusted": 0.22187500000000002, |
|
"epsilon_low_adjusted": 0.22187500000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2161007821559906, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0381, |
|
"num_tokens": 1638114.0, |
|
"reward": 0.03836393356323242, |
|
"reward_std": 0.6106836199760437, |
|
"rewards/cosine_scaled_reward/mean": -0.1448805332183838, |
|
"rewards/cosine_scaled_reward/std": 0.3520916700363159, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1844.0, |
|
"completions/mean_length": 1819.390625, |
|
"completions/mean_terminated_length": 1072.60009765625, |
|
"completions/min_length": 482.0, |
|
"completions/min_terminated_length": 482.0, |
|
"epoch": 0.016, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18751561641693115, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0728, |
|
"num_tokens": 1765163.0, |
|
"reward": -0.1004548892378807, |
|
"reward_std": 0.7881962060928345, |
|
"rewards/cosine_scaled_reward/mean": -0.17522744834423065, |
|
"rewards/cosine_scaled_reward/std": 0.3718147575855255, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.734375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1960.0, |
|
"completions/mean_length": 1727.703125, |
|
"completions/mean_terminated_length": 842.1764526367188, |
|
"completions/min_length": 406.0, |
|
"completions/min_terminated_length": 406.0, |
|
"epoch": 0.017142857142857144, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20928962528705597, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0091, |
|
"num_tokens": 1886656.0, |
|
"reward": 0.2893116772174835, |
|
"reward_std": 0.44170767068862915, |
|
"rewards/cosine_scaled_reward/mean": -0.0037816911935806274, |
|
"rewards/cosine_scaled_reward/std": 0.493231862783432, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 2048.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 2048.0, |
|
"completions/min_terminated_length": 0.0, |
|
"epoch": 0.018285714285714287, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23164308071136475, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": -0.0, |
|
"num_tokens": 2028168.0, |
|
"reward": -0.4323144555091858, |
|
"reward_std": 0.27591273188591003, |
|
"rewards/cosine_scaled_reward/mean": -0.2161572128534317, |
|
"rewards/cosine_scaled_reward/std": 0.16956526041030884, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.03125, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1988.0, |
|
"completions/mean_length": 1624.109375, |
|
"completions/mean_terminated_length": 917.625, |
|
"completions/min_length": 406.0, |
|
"completions/min_terminated_length": 406.0, |
|
"epoch": 0.019428571428571427, |
|
"epsilon_high_adjusted": 0.2109375, |
|
"epsilon_low_adjusted": 0.2109375, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22363872826099396, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0656, |
|
"num_tokens": 2142631.0, |
|
"reward": -0.03016360104084015, |
|
"reward_std": 0.6786063313484192, |
|
"rewards/cosine_scaled_reward/mean": -0.20258180797100067, |
|
"rewards/cosine_scaled_reward/std": 0.34620094299316406, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.125, |
|
"completions/clipped_ratio": 0.734375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1899.0, |
|
"completions/mean_length": 1762.875, |
|
"completions/mean_terminated_length": 974.5882568359375, |
|
"completions/min_length": 462.0, |
|
"completions/min_terminated_length": 462.0, |
|
"epoch": 0.02057142857142857, |
|
"epsilon_high_adjusted": 0.23750000000000002, |
|
"epsilon_low_adjusted": 0.23750000000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19568873941898346, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.015, |
|
"num_tokens": 2265831.0, |
|
"reward": -0.024384755641222, |
|
"reward_std": 0.760321855545044, |
|
"rewards/cosine_scaled_reward/mean": -0.16844238340854645, |
|
"rewards/cosine_scaled_reward/std": 0.35202282667160034, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.734375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1934.0, |
|
"completions/mean_length": 1800.796875, |
|
"completions/mean_terminated_length": 1117.3529052734375, |
|
"completions/min_length": 510.0, |
|
"completions/min_terminated_length": 510.0, |
|
"epoch": 0.021714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20510442554950714, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0322, |
|
"num_tokens": 2392338.0, |
|
"reward": 0.2440589964389801, |
|
"reward_std": 0.7532124519348145, |
|
"rewards/cosine_scaled_reward/mean": -0.03422052040696144, |
|
"rewards/cosine_scaled_reward/std": 0.49625054001808167, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.25, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2002.0, |
|
"completions/mean_length": 1650.203125, |
|
"completions/mean_terminated_length": 987.2083740234375, |
|
"completions/min_length": 392.0, |
|
"completions/min_terminated_length": 392.0, |
|
"epoch": 0.022857142857142857, |
|
"epsilon_high_adjusted": 0.25, |
|
"epsilon_low_adjusted": 0.25, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.179988294839859, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": -0.0019, |
|
"num_tokens": 2509303.0, |
|
"reward": 0.29894062876701355, |
|
"reward_std": 0.6712355613708496, |
|
"rewards/cosine_scaled_reward/mean": -0.06927968561649323, |
|
"rewards/cosine_scaled_reward/std": 0.3939419388771057, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1751.0, |
|
"completions/mean_length": 1687.578125, |
|
"completions/mean_terminated_length": 1045.0870361328125, |
|
"completions/min_length": 312.0, |
|
"completions/min_terminated_length": 312.0, |
|
"epoch": 0.024, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2064765989780426, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0494, |
|
"num_tokens": 2628116.0, |
|
"reward": -0.134785458445549, |
|
"reward_std": 0.6005781888961792, |
|
"rewards/cosine_scaled_reward/mean": -0.2392677217721939, |
|
"rewards/cosine_scaled_reward/std": 0.2986987829208374, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.40625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1695.0, |
|
"completions/mean_length": 1251.390625, |
|
"completions/mean_terminated_length": 706.3421020507812, |
|
"completions/min_length": 231.0, |
|
"completions/min_terminated_length": 231.0, |
|
"epoch": 0.025142857142857144, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20877699553966522, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0521, |
|
"num_tokens": 2717221.0, |
|
"reward": 0.46772587299346924, |
|
"reward_std": 0.5018335580825806, |
|
"rewards/cosine_scaled_reward/mean": -0.07863706350326538, |
|
"rewards/cosine_scaled_reward/std": 0.3792650103569031, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.03125, |
|
"completions/clipped_ratio": 0.53125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1967.0, |
|
"completions/mean_length": 1559.6875, |
|
"completions/mean_terminated_length": 1006.2667236328125, |
|
"completions/min_length": 362.0, |
|
"completions/min_terminated_length": 362.0, |
|
"epoch": 0.026285714285714287, |
|
"epsilon_high_adjusted": 0.2109375, |
|
"epsilon_low_adjusted": 0.2109375, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20191854238510132, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0816, |
|
"num_tokens": 2827833.0, |
|
"reward": 0.0568242147564888, |
|
"reward_std": 0.5486289858818054, |
|
"rewards/cosine_scaled_reward/mean": -0.2059628963470459, |
|
"rewards/cosine_scaled_reward/std": 0.3543168008327484, |
|
"rewards/format_reward/mean": 0.46875, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.734375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1648.0, |
|
"completions/mean_length": 1799.359375, |
|
"completions/mean_terminated_length": 1111.941162109375, |
|
"completions/min_length": 496.0, |
|
"completions/min_terminated_length": 496.0, |
|
"epoch": 0.027428571428571427, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20426414906978607, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0531, |
|
"num_tokens": 2953920.0, |
|
"reward": -0.08145741373300552, |
|
"reward_std": 0.5499895811080933, |
|
"rewards/cosine_scaled_reward/mean": -0.18916621804237366, |
|
"rewards/cosine_scaled_reward/std": 0.33400654792785645, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1891.0, |
|
"completions/mean_length": 1627.65625, |
|
"completions/mean_terminated_length": 878.3478393554688, |
|
"completions/min_length": 460.0, |
|
"completions/min_terminated_length": 460.0, |
|
"epoch": 0.02857142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2195703387260437, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0179, |
|
"num_tokens": 3068386.0, |
|
"reward": 0.166388601064682, |
|
"reward_std": 0.6624079942703247, |
|
"rewards/cosine_scaled_reward/mean": -0.1199306920170784, |
|
"rewards/cosine_scaled_reward/std": 0.3789914548397064, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.78125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1969.0, |
|
"completions/mean_length": 1917.65625, |
|
"completions/mean_terminated_length": 1452.1429443359375, |
|
"completions/min_length": 652.0, |
|
"completions/min_terminated_length": 652.0, |
|
"epoch": 0.029714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21889279782772064, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0068, |
|
"num_tokens": 3201748.0, |
|
"reward": 0.21682679653167725, |
|
"reward_std": 0.5016080737113953, |
|
"rewards/cosine_scaled_reward/mean": -0.055649105459451675, |
|
"rewards/cosine_scaled_reward/std": 0.3608931601047516, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.78125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1958.0, |
|
"completions/mean_length": 1814.1875, |
|
"completions/mean_terminated_length": 979.1428833007812, |
|
"completions/min_length": 399.0, |
|
"completions/min_terminated_length": 399.0, |
|
"epoch": 0.030857142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21265852451324463, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0397, |
|
"num_tokens": 3328144.0, |
|
"reward": -0.08447478711605072, |
|
"reward_std": 0.48191577196121216, |
|
"rewards/cosine_scaled_reward/mean": -0.17504990100860596, |
|
"rewards/cosine_scaled_reward/std": 0.2491498440504074, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1964.0, |
|
"completions/mean_length": 1854.546875, |
|
"completions/mean_terminated_length": 1222.60009765625, |
|
"completions/min_length": 444.0, |
|
"completions/min_terminated_length": 444.0, |
|
"epoch": 0.032, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22452472150325775, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0473, |
|
"num_tokens": 3457171.0, |
|
"reward": 0.11354446411132812, |
|
"reward_std": 0.7764405608177185, |
|
"rewards/cosine_scaled_reward/mean": -0.10729026794433594, |
|
"rewards/cosine_scaled_reward/std": 0.42263516783714294, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1669.0, |
|
"completions/mean_length": 1899.125, |
|
"completions/mean_terminated_length": 1095.2000732421875, |
|
"completions/min_length": 482.0, |
|
"completions/min_terminated_length": 482.0, |
|
"epoch": 0.03314285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21414655447006226, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.079, |
|
"num_tokens": 3589187.0, |
|
"reward": -0.2690792381763458, |
|
"reward_std": 0.4953657388687134, |
|
"rewards/cosine_scaled_reward/mean": -0.2282896190881729, |
|
"rewards/cosine_scaled_reward/std": 0.20246519148349762, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0625, |
|
"completions/clipped_ratio": 0.796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1905.0, |
|
"completions/mean_length": 1904.59375, |
|
"completions/mean_terminated_length": 1342.0, |
|
"completions/min_length": 905.0, |
|
"completions/min_terminated_length": 905.0, |
|
"epoch": 0.03428571428571429, |
|
"epsilon_high_adjusted": 0.22187500000000002, |
|
"epsilon_low_adjusted": 0.22187500000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21034272015094757, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.011, |
|
"num_tokens": 3721617.0, |
|
"reward": 0.04454480856657028, |
|
"reward_std": 0.765734851360321, |
|
"rewards/cosine_scaled_reward/mean": -0.13397759199142456, |
|
"rewards/cosine_scaled_reward/std": 0.35358336567878723, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1971.0, |
|
"completions/mean_length": 1835.0625, |
|
"completions/mean_terminated_length": 912.3333740234375, |
|
"completions/min_length": 369.0, |
|
"completions/min_terminated_length": 369.0, |
|
"epoch": 0.03542857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20196685194969177, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0248, |
|
"num_tokens": 3849557.0, |
|
"reward": -0.2954040765762329, |
|
"reward_std": 0.5728512406349182, |
|
"rewards/cosine_scaled_reward/mean": -0.26488950848579407, |
|
"rewards/cosine_scaled_reward/std": 0.30517446994781494, |
|
"rewards/format_reward/mean": 0.234375, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.828125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2010.0, |
|
"completions/mean_length": 1934.859375, |
|
"completions/mean_terminated_length": 1389.727294921875, |
|
"completions/min_length": 696.0, |
|
"completions/min_terminated_length": 696.0, |
|
"epoch": 0.036571428571428574, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21605251729488373, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.0572, |
|
"num_tokens": 3983740.0, |
|
"reward": -0.1224304735660553, |
|
"reward_std": 0.7502877712249756, |
|
"rewards/cosine_scaled_reward/mean": -0.17059023678302765, |
|
"rewards/cosine_scaled_reward/std": 0.398355633020401, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.9375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1452.0, |
|
"completions/mean_length": 1997.625, |
|
"completions/mean_terminated_length": 1242.0, |
|
"completions/min_length": 1108.0, |
|
"completions/min_terminated_length": 1108.0, |
|
"epoch": 0.037714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21664302051067352, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": -0.0025, |
|
"num_tokens": 4122804.0, |
|
"reward": -0.4877340793609619, |
|
"reward_std": 0.35244429111480713, |
|
"rewards/cosine_scaled_reward/mean": -0.29074203968048096, |
|
"rewards/cosine_scaled_reward/std": 0.20779016613960266, |
|
"rewards/format_reward/mean": 0.09375, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.28125, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2034.0, |
|
"completions/mean_length": 1607.84375, |
|
"completions/mean_terminated_length": 1167.6875, |
|
"completions/min_length": 484.0, |
|
"completions/min_terminated_length": 484.0, |
|
"epoch": 0.038857142857142854, |
|
"epsilon_high_adjusted": 0.2703125, |
|
"epsilon_low_adjusted": 0.2703125, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21048779785633087, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0735, |
|
"num_tokens": 4235258.0, |
|
"reward": 0.5191864967346191, |
|
"reward_std": 0.8498681783676147, |
|
"rewards/cosine_scaled_reward/mean": -0.006031747907400131, |
|
"rewards/cosine_scaled_reward/std": 0.5057411193847656, |
|
"rewards/format_reward/mean": 0.53125, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.828125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2035.0, |
|
"completions/mean_length": 1834.703125, |
|
"completions/mean_terminated_length": 807.0, |
|
"completions/min_length": 429.0, |
|
"completions/min_terminated_length": 429.0, |
|
"epoch": 0.04, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2056789994239807, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0218, |
|
"num_tokens": 4364175.0, |
|
"reward": -0.20920395851135254, |
|
"reward_std": 0.6680670976638794, |
|
"rewards/cosine_scaled_reward/mean": -0.20616447925567627, |
|
"rewards/cosine_scaled_reward/std": 0.2824583351612091, |
|
"rewards/format_reward/mean": 0.203125, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.828125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1944.0, |
|
"completions/mean_length": 1948.0625, |
|
"completions/mean_terminated_length": 1466.5455322265625, |
|
"completions/min_length": 805.0, |
|
"completions/min_terminated_length": 805.0, |
|
"epoch": 0.04114285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23063796758651733, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0303, |
|
"num_tokens": 4499963.0, |
|
"reward": -0.2987564504146576, |
|
"reward_std": 0.4857916235923767, |
|
"rewards/cosine_scaled_reward/mean": -0.2353157252073288, |
|
"rewards/cosine_scaled_reward/std": 0.23094965517520905, |
|
"rewards/format_reward/mean": 0.171875, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2033.0, |
|
"completions/mean_length": 2002.296875, |
|
"completions/mean_terminated_length": 1463.0, |
|
"completions/min_length": 992.0, |
|
"completions/min_terminated_length": 992.0, |
|
"epoch": 0.04228571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21713794767856598, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0445, |
|
"num_tokens": 4639134.0, |
|
"reward": -0.38547438383102417, |
|
"reward_std": 0.39840468764305115, |
|
"rewards/cosine_scaled_reward/mean": -0.23961219191551208, |
|
"rewards/cosine_scaled_reward/std": 0.19388997554779053, |
|
"rewards/format_reward/mean": 0.09375, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1437.0, |
|
"completions/mean_length": 1932.40625, |
|
"completions/mean_terminated_length": 1123.25, |
|
"completions/min_length": 809.0, |
|
"completions/min_terminated_length": 809.0, |
|
"epoch": 0.04342857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23154403269290924, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0361, |
|
"num_tokens": 4774520.0, |
|
"reward": -0.15762007236480713, |
|
"reward_std": 0.4076302647590637, |
|
"rewards/cosine_scaled_reward/mean": -0.14912253618240356, |
|
"rewards/cosine_scaled_reward/std": 0.30022993683815, |
|
"rewards/format_reward/mean": 0.140625, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1862.0, |
|
"completions/mean_length": 1762.90625, |
|
"completions/mean_terminated_length": 831.6000366210938, |
|
"completions/min_length": 281.0, |
|
"completions/min_terminated_length": 281.0, |
|
"epoch": 0.044571428571428574, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19119793176651, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0427, |
|
"num_tokens": 4898130.0, |
|
"reward": 0.09733833372592926, |
|
"reward_std": 0.525000274181366, |
|
"rewards/cosine_scaled_reward/mean": -0.09976834803819656, |
|
"rewards/cosine_scaled_reward/std": 0.3302258253097534, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.578125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1934.0, |
|
"completions/mean_length": 1580.0625, |
|
"completions/mean_terminated_length": 938.8148193359375, |
|
"completions/min_length": 222.0, |
|
"completions/min_terminated_length": 222.0, |
|
"epoch": 0.045714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.198013037443161, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.043, |
|
"num_tokens": 5009326.0, |
|
"reward": 0.2950524389743805, |
|
"reward_std": 0.6312240958213806, |
|
"rewards/cosine_scaled_reward/mean": -0.07903628796339035, |
|
"rewards/cosine_scaled_reward/std": 0.4338403344154358, |
|
"rewards/format_reward/mean": 0.453125, |
|
"rewards/format_reward/std": 0.501733124256134, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.125, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1725.0, |
|
"completions/mean_length": 1816.5, |
|
"completions/mean_terminated_length": 1268.2105712890625, |
|
"completions/min_length": 671.0, |
|
"completions/min_terminated_length": 671.0, |
|
"epoch": 0.046857142857142854, |
|
"epsilon_high_adjusted": 0.2375, |
|
"epsilon_low_adjusted": 0.2375, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20052184164524078, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.049, |
|
"num_tokens": 5136734.0, |
|
"reward": -0.04274348169565201, |
|
"reward_std": 0.6400065422058105, |
|
"rewards/cosine_scaled_reward/mean": -0.2010592371225357, |
|
"rewards/cosine_scaled_reward/std": 0.31733086705207825, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2008.0, |
|
"completions/mean_length": 1649.609375, |
|
"completions/mean_terminated_length": 631.5, |
|
"completions/min_length": 209.0, |
|
"completions/min_terminated_length": 209.0, |
|
"epoch": 0.048, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21507494151592255, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.035, |
|
"num_tokens": 5252669.0, |
|
"reward": -0.1898493468761444, |
|
"reward_std": 0.3643849194049835, |
|
"rewards/cosine_scaled_reward/mean": -0.2355496734380722, |
|
"rewards/cosine_scaled_reward/std": 0.18636876344680786, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2012.0, |
|
"completions/mean_length": 1699.6875, |
|
"completions/mean_terminated_length": 933.4000244140625, |
|
"completions/min_length": 456.0, |
|
"completions/min_terminated_length": 456.0, |
|
"epoch": 0.04914285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1977306753396988, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.0412, |
|
"num_tokens": 5373065.0, |
|
"reward": 0.10587131977081299, |
|
"reward_std": 0.5526303052902222, |
|
"rewards/cosine_scaled_reward/mean": -0.1033143401145935, |
|
"rewards/cosine_scaled_reward/std": 0.3173230290412903, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1779.0, |
|
"completions/mean_length": 1585.3125, |
|
"completions/mean_terminated_length": 760.521728515625, |
|
"completions/min_length": 251.0, |
|
"completions/min_terminated_length": 251.0, |
|
"epoch": 0.05028571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18540318310260773, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0628, |
|
"num_tokens": 5485221.0, |
|
"reward": 0.1987997144460678, |
|
"reward_std": 0.7999765872955322, |
|
"rewards/cosine_scaled_reward/mean": -0.10372515767812729, |
|
"rewards/cosine_scaled_reward/std": 0.43109309673309326, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.1875, |
|
"completions/clipped_ratio": 0.953125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1830.0, |
|
"completions/mean_length": 2018.203125, |
|
"completions/mean_terminated_length": 1412.3333740234375, |
|
"completions/min_length": 1141.0, |
|
"completions/min_terminated_length": 1141.0, |
|
"epoch": 0.05142857142857143, |
|
"epsilon_high_adjusted": 0.246875, |
|
"epsilon_low_adjusted": 0.246875, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22048652172088623, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": -0.012, |
|
"num_tokens": 5626042.0, |
|
"reward": -0.2534261643886566, |
|
"reward_std": 0.46692758798599243, |
|
"rewards/cosine_scaled_reward/mean": -0.1892130821943283, |
|
"rewards/cosine_scaled_reward/std": 0.3184278905391693, |
|
"rewards/format_reward/mean": 0.125, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1438.0, |
|
"completions/mean_length": 1841.4375, |
|
"completions/mean_terminated_length": 946.3333740234375, |
|
"completions/min_length": 674.0, |
|
"completions/min_terminated_length": 674.0, |
|
"epoch": 0.052571428571428575, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2475695013999939, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0372, |
|
"num_tokens": 5755062.0, |
|
"reward": -0.2845799922943115, |
|
"reward_std": 0.37563323974609375, |
|
"rewards/cosine_scaled_reward/mean": -0.23603998124599457, |
|
"rewards/cosine_scaled_reward/std": 0.17336885631084442, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2008.0, |
|
"completions/mean_length": 1692.21875, |
|
"completions/mean_terminated_length": 849.5789794921875, |
|
"completions/min_length": 287.0, |
|
"completions/min_terminated_length": 287.0, |
|
"epoch": 0.053714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20771269500255585, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0472, |
|
"num_tokens": 5873764.0, |
|
"reward": 0.05584225058555603, |
|
"reward_std": 0.7296371459960938, |
|
"rewards/cosine_scaled_reward/mean": -0.1361413598060608, |
|
"rewards/cosine_scaled_reward/std": 0.43819770216941833, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.03125, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1544.0, |
|
"completions/mean_length": 1739.1875, |
|
"completions/mean_terminated_length": 950.0, |
|
"completions/min_length": 531.0, |
|
"completions/min_terminated_length": 531.0, |
|
"epoch": 0.054857142857142854, |
|
"epsilon_high_adjusted": 0.2109375, |
|
"epsilon_low_adjusted": 0.2109375, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19528591632843018, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0163, |
|
"num_tokens": 5995616.0, |
|
"reward": 0.06744927167892456, |
|
"reward_std": 0.6208744049072266, |
|
"rewards/cosine_scaled_reward/mean": -0.10690036416053772, |
|
"rewards/cosine_scaled_reward/std": 0.3725154995918274, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1689.0, |
|
"completions/mean_length": 1523.296875, |
|
"completions/mean_terminated_length": 890.0344848632812, |
|
"completions/min_length": 242.0, |
|
"completions/min_terminated_length": 242.0, |
|
"epoch": 0.056, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17805328965187073, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0665, |
|
"num_tokens": 6103171.0, |
|
"reward": 0.17353929579257965, |
|
"reward_std": 0.6857056617736816, |
|
"rewards/cosine_scaled_reward/mean": -0.14760535955429077, |
|
"rewards/cosine_scaled_reward/std": 0.40281444787979126, |
|
"rewards/format_reward/mean": 0.46875, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1965.0, |
|
"completions/mean_length": 1694.578125, |
|
"completions/mean_terminated_length": 857.5263061523438, |
|
"completions/min_length": 344.0, |
|
"completions/min_terminated_length": 344.0, |
|
"epoch": 0.05714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17739826440811157, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0495, |
|
"num_tokens": 6222440.0, |
|
"reward": 0.12661927938461304, |
|
"reward_std": 0.588830828666687, |
|
"rewards/cosine_scaled_reward/mean": -0.08512786030769348, |
|
"rewards/cosine_scaled_reward/std": 0.43910878896713257, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0625, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1975.0, |
|
"completions/mean_length": 1466.984375, |
|
"completions/mean_terminated_length": 885.96875, |
|
"completions/min_length": 376.0, |
|
"completions/min_terminated_length": 376.0, |
|
"epoch": 0.05828571428571429, |
|
"epsilon_high_adjusted": 0.22187500000000002, |
|
"epsilon_low_adjusted": 0.22187500000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20479989051818848, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0226, |
|
"num_tokens": 6326535.0, |
|
"reward": 0.3101663589477539, |
|
"reward_std": 0.6823086738586426, |
|
"rewards/cosine_scaled_reward/mean": -0.10272930562496185, |
|
"rewards/cosine_scaled_reward/std": 0.344821572303772, |
|
"rewards/format_reward/mean": 0.515625, |
|
"rewards/format_reward/std": 0.5037065148353577, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.125, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1510.0, |
|
"completions/mean_length": 1744.640625, |
|
"completions/mean_terminated_length": 834.5625, |
|
"completions/min_length": 280.0, |
|
"completions/min_terminated_length": 280.0, |
|
"epoch": 0.05942857142857143, |
|
"epsilon_high_adjusted": 0.23750000000000002, |
|
"epsilon_low_adjusted": 0.23750000000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19819842278957367, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0083, |
|
"num_tokens": 6449544.0, |
|
"reward": -0.00986415147781372, |
|
"reward_std": 0.685615062713623, |
|
"rewards/cosine_scaled_reward/mean": -0.14555707573890686, |
|
"rewards/cosine_scaled_reward/std": 0.41420355439186096, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1977.0, |
|
"completions/mean_length": 1733.171875, |
|
"completions/mean_terminated_length": 1040.550048828125, |
|
"completions/min_length": 525.0, |
|
"completions/min_terminated_length": 525.0, |
|
"epoch": 0.060571428571428575, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20193032920360565, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0262, |
|
"num_tokens": 6571299.0, |
|
"reward": 0.08663126826286316, |
|
"reward_std": 0.661508321762085, |
|
"rewards/cosine_scaled_reward/mean": -0.15980936586856842, |
|
"rewards/cosine_scaled_reward/std": 0.30268651247024536, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.59375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1997.0, |
|
"completions/mean_length": 1674.375, |
|
"completions/mean_terminated_length": 1128.3077392578125, |
|
"completions/min_length": 660.0, |
|
"completions/min_terminated_length": 660.0, |
|
"epoch": 0.061714285714285715, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19670119881629944, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0604, |
|
"num_tokens": 6689603.0, |
|
"reward": 0.6827424764633179, |
|
"reward_std": 0.8742384910583496, |
|
"rewards/cosine_scaled_reward/mean": 0.07574622333049774, |
|
"rewards/cosine_scaled_reward/std": 0.5349056124687195, |
|
"rewards/format_reward/mean": 0.53125, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1565.0, |
|
"completions/mean_length": 1711.09375, |
|
"completions/mean_terminated_length": 850.1111450195312, |
|
"completions/min_length": 611.0, |
|
"completions/min_terminated_length": 611.0, |
|
"epoch": 0.06285714285714286, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20655271410942078, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0611, |
|
"num_tokens": 6809401.0, |
|
"reward": 0.08179579675197601, |
|
"reward_std": 0.5935317277908325, |
|
"rewards/cosine_scaled_reward/mean": -0.099727101624012, |
|
"rewards/cosine_scaled_reward/std": 0.41786429286003113, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1946.0, |
|
"completions/mean_length": 1734.6875, |
|
"completions/mean_terminated_length": 1093.142822265625, |
|
"completions/min_length": 556.0, |
|
"completions/min_terminated_length": 556.0, |
|
"epoch": 0.064, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.202525794506073, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0524, |
|
"num_tokens": 6931381.0, |
|
"reward": 0.15938684344291687, |
|
"reward_std": 0.5165223479270935, |
|
"rewards/cosine_scaled_reward/mean": -0.10780657827854156, |
|
"rewards/cosine_scaled_reward/std": 0.4498305320739746, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.828125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1849.0, |
|
"completions/mean_length": 1927.875, |
|
"completions/mean_terminated_length": 1349.0909423828125, |
|
"completions/min_length": 554.0, |
|
"completions/min_terminated_length": 554.0, |
|
"epoch": 0.06514285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19767992198467255, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.011, |
|
"num_tokens": 7066333.0, |
|
"reward": -0.3306891620159149, |
|
"reward_std": 0.4264768958091736, |
|
"rewards/cosine_scaled_reward/mean": -0.28253209590911865, |
|
"rewards/cosine_scaled_reward/std": 0.2055179625749588, |
|
"rewards/format_reward/mean": 0.234375, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.421875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1905.0, |
|
"completions/mean_length": 1486.875, |
|
"completions/mean_terminated_length": 1077.4053955078125, |
|
"completions/min_length": 432.0, |
|
"completions/min_terminated_length": 432.0, |
|
"epoch": 0.06628571428571428, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17724664509296417, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.0289, |
|
"num_tokens": 7171589.0, |
|
"reward": 0.5129991769790649, |
|
"reward_std": 0.8471602201461792, |
|
"rewards/cosine_scaled_reward/mean": -0.048187918961048126, |
|
"rewards/cosine_scaled_reward/std": 0.4703964293003082, |
|
"rewards/format_reward/mean": 0.609375, |
|
"rewards/format_reward/std": 0.4917473793029785, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1912.0, |
|
"completions/mean_length": 1684.71875, |
|
"completions/mean_terminated_length": 1037.1304931640625, |
|
"completions/min_length": 369.0, |
|
"completions/min_terminated_length": 369.0, |
|
"epoch": 0.06742857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17898693680763245, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0154, |
|
"num_tokens": 7289875.0, |
|
"reward": 0.19179533421993256, |
|
"reward_std": 0.6819975972175598, |
|
"rewards/cosine_scaled_reward/mean": -0.11503982543945312, |
|
"rewards/cosine_scaled_reward/std": 0.4170202612876892, |
|
"rewards/format_reward/mean": 0.421875, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1773.0, |
|
"completions/mean_length": 1784.640625, |
|
"completions/mean_terminated_length": 924.3333740234375, |
|
"completions/min_length": 501.0, |
|
"completions/min_terminated_length": 501.0, |
|
"epoch": 0.06857142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18929247558116913, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0206, |
|
"num_tokens": 7414980.0, |
|
"reward": -0.2165871113538742, |
|
"reward_std": 0.4006432890892029, |
|
"rewards/cosine_scaled_reward/mean": -0.2411060631275177, |
|
"rewards/cosine_scaled_reward/std": 0.22077766060829163, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2035.0, |
|
"completions/mean_length": 1769.8125, |
|
"completions/mean_terminated_length": 1058.888916015625, |
|
"completions/min_length": 470.0, |
|
"completions/min_terminated_length": 470.0, |
|
"epoch": 0.06971428571428571, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18952307105064392, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0408, |
|
"num_tokens": 7539768.0, |
|
"reward": 0.1118076741695404, |
|
"reward_std": 0.7766213417053223, |
|
"rewards/cosine_scaled_reward/mean": -0.1315961629152298, |
|
"rewards/cosine_scaled_reward/std": 0.3166446387767792, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1766.0, |
|
"completions/mean_length": 1529.890625, |
|
"completions/mean_terminated_length": 904.586181640625, |
|
"completions/min_length": 337.0, |
|
"completions/min_terminated_length": 337.0, |
|
"epoch": 0.07085714285714285, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17763926088809967, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.067, |
|
"num_tokens": 7647913.0, |
|
"reward": 0.33338215947151184, |
|
"reward_std": 0.7777395248413086, |
|
"rewards/cosine_scaled_reward/mean": -0.07549642026424408, |
|
"rewards/cosine_scaled_reward/std": 0.42954275012016296, |
|
"rewards/format_reward/mean": 0.484375, |
|
"rewards/format_reward/std": 0.5037065148353577, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.421875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1890.0, |
|
"completions/mean_length": 1445.671875, |
|
"completions/mean_terminated_length": 1006.1351318359375, |
|
"completions/min_length": 326.0, |
|
"completions/min_terminated_length": 326.0, |
|
"epoch": 0.072, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19693626463413239, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.0716, |
|
"num_tokens": 7750692.0, |
|
"reward": 0.4808192253112793, |
|
"reward_std": 0.7516437768936157, |
|
"rewards/cosine_scaled_reward/mean": -0.05646540969610214, |
|
"rewards/cosine_scaled_reward/std": 0.43912824988365173, |
|
"rewards/format_reward/mean": 0.59375, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1802.0, |
|
"completions/mean_length": 1667.59375, |
|
"completions/mean_terminated_length": 941.3636474609375, |
|
"completions/min_length": 481.0, |
|
"completions/min_terminated_length": 481.0, |
|
"epoch": 0.07314285714285715, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21556542813777924, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0673, |
|
"num_tokens": 7868370.0, |
|
"reward": -0.013222754001617432, |
|
"reward_std": 0.603374183177948, |
|
"rewards/cosine_scaled_reward/mean": -0.17848637700080872, |
|
"rewards/cosine_scaled_reward/std": 0.3722720146179199, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1521.0, |
|
"completions/mean_length": 1636.25, |
|
"completions/mean_terminated_length": 661.0526123046875, |
|
"completions/min_length": 300.0, |
|
"completions/min_terminated_length": 300.0, |
|
"epoch": 0.07428571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.196330264210701, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0079, |
|
"num_tokens": 7983794.0, |
|
"reward": -0.02694564312696457, |
|
"reward_std": 0.43651264905929565, |
|
"rewards/cosine_scaled_reward/mean": -0.17753534018993378, |
|
"rewards/cosine_scaled_reward/std": 0.3935491740703583, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1875.0, |
|
"completions/mean_length": 1233.84375, |
|
"completions/mean_terminated_length": 745.3500366210938, |
|
"completions/min_length": 269.0, |
|
"completions/min_terminated_length": 269.0, |
|
"epoch": 0.07542857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15430262684822083, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0418, |
|
"num_tokens": 8072992.0, |
|
"reward": 0.660415530204773, |
|
"reward_std": 0.8181198835372925, |
|
"rewards/cosine_scaled_reward/mean": 0.017707787454128265, |
|
"rewards/cosine_scaled_reward/std": 0.4732138216495514, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1643.0, |
|
"completions/mean_length": 1831.359375, |
|
"completions/mean_terminated_length": 981.4615478515625, |
|
"completions/min_length": 689.0, |
|
"completions/min_terminated_length": 689.0, |
|
"epoch": 0.07657142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18633423745632172, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0704, |
|
"num_tokens": 8200935.0, |
|
"reward": -0.28378796577453613, |
|
"reward_std": 0.5433966517448425, |
|
"rewards/cosine_scaled_reward/mean": -0.25908148288726807, |
|
"rewards/cosine_scaled_reward/std": 0.23884578049182892, |
|
"rewards/format_reward/mean": 0.234375, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1710.0, |
|
"completions/mean_length": 945.390625, |
|
"completions/mean_terminated_length": 636.6599731445312, |
|
"completions/min_length": 222.0, |
|
"completions/min_terminated_length": 222.0, |
|
"epoch": 0.07771428571428571, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1517961174249649, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.084, |
|
"num_tokens": 8270696.0, |
|
"reward": 0.8194406032562256, |
|
"reward_std": 0.7585938572883606, |
|
"rewards/cosine_scaled_reward/mean": 0.019095297902822495, |
|
"rewards/cosine_scaled_reward/std": 0.46527862548828125, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.46875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1374.0, |
|
"completions/mean_length": 1397.921875, |
|
"completions/mean_terminated_length": 824.3235473632812, |
|
"completions/min_length": 273.0, |
|
"completions/min_terminated_length": 273.0, |
|
"epoch": 0.07885714285714286, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1929248720407486, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0769, |
|
"num_tokens": 8370507.0, |
|
"reward": 0.018557976931333542, |
|
"reward_std": 0.6102144122123718, |
|
"rewards/cosine_scaled_reward/mean": -0.24072101712226868, |
|
"rewards/cosine_scaled_reward/std": 0.2912290096282959, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1670.0, |
|
"completions/mean_length": 1859.625, |
|
"completions/mean_terminated_length": 842.4000244140625, |
|
"completions/min_length": 510.0, |
|
"completions/min_terminated_length": 510.0, |
|
"epoch": 0.08, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20638665556907654, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.03, |
|
"num_tokens": 8500979.0, |
|
"reward": -0.2084158957004547, |
|
"reward_std": 0.5423504114151001, |
|
"rewards/cosine_scaled_reward/mean": -0.23702044785022736, |
|
"rewards/cosine_scaled_reward/std": 0.24943575263023376, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.1875, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1880.0, |
|
"completions/mean_length": 1677.53125, |
|
"completions/mean_terminated_length": 1060.0833740234375, |
|
"completions/min_length": 362.0, |
|
"completions/min_terminated_length": 362.0, |
|
"epoch": 0.08114285714285714, |
|
"epsilon_high_adjusted": 0.25625, |
|
"epsilon_low_adjusted": 0.25625, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20180849730968475, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.042, |
|
"num_tokens": 8619061.0, |
|
"reward": 0.19083726406097412, |
|
"reward_std": 0.4570698142051697, |
|
"rewards/cosine_scaled_reward/mean": -0.09989387542009354, |
|
"rewards/cosine_scaled_reward/std": 0.5225576758384705, |
|
"rewards/format_reward/mean": 0.390625, |
|
"rewards/format_reward/std": 0.4917473793029785, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1769.0, |
|
"completions/mean_length": 1432.4375, |
|
"completions/mean_terminated_length": 953.6666870117188, |
|
"completions/min_length": 432.0, |
|
"completions/min_terminated_length": 432.0, |
|
"epoch": 0.08228571428571428, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18141792714595795, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0705, |
|
"num_tokens": 8721089.0, |
|
"reward": 0.1964300572872162, |
|
"reward_std": 0.4526848793029785, |
|
"rewards/cosine_scaled_reward/mean": -0.1830349564552307, |
|
"rewards/cosine_scaled_reward/std": 0.23432116210460663, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.859375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1711.0, |
|
"completions/mean_length": 1940.59375, |
|
"completions/mean_terminated_length": 1284.2222900390625, |
|
"completions/min_length": 712.0, |
|
"completions/min_terminated_length": 712.0, |
|
"epoch": 0.08342857142857144, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19840198755264282, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0456, |
|
"num_tokens": 8856015.0, |
|
"reward": -0.27154141664505005, |
|
"reward_std": 0.5744385719299316, |
|
"rewards/cosine_scaled_reward/mean": -0.24514569342136383, |
|
"rewards/cosine_scaled_reward/std": 0.2793368399143219, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.375, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1927.0, |
|
"completions/mean_length": 1764.953125, |
|
"completions/mean_terminated_length": 1041.611083984375, |
|
"completions/min_length": 508.0, |
|
"completions/min_terminated_length": 508.0, |
|
"epoch": 0.08457142857142858, |
|
"epsilon_high_adjusted": 0.275, |
|
"epsilon_low_adjusted": 0.275, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1787436455488205, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0899, |
|
"num_tokens": 8978900.0, |
|
"reward": 0.19575530290603638, |
|
"reward_std": 0.7323085069656372, |
|
"rewards/cosine_scaled_reward/mean": -0.06618484109640121, |
|
"rewards/cosine_scaled_reward/std": 0.48987188935279846, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.59375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2039.0, |
|
"completions/mean_length": 1696.3125, |
|
"completions/mean_terminated_length": 1182.3077392578125, |
|
"completions/min_length": 664.0, |
|
"completions/min_terminated_length": 664.0, |
|
"epoch": 0.08571428571428572, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18963930010795593, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0517, |
|
"num_tokens": 9098424.0, |
|
"reward": 0.3912990391254425, |
|
"reward_std": 0.6338691115379333, |
|
"rewards/cosine_scaled_reward/mean": -0.023100484162569046, |
|
"rewards/cosine_scaled_reward/std": 0.47905832529067993, |
|
"rewards/format_reward/mean": 0.4375, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1340.0, |
|
"completions/mean_length": 1663.8125, |
|
"completions/mean_terminated_length": 682.0, |
|
"completions/min_length": 306.0, |
|
"completions/min_terminated_length": 306.0, |
|
"epoch": 0.08685714285714285, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1930101066827774, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0121, |
|
"num_tokens": 9215308.0, |
|
"reward": -0.3120652437210083, |
|
"reward_std": 0.3665449023246765, |
|
"rewards/cosine_scaled_reward/mean": -0.29665762186050415, |
|
"rewards/cosine_scaled_reward/std": 0.17376884818077087, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1849.0, |
|
"completions/mean_length": 1733.296875, |
|
"completions/mean_terminated_length": 1040.9500732421875, |
|
"completions/min_length": 568.0, |
|
"completions/min_terminated_length": 568.0, |
|
"epoch": 0.088, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1888083517551422, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0391, |
|
"num_tokens": 9337815.0, |
|
"reward": 0.12663224339485168, |
|
"reward_std": 0.4842023551464081, |
|
"rewards/cosine_scaled_reward/mean": -0.10855888575315475, |
|
"rewards/cosine_scaled_reward/std": 0.3368559777736664, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1698.0, |
|
"completions/mean_length": 1747.84375, |
|
"completions/mean_terminated_length": 1036.9473876953125, |
|
"completions/min_length": 634.0, |
|
"completions/min_terminated_length": 634.0, |
|
"epoch": 0.08914285714285715, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18638849258422852, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0141, |
|
"num_tokens": 9460397.0, |
|
"reward": 0.23273038864135742, |
|
"reward_std": 0.6073111295700073, |
|
"rewards/cosine_scaled_reward/mean": -0.047697313129901886, |
|
"rewards/cosine_scaled_reward/std": 0.48325926065444946, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1959.0, |
|
"completions/mean_length": 1371.6875, |
|
"completions/mean_terminated_length": 845.6666870117188, |
|
"completions/min_length": 231.0, |
|
"completions/min_terminated_length": 231.0, |
|
"epoch": 0.09028571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1628364771604538, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.0366, |
|
"num_tokens": 9558281.0, |
|
"reward": 0.29462432861328125, |
|
"reward_std": 0.553225040435791, |
|
"rewards/cosine_scaled_reward/mean": -0.13393783569335938, |
|
"rewards/cosine_scaled_reward/std": 0.3635351061820984, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1994.0, |
|
"completions/mean_length": 1768.25, |
|
"completions/mean_terminated_length": 1234.181884765625, |
|
"completions/min_length": 385.0, |
|
"completions/min_terminated_length": 385.0, |
|
"epoch": 0.09142857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19675402343273163, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0099, |
|
"num_tokens": 9681841.0, |
|
"reward": 0.07091247290372849, |
|
"reward_std": 0.7199949026107788, |
|
"rewards/cosine_scaled_reward/mean": -0.17548125982284546, |
|
"rewards/cosine_scaled_reward/std": 0.39285531640052795, |
|
"rewards/format_reward/mean": 0.421875, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1883.0, |
|
"completions/mean_length": 1689.15625, |
|
"completions/mean_terminated_length": 899.7000122070312, |
|
"completions/min_length": 471.0, |
|
"completions/min_terminated_length": 471.0, |
|
"epoch": 0.09257142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2319558709859848, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0239, |
|
"num_tokens": 9801219.0, |
|
"reward": 0.11250954866409302, |
|
"reward_std": 0.6092942953109741, |
|
"rewards/cosine_scaled_reward/mean": -0.1468702107667923, |
|
"rewards/cosine_scaled_reward/std": 0.3265360891819, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.515625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1900.0, |
|
"completions/mean_length": 1523.6875, |
|
"completions/mean_terminated_length": 965.54833984375, |
|
"completions/min_length": 474.0, |
|
"completions/min_terminated_length": 474.0, |
|
"epoch": 0.09371428571428571, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2026216685771942, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0721, |
|
"num_tokens": 9909063.0, |
|
"reward": 0.5980618000030518, |
|
"reward_std": 0.7195340394973755, |
|
"rewards/cosine_scaled_reward/mean": 0.04903092980384827, |
|
"rewards/cosine_scaled_reward/std": 0.5532049536705017, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.53125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1988.0, |
|
"completions/mean_length": 1559.015625, |
|
"completions/mean_terminated_length": 1004.8333740234375, |
|
"completions/min_length": 459.0, |
|
"completions/min_terminated_length": 459.0, |
|
"epoch": 0.09485714285714286, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19982481002807617, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.054, |
|
"num_tokens": 10019928.0, |
|
"reward": 0.3029702305793762, |
|
"reward_std": 0.49975115060806274, |
|
"rewards/cosine_scaled_reward/mean": -0.09851488471031189, |
|
"rewards/cosine_scaled_reward/std": 0.3822130560874939, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.59375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1714.0, |
|
"completions/mean_length": 1600.859375, |
|
"completions/mean_terminated_length": 947.34619140625, |
|
"completions/min_length": 433.0, |
|
"completions/min_terminated_length": 433.0, |
|
"epoch": 0.096, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18633437156677246, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0865, |
|
"num_tokens": 10133079.0, |
|
"reward": 0.08319186419248581, |
|
"reward_std": 0.6657856702804565, |
|
"rewards/cosine_scaled_reward/mean": -0.1927790641784668, |
|
"rewards/cosine_scaled_reward/std": 0.3336566388607025, |
|
"rewards/format_reward/mean": 0.46875, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.484375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2030.0, |
|
"completions/mean_length": 1632.65625, |
|
"completions/mean_terminated_length": 1242.48486328125, |
|
"completions/min_length": 666.0, |
|
"completions/min_terminated_length": 666.0, |
|
"epoch": 0.09714285714285714, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16532936692237854, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0714, |
|
"num_tokens": 10247889.0, |
|
"reward": 0.259367436170578, |
|
"reward_std": 0.7487314939498901, |
|
"rewards/cosine_scaled_reward/mean": -0.151566281914711, |
|
"rewards/cosine_scaled_reward/std": 0.3289223909378052, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1633.0, |
|
"completions/mean_length": 1671.6875, |
|
"completions/mean_terminated_length": 953.2727661132812, |
|
"completions/min_length": 617.0, |
|
"completions/min_terminated_length": 617.0, |
|
"epoch": 0.09828571428571428, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19505523145198822, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0388, |
|
"num_tokens": 10365733.0, |
|
"reward": 0.07625935971736908, |
|
"reward_std": 0.707979679107666, |
|
"rewards/cosine_scaled_reward/mean": -0.14937034249305725, |
|
"rewards/cosine_scaled_reward/std": 0.39694419503211975, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.484375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1800.0, |
|
"completions/mean_length": 1431.3125, |
|
"completions/mean_terminated_length": 852.0, |
|
"completions/min_length": 423.0, |
|
"completions/min_terminated_length": 423.0, |
|
"epoch": 0.09942857142857142, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21713945269584656, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0879, |
|
"num_tokens": 10467457.0, |
|
"reward": 0.15931269526481628, |
|
"reward_std": 0.6175140142440796, |
|
"rewards/cosine_scaled_reward/mean": -0.18596863746643066, |
|
"rewards/cosine_scaled_reward/std": 0.2911415696144104, |
|
"rewards/format_reward/mean": 0.53125, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2047.0, |
|
"completions/mean_length": 1565.265625, |
|
"completions/mean_terminated_length": 944.607177734375, |
|
"completions/min_length": 100.0, |
|
"completions/min_terminated_length": 100.0, |
|
"epoch": 0.10057142857142858, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20137667655944824, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0868, |
|
"num_tokens": 10578146.0, |
|
"reward": 0.4069916307926178, |
|
"reward_std": 0.7737945318222046, |
|
"rewards/cosine_scaled_reward/mean": -0.023066692054271698, |
|
"rewards/cosine_scaled_reward/std": 0.39842066168785095, |
|
"rewards/format_reward/mean": 0.453125, |
|
"rewards/format_reward/std": 0.501733124256134, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.1875, |
|
"completions/clipped_ratio": 0.40625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2011.0, |
|
"completions/mean_length": 1554.59375, |
|
"completions/mean_terminated_length": 1217.0, |
|
"completions/min_length": 537.0, |
|
"completions/min_terminated_length": 537.0, |
|
"epoch": 0.10171428571428572, |
|
"epsilon_high_adjusted": 0.246875, |
|
"epsilon_low_adjusted": 0.246875, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18177422881126404, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0376, |
|
"num_tokens": 10688640.0, |
|
"reward": 0.5939216017723083, |
|
"reward_std": 0.7109141945838928, |
|
"rewards/cosine_scaled_reward/mean": -0.046789199113845825, |
|
"rewards/cosine_scaled_reward/std": 0.46814003586769104, |
|
"rewards/format_reward/mean": 0.6875, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1729.0, |
|
"completions/mean_length": 1442.46875, |
|
"completions/mean_terminated_length": 971.5, |
|
"completions/min_length": 374.0, |
|
"completions/min_terminated_length": 374.0, |
|
"epoch": 0.10285714285714286, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20881003141403198, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0727, |
|
"num_tokens": 10790958.0, |
|
"reward": 0.2375994473695755, |
|
"reward_std": 0.6029033660888672, |
|
"rewards/cosine_scaled_reward/mean": -0.17807528376579285, |
|
"rewards/cosine_scaled_reward/std": 0.2965840995311737, |
|
"rewards/format_reward/mean": 0.59375, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1871.0, |
|
"completions/mean_length": 1856.3125, |
|
"completions/mean_terminated_length": 1366.4444580078125, |
|
"completions/min_length": 886.0, |
|
"completions/min_terminated_length": 886.0, |
|
"epoch": 0.104, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20903776586055756, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0415, |
|
"num_tokens": 10920330.0, |
|
"reward": -0.05261840671300888, |
|
"reward_std": 0.6418163776397705, |
|
"rewards/cosine_scaled_reward/mean": -0.19818420708179474, |
|
"rewards/cosine_scaled_reward/std": 0.30021125078201294, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.40625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2027.0, |
|
"completions/mean_length": 1401.984375, |
|
"completions/mean_terminated_length": 959.9736938476562, |
|
"completions/min_length": 416.0, |
|
"completions/min_terminated_length": 416.0, |
|
"epoch": 0.10514285714285715, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1638815999031067, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0186, |
|
"num_tokens": 11020049.0, |
|
"reward": 0.21965916454792023, |
|
"reward_std": 0.5625333786010742, |
|
"rewards/cosine_scaled_reward/mean": -0.19485794007778168, |
|
"rewards/cosine_scaled_reward/std": 0.27887240052223206, |
|
"rewards/format_reward/mean": 0.609375, |
|
"rewards/format_reward/std": 0.4917473793029785, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.78125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1866.0, |
|
"completions/mean_length": 1846.921875, |
|
"completions/mean_terminated_length": 1128.7857666015625, |
|
"completions/min_length": 544.0, |
|
"completions/min_terminated_length": 544.0, |
|
"epoch": 0.10628571428571429, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22502079606056213, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0765, |
|
"num_tokens": 11149596.0, |
|
"reward": -0.34294235706329346, |
|
"reward_std": 0.4486418664455414, |
|
"rewards/cosine_scaled_reward/mean": -0.28084617853164673, |
|
"rewards/cosine_scaled_reward/std": 0.18859638273715973, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1641.0, |
|
"completions/mean_length": 1736.5, |
|
"completions/mean_terminated_length": 998.7368774414062, |
|
"completions/min_length": 536.0, |
|
"completions/min_terminated_length": 536.0, |
|
"epoch": 0.10742857142857143, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2263384759426117, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0247, |
|
"num_tokens": 11270500.0, |
|
"reward": 0.068375363945961, |
|
"reward_std": 0.3989260196685791, |
|
"rewards/cosine_scaled_reward/mean": -0.1220623031258583, |
|
"rewards/cosine_scaled_reward/std": 0.4283704161643982, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1739.734375, |
|
"completions/mean_terminated_length": 1151.227294921875, |
|
"completions/min_length": 620.0, |
|
"completions/min_terminated_length": 620.0, |
|
"epoch": 0.10857142857142857, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18776042759418488, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0606, |
|
"num_tokens": 11392371.0, |
|
"reward": -0.18143336474895477, |
|
"reward_std": 0.5141602158546448, |
|
"rewards/cosine_scaled_reward/mean": -0.2625916600227356, |
|
"rewards/cosine_scaled_reward/std": 0.23184041678905487, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1686.0, |
|
"completions/mean_length": 1542.140625, |
|
"completions/mean_terminated_length": 931.6206665039062, |
|
"completions/min_length": 391.0, |
|
"completions/min_terminated_length": 391.0, |
|
"epoch": 0.10971428571428571, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17707978188991547, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0335, |
|
"num_tokens": 11502204.0, |
|
"reward": 0.4653927683830261, |
|
"reward_std": 0.6023179292678833, |
|
"rewards/cosine_scaled_reward/mean": 0.006133869290351868, |
|
"rewards/cosine_scaled_reward/std": 0.4863370656967163, |
|
"rewards/format_reward/mean": 0.453125, |
|
"rewards/format_reward/std": 0.501733124256134, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.125, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2041.0, |
|
"completions/mean_length": 1533.03125, |
|
"completions/mean_terminated_length": 1132.5, |
|
"completions/min_length": 358.0, |
|
"completions/min_terminated_length": 358.0, |
|
"epoch": 0.11085714285714286, |
|
"epsilon_high_adjusted": 0.24375000000000002, |
|
"epsilon_low_adjusted": 0.24375000000000002, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1965816169977188, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.036, |
|
"num_tokens": 11610582.0, |
|
"reward": 0.42514127492904663, |
|
"reward_std": 0.6956003904342651, |
|
"rewards/cosine_scaled_reward/mean": -0.10774186253547668, |
|
"rewards/cosine_scaled_reward/std": 0.3644869327545166, |
|
"rewards/format_reward/mean": 0.640625, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.453125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1889.0, |
|
"completions/mean_length": 1502.15625, |
|
"completions/mean_terminated_length": 1049.8857421875, |
|
"completions/min_length": 398.0, |
|
"completions/min_terminated_length": 398.0, |
|
"epoch": 0.112, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19055023789405823, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.0435, |
|
"num_tokens": 11717784.0, |
|
"reward": 0.3386792242527008, |
|
"reward_std": 0.560700535774231, |
|
"rewards/cosine_scaled_reward/mean": -0.1197228878736496, |
|
"rewards/cosine_scaled_reward/std": 0.3668956160545349, |
|
"rewards/format_reward/mean": 0.578125, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.375, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1972.0, |
|
"completions/mean_length": 1663.796875, |
|
"completions/mean_terminated_length": 753.8421020507812, |
|
"completions/min_length": 384.0, |
|
"completions/min_terminated_length": 384.0, |
|
"epoch": 0.11314285714285714, |
|
"epsilon_high_adjusted": 0.275, |
|
"epsilon_low_adjusted": 0.275, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17839431762695312, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0419, |
|
"num_tokens": 11835435.0, |
|
"reward": 0.13991518318653107, |
|
"reward_std": 0.4746280312538147, |
|
"rewards/cosine_scaled_reward/mean": -0.08629240095615387, |
|
"rewards/cosine_scaled_reward/std": 0.40295156836509705, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completion_accuracy": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1838.0, |
|
"completions/mean_length": 1376.015625, |
|
"completions/mean_terminated_length": 972.8250122070312, |
|
"completions/min_length": 583.0, |
|
"completions/min_terminated_length": 583.0, |
|
"epoch": 0.11428571428571428, |
|
"epsilon_high_adjusted": 0.2, |
|
"epsilon_low_adjusted": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16691775619983673, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0456, |
|
"num_tokens": 11933212.0, |
|
"reward": 0.7074599266052246, |
|
"reward_std": 0.6797176003456116, |
|
"rewards/cosine_scaled_reward/mean": 0.03341745585203171, |
|
"rewards/cosine_scaled_reward/std": 0.4788829982280731, |
|
"rewards/format_reward/mean": 0.640625, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11428571428571428, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 0.042700088126584886, |
|
"train_runtime": 5405.613, |
|
"train_samples_per_second": 1.184, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 11933212, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|