|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.22857142857142856, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1734.0, |
|
"completions/mean_length": 1702.03125, |
|
"completions/mean_terminated_length": 993.6190795898438, |
|
"completions/min_length": 483.0, |
|
"completions/min_terminated_length": 483.0, |
|
"epoch": 0.001142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20052470266819, |
|
"learning_rate": 0.0, |
|
"loss": 0.0427, |
|
"num_tokens": 118418.0, |
|
"reward": 0.17899775505065918, |
|
"reward_std": 0.7650213241577148, |
|
"rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
|
"rewards/cosine_scaled_reward/std": 0.37953105568885803, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1894.0, |
|
"completions/mean_length": 1738.90625, |
|
"completions/mean_terminated_length": 949.0, |
|
"completions/min_length": 435.0, |
|
"completions/min_terminated_length": 435.0, |
|
"epoch": 0.002285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19504369795322418, |
|
"learning_rate": 5e-08, |
|
"loss": 0.0561, |
|
"num_tokens": 239748.0, |
|
"reward": 0.3848632574081421, |
|
"reward_std": 0.9111153483390808, |
|
"rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
|
"rewards/cosine_scaled_reward/std": 0.4492928683757782, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.90625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1896.0, |
|
"completions/mean_length": 1948.96875, |
|
"completions/mean_terminated_length": 991.6666870117188, |
|
"completions/min_length": 534.0, |
|
"completions/min_terminated_length": 534.0, |
|
"epoch": 0.0034285714285714284, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.23850594460964203, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0525, |
|
"num_tokens": 374954.0, |
|
"reward": -0.2894650101661682, |
|
"reward_std": 0.40320682525634766, |
|
"rewards/cosine_scaled_reward/mean": -0.1916075050830841, |
|
"rewards/cosine_scaled_reward/std": 0.17467568814754486, |
|
"rewards/format_reward/mean": 0.09375, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.53125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1653.0, |
|
"completions/mean_length": 1545.390625, |
|
"completions/mean_terminated_length": 975.7667236328125, |
|
"completions/min_length": 564.0, |
|
"completions/min_terminated_length": 564.0, |
|
"epoch": 0.004571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19908685982227325, |
|
"learning_rate": 1.5e-07, |
|
"loss": 0.0836, |
|
"num_tokens": 483667.0, |
|
"reward": 0.1905757486820221, |
|
"reward_std": 0.6709368824958801, |
|
"rewards/cosine_scaled_reward/mean": -0.16252461075782776, |
|
"rewards/cosine_scaled_reward/std": 0.27594515681266785, |
|
"rewards/format_reward/mean": 0.515625, |
|
"rewards/format_reward/std": 0.5037065148353577, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.90625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2019.0, |
|
"completions/mean_length": 1966.78125, |
|
"completions/mean_terminated_length": 1181.666748046875, |
|
"completions/min_length": 474.0, |
|
"completions/min_terminated_length": 474.0, |
|
"epoch": 0.005714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21755796670913696, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0519, |
|
"num_tokens": 620357.0, |
|
"reward": -0.402042031288147, |
|
"reward_std": 0.399784117937088, |
|
"rewards/cosine_scaled_reward/mean": -0.24789603054523468, |
|
"rewards/cosine_scaled_reward/std": 0.18156999349594116, |
|
"rewards/format_reward/mean": 0.09375, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1264.0, |
|
"completions/mean_length": 1897.390625, |
|
"completions/mean_terminated_length": 843.125, |
|
"completions/min_length": 628.0, |
|
"completions/min_terminated_length": 628.0, |
|
"epoch": 0.006857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2295181304216385, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0729, |
|
"num_tokens": 753438.0, |
|
"reward": -0.3786737024784088, |
|
"reward_std": 0.4345499277114868, |
|
"rewards/cosine_scaled_reward/mean": -0.2596493363380432, |
|
"rewards/cosine_scaled_reward/std": 0.1708926111459732, |
|
"rewards/format_reward/mean": 0.140625, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.859375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2000.0, |
|
"completions/mean_length": 1933.21875, |
|
"completions/mean_terminated_length": 1231.77783203125, |
|
"completions/min_length": 863.0, |
|
"completions/min_terminated_length": 863.0, |
|
"epoch": 0.008, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20217153429985046, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0254, |
|
"num_tokens": 887572.0, |
|
"reward": -0.13325583934783936, |
|
"reward_std": 0.5423575639724731, |
|
"rewards/cosine_scaled_reward/mean": -0.17600291967391968, |
|
"rewards/cosine_scaled_reward/std": 0.35686567425727844, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2024.0, |
|
"completions/mean_length": 1776.96875, |
|
"completions/mean_terminated_length": 1180.7000732421875, |
|
"completions/min_length": 342.0, |
|
"completions/min_terminated_length": 342.0, |
|
"epoch": 0.009142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19864660501480103, |
|
"learning_rate": 3.5e-07, |
|
"loss": -0.0092, |
|
"num_tokens": 1011714.0, |
|
"reward": 0.35212597250938416, |
|
"reward_std": 0.7144544720649719, |
|
"rewards/cosine_scaled_reward/mean": -0.003624534234404564, |
|
"rewards/cosine_scaled_reward/std": 0.515006422996521, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1941.0, |
|
"completions/mean_length": 1951.0625, |
|
"completions/mean_terminated_length": 1161.71435546875, |
|
"completions/min_length": 636.0, |
|
"completions/min_terminated_length": 636.0, |
|
"epoch": 0.010285714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20887432992458344, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0806, |
|
"num_tokens": 1148038.0, |
|
"reward": -0.3706062436103821, |
|
"reward_std": 0.4610140025615692, |
|
"rewards/cosine_scaled_reward/mean": -0.25561562180519104, |
|
"rewards/cosine_scaled_reward/std": 0.1772036999464035, |
|
"rewards/format_reward/mean": 0.140625, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1271.0, |
|
"completions/mean_length": 1669.9375, |
|
"completions/mean_terminated_length": 774.5263061523438, |
|
"completions/min_length": 303.0, |
|
"completions/min_terminated_length": 303.0, |
|
"epoch": 0.011428571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20181182026863098, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.043, |
|
"num_tokens": 1265746.0, |
|
"reward": 0.0919075608253479, |
|
"reward_std": 0.5226040482521057, |
|
"rewards/cosine_scaled_reward/mean": -0.10248372703790665, |
|
"rewards/cosine_scaled_reward/std": 0.37469154596328735, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 839.0, |
|
"completions/mean_length": 1948.453125, |
|
"completions/mean_terminated_length": 773.7999877929688, |
|
"completions/min_length": 659.0, |
|
"completions/min_terminated_length": 659.0, |
|
"epoch": 0.012571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21668891608715057, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0312, |
|
"num_tokens": 1402119.0, |
|
"reward": -0.4548088014125824, |
|
"reward_std": 0.35335251688957214, |
|
"rewards/cosine_scaled_reward/mean": -0.2664669156074524, |
|
"rewards/cosine_scaled_reward/std": 0.1670963168144226, |
|
"rewards/format_reward/mean": 0.078125, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.578125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1787.0, |
|
"completions/mean_length": 1666.046875, |
|
"completions/mean_terminated_length": 1142.629638671875, |
|
"completions/min_length": 157.0, |
|
"completions/min_terminated_length": 157.0, |
|
"epoch": 0.013714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22070375084877014, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0437, |
|
"num_tokens": 1519690.0, |
|
"reward": 0.07585961371660233, |
|
"reward_std": 0.7337090373039246, |
|
"rewards/cosine_scaled_reward/mean": -0.21207019686698914, |
|
"rewards/cosine_scaled_reward/std": 0.32506927847862244, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1837.0, |
|
"completions/mean_length": 1780.578125, |
|
"completions/mean_terminated_length": 1147.2105712890625, |
|
"completions/min_length": 780.0, |
|
"completions/min_terminated_length": 780.0, |
|
"epoch": 0.014857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21096666157245636, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0463, |
|
"num_tokens": 1644687.0, |
|
"reward": 0.10567126423120499, |
|
"reward_std": 0.7079647779464722, |
|
"rewards/cosine_scaled_reward/mean": -0.11122686415910721, |
|
"rewards/cosine_scaled_reward/std": 0.3569961190223694, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1880.0, |
|
"completions/mean_length": 1887.984375, |
|
"completions/mean_terminated_length": 1365.2667236328125, |
|
"completions/min_length": 824.0, |
|
"completions/min_terminated_length": 824.0, |
|
"epoch": 0.016, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21131716668605804, |
|
"learning_rate": 6.5e-07, |
|
"loss": 0.0144, |
|
"num_tokens": 1776126.0, |
|
"reward": -0.0225231796503067, |
|
"reward_std": 0.5179126262664795, |
|
"rewards/cosine_scaled_reward/mean": -0.14407408237457275, |
|
"rewards/cosine_scaled_reward/std": 0.33444011211395264, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1351.0, |
|
"completions/mean_length": 1718.78125, |
|
"completions/mean_terminated_length": 731.125, |
|
"completions/min_length": 420.0, |
|
"completions/min_terminated_length": 420.0, |
|
"epoch": 0.017142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1991148591041565, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0049, |
|
"num_tokens": 1897048.0, |
|
"reward": 0.19555333256721497, |
|
"reward_std": 0.40205830335617065, |
|
"rewards/cosine_scaled_reward/mean": -0.04284832626581192, |
|
"rewards/cosine_scaled_reward/std": 0.4670048952102661, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.96875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1697.0, |
|
"completions/mean_length": 2027.5, |
|
"completions/mean_terminated_length": 1392.0, |
|
"completions/min_length": 1087.0, |
|
"completions/min_terminated_length": 1087.0, |
|
"epoch": 0.018285714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22394295036792755, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.0187, |
|
"num_tokens": 2037248.0, |
|
"reward": -0.47975414991378784, |
|
"reward_std": 0.3722427487373352, |
|
"rewards/cosine_scaled_reward/mean": -0.2555020749568939, |
|
"rewards/cosine_scaled_reward/std": 0.17358116805553436, |
|
"rewards/format_reward/mean": 0.03125, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1789.0, |
|
"completions/mean_length": 1608.859375, |
|
"completions/mean_terminated_length": 826.0435180664062, |
|
"completions/min_length": 325.0, |
|
"completions/min_terminated_length": 325.0, |
|
"epoch": 0.019428571428571427, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20954757928848267, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0717, |
|
"num_tokens": 2150735.0, |
|
"reward": 0.09985511004924774, |
|
"reward_std": 0.7668930292129517, |
|
"rewards/cosine_scaled_reward/mean": -0.13757243752479553, |
|
"rewards/cosine_scaled_reward/std": 0.3857298791408539, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.78125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1656.0, |
|
"completions/mean_length": 1832.9375, |
|
"completions/mean_terminated_length": 1064.857177734375, |
|
"completions/min_length": 616.0, |
|
"completions/min_terminated_length": 616.0, |
|
"epoch": 0.02057142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19936956465244293, |
|
"learning_rate": 8.499999999999999e-07, |
|
"loss": 0.0415, |
|
"num_tokens": 2278419.0, |
|
"reward": -0.09606979787349701, |
|
"reward_std": 0.6028552055358887, |
|
"rewards/cosine_scaled_reward/mean": -0.1886598914861679, |
|
"rewards/cosine_scaled_reward/std": 0.2934761047363281, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1992.0, |
|
"completions/mean_length": 1797.421875, |
|
"completions/mean_terminated_length": 1157.0555419921875, |
|
"completions/min_length": 548.0, |
|
"completions/min_terminated_length": 548.0, |
|
"epoch": 0.021714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20787546038627625, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0691, |
|
"num_tokens": 2404710.0, |
|
"reward": 0.3256925344467163, |
|
"reward_std": 0.7026835680007935, |
|
"rewards/cosine_scaled_reward/mean": -0.02465374395251274, |
|
"rewards/cosine_scaled_reward/std": 0.48578760027885437, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.609375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1981.0, |
|
"completions/mean_length": 1595.921875, |
|
"completions/mean_terminated_length": 890.6799926757812, |
|
"completions/min_length": 357.0, |
|
"completions/min_terminated_length": 357.0, |
|
"epoch": 0.022857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19203181564807892, |
|
"learning_rate": 9.499999999999999e-07, |
|
"loss": 0.0843, |
|
"num_tokens": 2518201.0, |
|
"reward": 0.2115776240825653, |
|
"reward_std": 0.6924929618835449, |
|
"rewards/cosine_scaled_reward/mean": -0.09733618050813675, |
|
"rewards/cosine_scaled_reward/std": 0.4008020758628845, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1892.0, |
|
"completions/mean_length": 1669.71875, |
|
"completions/mean_terminated_length": 947.5454711914062, |
|
"completions/min_length": 333.0, |
|
"completions/min_terminated_length": 333.0, |
|
"epoch": 0.024, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19905951619148254, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0554, |
|
"num_tokens": 2635871.0, |
|
"reward": -0.04711771011352539, |
|
"reward_std": 0.6225218772888184, |
|
"rewards/cosine_scaled_reward/mean": -0.2032463699579239, |
|
"rewards/cosine_scaled_reward/std": 0.32066139578819275, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.46875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1776.0, |
|
"completions/mean_length": 1381.5625, |
|
"completions/mean_terminated_length": 793.5294189453125, |
|
"completions/min_length": 290.0, |
|
"completions/min_terminated_length": 290.0, |
|
"epoch": 0.025142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2047095149755478, |
|
"learning_rate": 9.99931462820376e-07, |
|
"loss": 0.0102, |
|
"num_tokens": 2733307.0, |
|
"reward": 0.5420082807540894, |
|
"reward_std": 0.5808548927307129, |
|
"rewards/cosine_scaled_reward/mean": -0.04149584099650383, |
|
"rewards/cosine_scaled_reward/std": 0.45060864090919495, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1883.0, |
|
"completions/mean_length": 1658.0, |
|
"completions/mean_terminated_length": 962.7826538085938, |
|
"completions/min_length": 405.0, |
|
"completions/min_terminated_length": 405.0, |
|
"epoch": 0.026285714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19252249598503113, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0556, |
|
"num_tokens": 2850211.0, |
|
"reward": -0.003935225307941437, |
|
"reward_std": 0.5448156595230103, |
|
"rewards/cosine_scaled_reward/mean": -0.21290510892868042, |
|
"rewards/cosine_scaled_reward/std": 0.3244985342025757, |
|
"rewards/format_reward/mean": 0.421875, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1979.0, |
|
"completions/mean_length": 1739.015625, |
|
"completions/mean_terminated_length": 1149.1363525390625, |
|
"completions/min_length": 512.0, |
|
"completions/min_terminated_length": 512.0, |
|
"epoch": 0.027428571428571427, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20268025994300842, |
|
"learning_rate": 9.993832906395582e-07, |
|
"loss": 0.0283, |
|
"num_tokens": 2972436.0, |
|
"reward": 0.023234538733959198, |
|
"reward_std": 0.5804120898246765, |
|
"rewards/cosine_scaled_reward/mean": -0.1836952269077301, |
|
"rewards/cosine_scaled_reward/std": 0.3640914857387543, |
|
"rewards/format_reward/mean": 0.390625, |
|
"rewards/format_reward/std": 0.4917473793029785, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1737.0, |
|
"completions/mean_length": 1718.3125, |
|
"completions/mean_terminated_length": 875.7777709960938, |
|
"completions/min_length": 484.0, |
|
"completions/min_terminated_length": 484.0, |
|
"epoch": 0.02857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21169544756412506, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0649, |
|
"num_tokens": 3092704.0, |
|
"reward": -0.048267342150211334, |
|
"reward_std": 0.6947153210639954, |
|
"rewards/cosine_scaled_reward/mean": -0.17257116734981537, |
|
"rewards/cosine_scaled_reward/std": 0.33179494738578796, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2039.0, |
|
"completions/mean_length": 1931.46875, |
|
"completions/mean_terminated_length": 1474.3077392578125, |
|
"completions/min_length": 860.0, |
|
"completions/min_terminated_length": 860.0, |
|
"epoch": 0.029714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21874327957630157, |
|
"learning_rate": 9.982876141412855e-07, |
|
"loss": 0.0248, |
|
"num_tokens": 3226950.0, |
|
"reward": 0.07520664483308792, |
|
"reward_std": 0.5721991658210754, |
|
"rewards/cosine_scaled_reward/mean": -0.09520917385816574, |
|
"rewards/cosine_scaled_reward/std": 0.355131059885025, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.859375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1709.0, |
|
"completions/mean_length": 1887.21875, |
|
"completions/mean_terminated_length": 904.6666870117188, |
|
"completions/min_length": 505.0, |
|
"completions/min_terminated_length": 505.0, |
|
"epoch": 0.030857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2260063886642456, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0282, |
|
"num_tokens": 3358020.0, |
|
"reward": -0.12340383231639862, |
|
"reward_std": 0.6229674220085144, |
|
"rewards/cosine_scaled_reward/mean": -0.1788894236087799, |
|
"rewards/cosine_scaled_reward/std": 0.27315112948417664, |
|
"rewards/format_reward/mean": 0.234375, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1976.0, |
|
"completions/mean_length": 1818.03125, |
|
"completions/mean_terminated_length": 1128.125, |
|
"completions/min_length": 441.0, |
|
"completions/min_terminated_length": 441.0, |
|
"epoch": 0.032, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2172878384590149, |
|
"learning_rate": 9.96645768238595e-07, |
|
"loss": 0.0203, |
|
"num_tokens": 3484710.0, |
|
"reward": -0.06130418926477432, |
|
"reward_std": 0.6516651511192322, |
|
"rewards/cosine_scaled_reward/mean": -0.17908960580825806, |
|
"rewards/cosine_scaled_reward/std": 0.3907976746559143, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.953125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1187.0, |
|
"completions/mean_length": 1990.765625, |
|
"completions/mean_terminated_length": 827.0, |
|
"completions/min_length": 625.0, |
|
"completions/min_terminated_length": 625.0, |
|
"epoch": 0.03314285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21073698997497559, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0469, |
|
"num_tokens": 3622591.0, |
|
"reward": -0.33952879905700684, |
|
"reward_std": 0.447256475687027, |
|
"rewards/cosine_scaled_reward/mean": -0.20882689952850342, |
|
"rewards/cosine_scaled_reward/std": 0.20297211408615112, |
|
"rewards/format_reward/mean": 0.078125, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2034.0, |
|
"completions/mean_length": 1843.828125, |
|
"completions/mean_terminated_length": 1231.3125, |
|
"completions/min_length": 767.0, |
|
"completions/min_terminated_length": 767.0, |
|
"epoch": 0.03428571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21709226071834564, |
|
"learning_rate": 9.944597532678119e-07, |
|
"loss": 0.0171, |
|
"num_tokens": 3751132.0, |
|
"reward": -0.024381320923566818, |
|
"reward_std": 0.6315211057662964, |
|
"rewards/cosine_scaled_reward/mean": -0.16062816977500916, |
|
"rewards/cosine_scaled_reward/std": 0.2835782468318939, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.859375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1040.0, |
|
"completions/mean_length": 1853.625, |
|
"completions/mean_terminated_length": 665.7777709960938, |
|
"completions/min_length": 496.0, |
|
"completions/min_terminated_length": 496.0, |
|
"epoch": 0.03542857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20489497482776642, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0071, |
|
"num_tokens": 3880260.0, |
|
"reward": -0.22396349906921387, |
|
"reward_std": 0.6550674438476562, |
|
"rewards/cosine_scaled_reward/mean": -0.19791924953460693, |
|
"rewards/cosine_scaled_reward/std": 0.3350917100906372, |
|
"rewards/format_reward/mean": 0.171875, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1997.0, |
|
"completions/mean_length": 1902.109375, |
|
"completions/mean_terminated_length": 1269.916748046875, |
|
"completions/min_length": 772.0, |
|
"completions/min_terminated_length": 772.0, |
|
"epoch": 0.036571428571428574, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20957782864570618, |
|
"learning_rate": 9.917322325514487e-07, |
|
"loss": 0.0611, |
|
"num_tokens": 4012347.0, |
|
"reward": -0.22782376408576965, |
|
"reward_std": 0.6326622366905212, |
|
"rewards/cosine_scaled_reward/mean": -0.22328688204288483, |
|
"rewards/cosine_scaled_reward/std": 0.3028508722782135, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1698.0, |
|
"completions/mean_length": 1945.34375, |
|
"completions/mean_terminated_length": 1226.75, |
|
"completions/min_length": 887.0, |
|
"completions/min_terminated_length": 887.0, |
|
"epoch": 0.037714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22317089140415192, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0347, |
|
"num_tokens": 4148065.0, |
|
"reward": -0.47040778398513794, |
|
"reward_std": 0.4409722089767456, |
|
"rewards/cosine_scaled_reward/mean": -0.30551639199256897, |
|
"rewards/cosine_scaled_reward/std": 0.22323259711265564, |
|
"rewards/format_reward/mean": 0.140625, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.515625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1826.0, |
|
"completions/mean_length": 1541.515625, |
|
"completions/mean_terminated_length": 1002.3547973632812, |
|
"completions/min_length": 475.0, |
|
"completions/min_terminated_length": 475.0, |
|
"epoch": 0.038857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2360963523387909, |
|
"learning_rate": 9.88466529153356e-07, |
|
"loss": 0.0712, |
|
"num_tokens": 4256274.0, |
|
"reward": 0.5805569291114807, |
|
"reward_std": 0.8525061011314392, |
|
"rewards/cosine_scaled_reward/mean": 0.04027845710515976, |
|
"rewards/cosine_scaled_reward/std": 0.49936607480049133, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1861.0, |
|
"completions/mean_length": 1808.921875, |
|
"completions/mean_terminated_length": 871.0000610351562, |
|
"completions/min_length": 466.0, |
|
"completions/min_terminated_length": 466.0, |
|
"epoch": 0.04, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1972445547580719, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0577, |
|
"num_tokens": 4383541.0, |
|
"reward": 0.00036025047302246094, |
|
"reward_std": 0.8111597895622253, |
|
"rewards/cosine_scaled_reward/mean": -0.10919487476348877, |
|
"rewards/cosine_scaled_reward/std": 0.44675883650779724, |
|
"rewards/format_reward/mean": 0.21875, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1979.0, |
|
"completions/mean_length": 1990.765625, |
|
"completions/mean_terminated_length": 1315.4000244140625, |
|
"completions/min_length": 937.0, |
|
"completions/min_terminated_length": 937.0, |
|
"epoch": 0.04114285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2348623126745224, |
|
"learning_rate": 9.846666218300807e-07, |
|
"loss": 0.0216, |
|
"num_tokens": 4522062.0, |
|
"reward": -0.4222595691680908, |
|
"reward_std": 0.4755689203739166, |
|
"rewards/cosine_scaled_reward/mean": -0.2501922845840454, |
|
"rewards/cosine_scaled_reward/std": 0.2129606157541275, |
|
"rewards/format_reward/mean": 0.078125, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1928.0, |
|
"completions/mean_length": 1911.296875, |
|
"completions/mean_terminated_length": 1173.0999755859375, |
|
"completions/min_length": 629.0, |
|
"completions/min_terminated_length": 629.0, |
|
"epoch": 0.04228571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22154958546161652, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0467, |
|
"num_tokens": 4655409.0, |
|
"reward": -0.2846450209617615, |
|
"reward_std": 0.4525028467178345, |
|
"rewards/cosine_scaled_reward/mean": -0.23607251048088074, |
|
"rewards/cosine_scaled_reward/std": 0.19240929186344147, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.84375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1530.0, |
|
"completions/mean_length": 1906.65625, |
|
"completions/mean_terminated_length": 1143.4000244140625, |
|
"completions/min_length": 530.0, |
|
"completions/min_terminated_length": 530.0, |
|
"epoch": 0.04342857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2259596437215805, |
|
"learning_rate": 9.80337140183366e-07, |
|
"loss": 0.0219, |
|
"num_tokens": 4789147.0, |
|
"reward": -0.14314083755016327, |
|
"reward_std": 0.4587753117084503, |
|
"rewards/cosine_scaled_reward/mean": -0.14969542622566223, |
|
"rewards/cosine_scaled_reward/std": 0.30969110131263733, |
|
"rewards/format_reward/mean": 0.15625, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2042.0, |
|
"completions/mean_length": 1729.984375, |
|
"completions/mean_terminated_length": 691.1333618164062, |
|
"completions/min_length": 312.0, |
|
"completions/min_terminated_length": 312.0, |
|
"epoch": 0.044571428571428574, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1975395530462265, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0518, |
|
"num_tokens": 4910650.0, |
|
"reward": 0.20782151818275452, |
|
"reward_std": 0.5801891088485718, |
|
"rewards/cosine_scaled_reward/mean": -0.08358924090862274, |
|
"rewards/cosine_scaled_reward/std": 0.3715744912624359, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1936.0, |
|
"completions/mean_length": 1565.40625, |
|
"completions/mean_terminated_length": 982.9655151367188, |
|
"completions/min_length": 393.0, |
|
"completions/min_terminated_length": 393.0, |
|
"epoch": 0.045714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19556699693202972, |
|
"learning_rate": 9.754833590196926e-07, |
|
"loss": 0.0176, |
|
"num_tokens": 5020908.0, |
|
"reward": 0.21666434407234192, |
|
"reward_std": 0.47607892751693726, |
|
"rewards/cosine_scaled_reward/mean": -0.12604281306266785, |
|
"rewards/cosine_scaled_reward/std": 0.4459211230278015, |
|
"rewards/format_reward/mean": 0.46875, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1971.0, |
|
"completions/mean_length": 1847.96875, |
|
"completions/mean_terminated_length": 1247.875, |
|
"completions/min_length": 799.0, |
|
"completions/min_terminated_length": 799.0, |
|
"epoch": 0.046857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19488316774368286, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0491, |
|
"num_tokens": 5150330.0, |
|
"reward": -0.15268605947494507, |
|
"reward_std": 0.6881446838378906, |
|
"rewards/cosine_scaled_reward/mean": -0.22478052973747253, |
|
"rewards/cosine_scaled_reward/std": 0.3324533700942993, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2018.0, |
|
"completions/mean_length": 1661.296875, |
|
"completions/mean_terminated_length": 673.0555419921875, |
|
"completions/min_length": 134.0, |
|
"completions/min_terminated_length": 134.0, |
|
"epoch": 0.048, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21376171708106995, |
|
"learning_rate": 9.701111919237408e-07, |
|
"loss": 0.0433, |
|
"num_tokens": 5267013.0, |
|
"reward": -0.20060807466506958, |
|
"reward_std": 0.34422361850738525, |
|
"rewards/cosine_scaled_reward/mean": -0.24874155223369598, |
|
"rewards/cosine_scaled_reward/std": 0.17742608487606049, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.78125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1802.484375, |
|
"completions/mean_terminated_length": 925.6428833007812, |
|
"completions/min_length": 580.0, |
|
"completions/min_terminated_length": 580.0, |
|
"epoch": 0.04914285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20949861407279968, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0468, |
|
"num_tokens": 5393988.0, |
|
"reward": 0.1097467839717865, |
|
"reward_std": 0.4439903795719147, |
|
"rewards/cosine_scaled_reward/mean": -0.07012660801410675, |
|
"rewards/cosine_scaled_reward/std": 0.35852304100990295, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1497.0, |
|
"completions/mean_length": 1639.375, |
|
"completions/mean_terminated_length": 740.4000244140625, |
|
"completions/min_length": 186.0, |
|
"completions/min_terminated_length": 186.0, |
|
"epoch": 0.05028571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20765061676502228, |
|
"learning_rate": 9.64227184053598e-07, |
|
"loss": 0.0677, |
|
"num_tokens": 5509604.0, |
|
"reward": 0.1744289994239807, |
|
"reward_std": 0.7545564770698547, |
|
"rewards/cosine_scaled_reward/mean": -0.09247300028800964, |
|
"rewards/cosine_scaled_reward/std": 0.486594021320343, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1784.0, |
|
"completions/mean_length": 2015.1875, |
|
"completions/mean_terminated_length": 1628.0, |
|
"completions/min_length": 1485.0, |
|
"completions/min_terminated_length": 1485.0, |
|
"epoch": 0.05142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.22293689846992493, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0141, |
|
"num_tokens": 5650232.0, |
|
"reward": -0.28319618105888367, |
|
"reward_std": 0.44461578130722046, |
|
"rewards/cosine_scaled_reward/mean": -0.19628559052944183, |
|
"rewards/cosine_scaled_reward/std": 0.2942677140235901, |
|
"rewards/format_reward/mean": 0.109375, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1232.0, |
|
"completions/mean_length": 1769.8125, |
|
"completions/mean_terminated_length": 861.0667114257812, |
|
"completions/min_length": 538.0, |
|
"completions/min_terminated_length": 538.0, |
|
"epoch": 0.052571428571428575, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21012793481349945, |
|
"learning_rate": 9.578385041664925e-07, |
|
"loss": 0.0845, |
|
"num_tokens": 5774668.0, |
|
"reward": -0.19958055019378662, |
|
"reward_std": 0.37389740347862244, |
|
"rewards/cosine_scaled_reward/mean": -0.2247902750968933, |
|
"rewards/cosine_scaled_reward/std": 0.18379005789756775, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1969.0, |
|
"completions/mean_length": 1761.734375, |
|
"completions/mean_terminated_length": 1131.9500732421875, |
|
"completions/min_length": 370.0, |
|
"completions/min_terminated_length": 370.0, |
|
"epoch": 0.053714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2044854760169983, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0366, |
|
"num_tokens": 5897819.0, |
|
"reward": -0.11128583550453186, |
|
"reward_std": 0.7243642210960388, |
|
"rewards/cosine_scaled_reward/mean": -0.22751793265342712, |
|
"rewards/cosine_scaled_reward/std": 0.341621071100235, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2015.0, |
|
"completions/mean_length": 1720.890625, |
|
"completions/mean_terminated_length": 1051.09521484375, |
|
"completions/min_length": 430.0, |
|
"completions/min_terminated_length": 430.0, |
|
"epoch": 0.054857142857142854, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19758965075016022, |
|
"learning_rate": 9.509529358847654e-07, |
|
"loss": 0.046, |
|
"num_tokens": 6018500.0, |
|
"reward": 0.026797622442245483, |
|
"reward_std": 0.5594782829284668, |
|
"rewards/cosine_scaled_reward/mean": -0.16628868877887726, |
|
"rewards/cosine_scaled_reward/std": 0.29110410809516907, |
|
"rewards/format_reward/mean": 0.359375, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1723.0, |
|
"completions/mean_length": 1488.71875, |
|
"completions/mean_terminated_length": 813.72412109375, |
|
"completions/min_length": 402.0, |
|
"completions/min_terminated_length": 402.0, |
|
"epoch": 0.056, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1846495270729065, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0203, |
|
"num_tokens": 6123842.0, |
|
"reward": 0.3029339909553528, |
|
"reward_std": 0.6658899188041687, |
|
"rewards/cosine_scaled_reward/mean": -0.09853300452232361, |
|
"rewards/cosine_scaled_reward/std": 0.4083656370639801, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1959.0, |
|
"completions/mean_length": 1733.59375, |
|
"completions/mean_terminated_length": 790.375, |
|
"completions/min_length": 305.0, |
|
"completions/min_terminated_length": 305.0, |
|
"epoch": 0.05714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19527027010917664, |
|
"learning_rate": 9.43578868212728e-07, |
|
"loss": 0.0136, |
|
"num_tokens": 6245608.0, |
|
"reward": 0.15902790427207947, |
|
"reward_std": 0.46005839109420776, |
|
"rewards/cosine_scaled_reward/mean": -0.06892354786396027, |
|
"rewards/cosine_scaled_reward/std": 0.4567166864871979, |
|
"rewards/format_reward/mean": 0.296875, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.515625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1916.0, |
|
"completions/mean_length": 1432.421875, |
|
"completions/mean_terminated_length": 777.1290283203125, |
|
"completions/min_length": 401.0, |
|
"completions/min_terminated_length": 401.0, |
|
"epoch": 0.05828571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21701110899448395, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0239, |
|
"num_tokens": 6347491.0, |
|
"reward": 0.2233203500509262, |
|
"reward_std": 0.6041151285171509, |
|
"rewards/cosine_scaled_reward/mean": -0.1383398175239563, |
|
"rewards/cosine_scaled_reward/std": 0.3747152090072632, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1448.0, |
|
"completions/mean_length": 1720.046875, |
|
"completions/mean_terminated_length": 736.1875, |
|
"completions/min_length": 301.0, |
|
"completions/min_terminated_length": 301.0, |
|
"epoch": 0.05942857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19577208161354065, |
|
"learning_rate": 9.357252853159505e-07, |
|
"loss": 0.0066, |
|
"num_tokens": 6468926.0, |
|
"reward": -0.1786521077156067, |
|
"reward_std": 0.3358575701713562, |
|
"rewards/cosine_scaled_reward/mean": -0.21432605385780334, |
|
"rewards/cosine_scaled_reward/std": 0.3689535856246948, |
|
"rewards/format_reward/mean": 0.25, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1931.0, |
|
"completions/mean_length": 1718.9375, |
|
"completions/mean_terminated_length": 878.0, |
|
"completions/min_length": 468.0, |
|
"completions/min_terminated_length": 468.0, |
|
"epoch": 0.060571428571428575, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21421696245670319, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0423, |
|
"num_tokens": 6589770.0, |
|
"reward": -0.03741084039211273, |
|
"reward_std": 0.7027454376220703, |
|
"rewards/cosine_scaled_reward/mean": -0.17495542764663696, |
|
"rewards/cosine_scaled_reward/std": 0.29642969369888306, |
|
"rewards/format_reward/mean": 0.3125, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2031.0, |
|
"completions/mean_length": 1664.625, |
|
"completions/mean_terminated_length": 1171.71435546875, |
|
"completions/min_length": 518.0, |
|
"completions/min_terminated_length": 518.0, |
|
"epoch": 0.061714285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19009248912334442, |
|
"learning_rate": 9.274017555754407e-07, |
|
"loss": 0.0958, |
|
"num_tokens": 6707450.0, |
|
"reward": 0.2984742522239685, |
|
"reward_std": 1.0811007022857666, |
|
"rewards/cosine_scaled_reward/mean": -0.08513787388801575, |
|
"rewards/cosine_scaled_reward/std": 0.455229252576828, |
|
"rewards/format_reward/mean": 0.46875, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.765625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1352.0, |
|
"completions/mean_length": 1757.359375, |
|
"completions/mean_terminated_length": 807.933349609375, |
|
"completions/min_length": 517.0, |
|
"completions/min_terminated_length": 517.0, |
|
"epoch": 0.06285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1981392800807953, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0294, |
|
"num_tokens": 6830209.0, |
|
"reward": 0.0005421042442321777, |
|
"reward_std": 0.512083888053894, |
|
"rewards/cosine_scaled_reward/mean": -0.1403539478778839, |
|
"rewards/cosine_scaled_reward/std": 0.37260064482688904, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.65625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1955.0, |
|
"completions/mean_length": 1717.890625, |
|
"completions/mean_terminated_length": 1087.681884765625, |
|
"completions/min_length": 494.0, |
|
"completions/min_terminated_length": 494.0, |
|
"epoch": 0.064, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21031354367733002, |
|
"learning_rate": 9.186184199300463e-07, |
|
"loss": 0.0425, |
|
"num_tokens": 6951114.0, |
|
"reward": 0.25747445225715637, |
|
"reward_std": 0.5027350187301636, |
|
"rewards/cosine_scaled_reward/mean": -0.08220025897026062, |
|
"rewards/cosine_scaled_reward/std": 0.4609789550304413, |
|
"rewards/format_reward/mean": 0.421875, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.828125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1899.0, |
|
"completions/mean_length": 1946.203125, |
|
"completions/mean_terminated_length": 1455.727294921875, |
|
"completions/min_length": 844.0, |
|
"completions/min_terminated_length": 844.0, |
|
"epoch": 0.06514285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1840263158082962, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0406, |
|
"num_tokens": 7087239.0, |
|
"reward": -0.31278592348098755, |
|
"reward_std": 0.5103937387466431, |
|
"rewards/cosine_scaled_reward/mean": -0.2501429617404938, |
|
"rewards/cosine_scaled_reward/std": 0.23870430886745453, |
|
"rewards/format_reward/mean": 0.1875, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.515625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2034.0, |
|
"completions/mean_length": 1514.5625, |
|
"completions/mean_terminated_length": 946.7096557617188, |
|
"completions/min_length": 411.0, |
|
"completions/min_terminated_length": 411.0, |
|
"epoch": 0.06628571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18003003299236298, |
|
"learning_rate": 9.093859795212817e-07, |
|
"loss": 0.0669, |
|
"num_tokens": 7194267.0, |
|
"reward": 0.3626611530780792, |
|
"reward_std": 0.6513576507568359, |
|
"rewards/cosine_scaled_reward/mean": -0.09991942346096039, |
|
"rewards/cosine_scaled_reward/std": 0.42993852496147156, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2017.0, |
|
"completions/mean_length": 1704.8125, |
|
"completions/mean_terminated_length": 1132.8333740234375, |
|
"completions/min_length": 524.0, |
|
"completions/min_terminated_length": 524.0, |
|
"epoch": 0.06742857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17114725708961487, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.0061, |
|
"num_tokens": 7313839.0, |
|
"reward": 0.15319865942001343, |
|
"reward_std": 0.6165874004364014, |
|
"rewards/cosine_scaled_reward/mean": -0.11871317774057388, |
|
"rewards/cosine_scaled_reward/std": 0.3659735918045044, |
|
"rewards/format_reward/mean": 0.390625, |
|
"rewards/format_reward/std": 0.4917473793029785, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1896.0, |
|
"completions/mean_length": 1767.53125, |
|
"completions/mean_terminated_length": 1050.77783203125, |
|
"completions/min_length": 459.0, |
|
"completions/min_terminated_length": 459.0, |
|
"epoch": 0.06857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1782463639974594, |
|
"learning_rate": 8.997156826556369e-07, |
|
"loss": 0.0527, |
|
"num_tokens": 7437849.0, |
|
"reward": -0.09879650175571442, |
|
"reward_std": 0.6538424491882324, |
|
"rewards/cosine_scaled_reward/mean": -0.2212732434272766, |
|
"rewards/cosine_scaled_reward/std": 0.3128809630870819, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1989.0, |
|
"completions/mean_length": 1799.53125, |
|
"completions/mean_terminated_length": 1054.125, |
|
"completions/min_length": 420.0, |
|
"completions/min_terminated_length": 420.0, |
|
"epoch": 0.06971428571428571, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19245384633541107, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0533, |
|
"num_tokens": 7564539.0, |
|
"reward": 0.1226256862282753, |
|
"reward_std": 0.7401602268218994, |
|
"rewards/cosine_scaled_reward/mean": -0.11056216061115265, |
|
"rewards/cosine_scaled_reward/std": 0.314616322517395, |
|
"rewards/format_reward/mean": 0.34375, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.453125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2029.0, |
|
"completions/mean_length": 1458.1875, |
|
"completions/mean_terminated_length": 969.4857177734375, |
|
"completions/min_length": 364.0, |
|
"completions/min_terminated_length": 364.0, |
|
"epoch": 0.07085714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17495828866958618, |
|
"learning_rate": 8.896193111002475e-07, |
|
"loss": 0.0785, |
|
"num_tokens": 7668095.0, |
|
"reward": 0.6185990571975708, |
|
"reward_std": 0.6951406598091125, |
|
"rewards/cosine_scaled_reward/mean": 0.020237013697624207, |
|
"rewards/cosine_scaled_reward/std": 0.42793402075767517, |
|
"rewards/format_reward/mean": 0.578125, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1880.0, |
|
"completions/mean_length": 1369.65625, |
|
"completions/mean_terminated_length": 962.6500244140625, |
|
"completions/min_length": 384.0, |
|
"completions/min_terminated_length": 384.0, |
|
"epoch": 0.072, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17925356328487396, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.0802, |
|
"num_tokens": 7766009.0, |
|
"reward": 0.588592529296875, |
|
"reward_std": 0.7614073753356934, |
|
"rewards/cosine_scaled_reward/mean": -0.0260162390768528, |
|
"rewards/cosine_scaled_reward/std": 0.47686251997947693, |
|
"rewards/format_reward/mean": 0.640625, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1849.0, |
|
"completions/mean_length": 1493.0625, |
|
"completions/mean_terminated_length": 1061.4444580078125, |
|
"completions/min_length": 421.0, |
|
"completions/min_terminated_length": 421.0, |
|
"epoch": 0.07314285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.197045236825943, |
|
"learning_rate": 8.791091657286267e-07, |
|
"loss": 0.1112, |
|
"num_tokens": 7872517.0, |
|
"reward": 0.4587404727935791, |
|
"reward_std": 0.7483726739883423, |
|
"rewards/cosine_scaled_reward/mean": -0.08312976360321045, |
|
"rewards/cosine_scaled_reward/std": 0.3704431354999542, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1805.0, |
|
"completions/mean_length": 1561.09375, |
|
"completions/mean_terminated_length": 749.5833740234375, |
|
"completions/min_length": 276.0, |
|
"completions/min_terminated_length": 276.0, |
|
"epoch": 0.07428571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17185057699680328, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.0652, |
|
"num_tokens": 7983131.0, |
|
"reward": -0.022998124361038208, |
|
"reward_std": 0.5443873405456543, |
|
"rewards/cosine_scaled_reward/mean": -0.2146240472793579, |
|
"rewards/cosine_scaled_reward/std": 0.39696088433265686, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1822.0, |
|
"completions/mean_length": 1160.96875, |
|
"completions/mean_terminated_length": 757.7727661132812, |
|
"completions/min_length": 245.0, |
|
"completions/min_terminated_length": 245.0, |
|
"epoch": 0.07542857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15682660043239594, |
|
"learning_rate": 8.681980515339463e-07, |
|
"loss": 0.0317, |
|
"num_tokens": 8067665.0, |
|
"reward": 0.7723344564437866, |
|
"reward_std": 0.5304180979728699, |
|
"rewards/cosine_scaled_reward/mean": 0.03460472822189331, |
|
"rewards/cosine_scaled_reward/std": 0.47199109196662903, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.71875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1529.0, |
|
"completions/mean_length": 1760.03125, |
|
"completions/mean_terminated_length": 1024.111083984375, |
|
"completions/min_length": 494.0, |
|
"completions/min_terminated_length": 494.0, |
|
"epoch": 0.07657142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18002018332481384, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0536, |
|
"num_tokens": 8191043.0, |
|
"reward": -0.27919694781303406, |
|
"reward_std": 0.3664131164550781, |
|
"rewards/cosine_scaled_reward/mean": -0.2724109888076782, |
|
"rewards/cosine_scaled_reward/std": 0.16395430266857147, |
|
"rewards/format_reward/mean": 0.265625, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1307.0, |
|
"completions/mean_length": 917.34375, |
|
"completions/mean_terminated_length": 600.760009765625, |
|
"completions/min_length": 295.0, |
|
"completions/min_terminated_length": 295.0, |
|
"epoch": 0.07771428571428571, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13612917065620422, |
|
"learning_rate": 8.568992620281243e-07, |
|
"loss": 0.0077, |
|
"num_tokens": 8259009.0, |
|
"reward": 0.6957368850708008, |
|
"reward_std": 0.5402743816375732, |
|
"rewards/cosine_scaled_reward/mean": -0.04275655001401901, |
|
"rewards/cosine_scaled_reward/std": 0.434044748544693, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1526.0, |
|
"completions/mean_length": 1233.78125, |
|
"completions/mean_terminated_length": 863.6818237304688, |
|
"completions/min_length": 343.0, |
|
"completions/min_terminated_length": 343.0, |
|
"epoch": 0.07885714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19043830037117004, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0558, |
|
"num_tokens": 8348315.0, |
|
"reward": 0.21049074828624725, |
|
"reward_std": 0.5405222177505493, |
|
"rewards/cosine_scaled_reward/mean": -0.24631711840629578, |
|
"rewards/cosine_scaled_reward/std": 0.2778205871582031, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2019.0, |
|
"completions/mean_length": 1808.9375, |
|
"completions/mean_terminated_length": 871.0769653320312, |
|
"completions/min_length": 513.0, |
|
"completions/min_terminated_length": 513.0, |
|
"epoch": 0.08, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19704341888427734, |
|
"learning_rate": 8.452265630457282e-07, |
|
"loss": 0.0391, |
|
"num_tokens": 8475543.0, |
|
"reward": -0.18982277810573578, |
|
"reward_std": 0.5247766971588135, |
|
"rewards/cosine_scaled_reward/mean": -0.2355363965034485, |
|
"rewards/cosine_scaled_reward/std": 0.3067134916782379, |
|
"rewards/format_reward/mean": 0.28125, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.609375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1800.0, |
|
"completions/mean_length": 1563.21875, |
|
"completions/mean_terminated_length": 806.9599609375, |
|
"completions/min_length": 315.0, |
|
"completions/min_terminated_length": 315.0, |
|
"epoch": 0.08114285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18498718738555908, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.016, |
|
"num_tokens": 8586309.0, |
|
"reward": 0.19864726066589355, |
|
"reward_std": 0.576451301574707, |
|
"rewards/cosine_scaled_reward/mean": -0.10380134731531143, |
|
"rewards/cosine_scaled_reward/std": 0.476872056722641, |
|
"rewards/format_reward/mean": 0.40625, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2048.0, |
|
"completions/mean_length": 1406.0625, |
|
"completions/mean_terminated_length": 906.7777709960938, |
|
"completions/min_length": 353.0, |
|
"completions/min_terminated_length": 353.0, |
|
"epoch": 0.08228571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17368191480636597, |
|
"learning_rate": 8.331941759724268e-07, |
|
"loss": 0.0237, |
|
"num_tokens": 8686649.0, |
|
"reward": 0.22483232617378235, |
|
"reward_std": 0.45926159620285034, |
|
"rewards/cosine_scaled_reward/mean": -0.20789632201194763, |
|
"rewards/cosine_scaled_reward/std": 0.294547975063324, |
|
"rewards/format_reward/mean": 0.640625, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1930.0, |
|
"completions/mean_length": 1912.875, |
|
"completions/mean_terminated_length": 1327.3333740234375, |
|
"completions/min_length": 878.0, |
|
"completions/min_terminated_length": 878.0, |
|
"epoch": 0.08342857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20081810653209686, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0223, |
|
"num_tokens": 8819801.0, |
|
"reward": -0.18328779935836792, |
|
"reward_std": 0.5305245518684387, |
|
"rewards/cosine_scaled_reward/mean": -0.20883139967918396, |
|
"rewards/cosine_scaled_reward/std": 0.2695733904838562, |
|
"rewards/format_reward/mean": 0.234375, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2012.0, |
|
"completions/mean_length": 1517.875, |
|
"completions/mean_terminated_length": 987.75, |
|
"completions/min_length": 560.0, |
|
"completions/min_terminated_length": 560.0, |
|
"epoch": 0.08457142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1813385784626007, |
|
"learning_rate": 8.208167604184217e-07, |
|
"loss": 0.085, |
|
"num_tokens": 8926873.0, |
|
"reward": 0.46356096863746643, |
|
"reward_std": 0.6926693916320801, |
|
"rewards/cosine_scaled_reward/mean": -0.018219511955976486, |
|
"rewards/cosine_scaled_reward/std": 0.47079169750213623, |
|
"rewards/format_reward/mean": 0.5, |
|
"rewards/format_reward/std": 0.5039526224136353, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.46875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1907.0, |
|
"completions/mean_length": 1515.734375, |
|
"completions/mean_terminated_length": 1046.0882568359375, |
|
"completions/min_length": 374.0, |
|
"completions/min_terminated_length": 374.0, |
|
"epoch": 0.08571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18714174628257751, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0989, |
|
"num_tokens": 9034840.0, |
|
"reward": 0.5457433462142944, |
|
"reward_std": 0.6619582176208496, |
|
"rewards/cosine_scaled_reward/mean": -0.00837831199169159, |
|
"rewards/cosine_scaled_reward/std": 0.5059990882873535, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1222.0, |
|
"completions/mean_length": 1340.484375, |
|
"completions/mean_terminated_length": 790.1944580078125, |
|
"completions/min_length": 407.0, |
|
"completions/min_terminated_length": 407.0, |
|
"epoch": 0.08685714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17108581960201263, |
|
"learning_rate": 8.081093963579707e-07, |
|
"loss": 0.0209, |
|
"num_tokens": 9131031.0, |
|
"reward": 0.19882698357105255, |
|
"reward_std": 0.5817238092422485, |
|
"rewards/cosine_scaled_reward/mean": -0.18964898586273193, |
|
"rewards/cosine_scaled_reward/std": 0.3000561594963074, |
|
"rewards/format_reward/mean": 0.578125, |
|
"rewards/format_reward/std": 0.49776285886764526, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.46875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1755.0, |
|
"completions/mean_length": 1518.765625, |
|
"completions/mean_terminated_length": 1051.7940673828125, |
|
"completions/min_length": 641.0, |
|
"completions/min_terminated_length": 641.0, |
|
"epoch": 0.088, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1759587675333023, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0869, |
|
"num_tokens": 9239808.0, |
|
"reward": 0.2113216668367386, |
|
"reward_std": 0.5600536465644836, |
|
"rewards/cosine_scaled_reward/mean": -0.1599641740322113, |
|
"rewards/cosine_scaled_reward/std": 0.33541423082351685, |
|
"rewards/format_reward/mean": 0.53125, |
|
"rewards/format_reward/std": 0.5029674172401428, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1659.0, |
|
"completions/mean_length": 1656.15625, |
|
"completions/mean_terminated_length": 957.6522216796875, |
|
"completions/min_length": 530.0, |
|
"completions/min_terminated_length": 530.0, |
|
"epoch": 0.08914285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17662394046783447, |
|
"learning_rate": 7.950875657567621e-07, |
|
"loss": 0.0177, |
|
"num_tokens": 9356522.0, |
|
"reward": 0.25513648986816406, |
|
"reward_std": 0.5462654829025269, |
|
"rewards/cosine_scaled_reward/mean": -0.05993174761533737, |
|
"rewards/cosine_scaled_reward/std": 0.4486319124698639, |
|
"rewards/format_reward/mean": 0.375, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1957.0, |
|
"completions/mean_length": 1289.359375, |
|
"completions/mean_terminated_length": 834.1749877929688, |
|
"completions/min_length": 229.0, |
|
"completions/min_terminated_length": 229.0, |
|
"epoch": 0.09028571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15610884130001068, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.026, |
|
"num_tokens": 9449137.0, |
|
"reward": 0.4372347593307495, |
|
"reward_std": 0.5517712831497192, |
|
"rewards/cosine_scaled_reward/mean": -0.10950762033462524, |
|
"rewards/cosine_scaled_reward/std": 0.3864338994026184, |
|
"rewards/format_reward/mean": 0.65625, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.546875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1902.0, |
|
"completions/mean_length": 1623.5, |
|
"completions/mean_terminated_length": 1111.17236328125, |
|
"completions/min_length": 538.0, |
|
"completions/min_terminated_length": 538.0, |
|
"epoch": 0.09142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18003858625888824, |
|
"learning_rate": 7.817671337095244e-07, |
|
"loss": 0.017, |
|
"num_tokens": 9563433.0, |
|
"reward": 0.11363417655229568, |
|
"reward_std": 0.5530154705047607, |
|
"rewards/cosine_scaled_reward/mean": -0.16974541544914246, |
|
"rewards/cosine_scaled_reward/std": 0.3006208539009094, |
|
"rewards/format_reward/mean": 0.453125, |
|
"rewards/format_reward/std": 0.501733124256134, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.40625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1987.0, |
|
"completions/mean_length": 1432.125, |
|
"completions/mean_terminated_length": 1010.7368774414062, |
|
"completions/min_length": 287.0, |
|
"completions/min_terminated_length": 287.0, |
|
"epoch": 0.09257142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2004833072423935, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0603, |
|
"num_tokens": 9666361.0, |
|
"reward": 0.512394905090332, |
|
"reward_std": 0.7596394419670105, |
|
"rewards/cosine_scaled_reward/mean": -0.05630255863070488, |
|
"rewards/cosine_scaled_reward/std": 0.43662360310554504, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1721.0, |
|
"completions/mean_length": 1341.03125, |
|
"completions/mean_terminated_length": 970.7142944335938, |
|
"completions/min_length": 431.0, |
|
"completions/min_terminated_length": 431.0, |
|
"epoch": 0.09371428571428571, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1563584953546524, |
|
"learning_rate": 7.681643291108517e-07, |
|
"loss": 0.0182, |
|
"num_tokens": 9762515.0, |
|
"reward": 0.746865451335907, |
|
"reward_std": 0.571272611618042, |
|
"rewards/cosine_scaled_reward/mean": 0.037495262920856476, |
|
"rewards/cosine_scaled_reward/std": 0.5523709654808044, |
|
"rewards/format_reward/mean": 0.671875, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.421875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1739.0, |
|
"completions/mean_length": 1357.640625, |
|
"completions/mean_terminated_length": 853.8648681640625, |
|
"completions/min_length": 455.0, |
|
"completions/min_terminated_length": 455.0, |
|
"epoch": 0.09485714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17990301549434662, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0313, |
|
"num_tokens": 9860492.0, |
|
"reward": 0.4607480764389038, |
|
"reward_std": 0.4022068381309509, |
|
"rewards/cosine_scaled_reward/mean": -0.0665009543299675, |
|
"rewards/cosine_scaled_reward/std": 0.36611077189445496, |
|
"rewards/format_reward/mean": 0.59375, |
|
"rewards/format_reward/std": 0.49501484632492065, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.390625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2044.0, |
|
"completions/mean_length": 1392.609375, |
|
"completions/mean_terminated_length": 972.4871826171875, |
|
"completions/min_length": 395.0, |
|
"completions/min_terminated_length": 395.0, |
|
"epoch": 0.096, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16545262932777405, |
|
"learning_rate": 7.54295724882796e-07, |
|
"loss": 0.016, |
|
"num_tokens": 9960315.0, |
|
"reward": 0.3932368755340576, |
|
"reward_std": 0.662509024143219, |
|
"rewards/cosine_scaled_reward/mean": -0.11588154733181, |
|
"rewards/cosine_scaled_reward/std": 0.428220272064209, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2034.0, |
|
"completions/mean_length": 1427.515625, |
|
"completions/mean_terminated_length": 1220.6875, |
|
"completions/min_length": 234.0, |
|
"completions/min_terminated_length": 234.0, |
|
"epoch": 0.09714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14229631423950195, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": -0.0247, |
|
"num_tokens": 10061996.0, |
|
"reward": 0.7478936910629272, |
|
"reward_std": 0.8706425428390503, |
|
"rewards/cosine_scaled_reward/mean": -0.00886566936969757, |
|
"rewards/cosine_scaled_reward/std": 0.4233645796775818, |
|
"rewards/format_reward/mean": 0.765625, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.390625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2023.0, |
|
"completions/mean_length": 1459.28125, |
|
"completions/mean_terminated_length": 1081.8974609375, |
|
"completions/min_length": 496.0, |
|
"completions/min_terminated_length": 496.0, |
|
"epoch": 0.09828571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.19291459023952484, |
|
"learning_rate": 7.401782177833147e-07, |
|
"loss": 0.0182, |
|
"num_tokens": 10166246.0, |
|
"reward": 0.30948999524116516, |
|
"reward_std": 0.55961012840271, |
|
"rewards/cosine_scaled_reward/mean": -0.1733800172805786, |
|
"rewards/cosine_scaled_reward/std": 0.30220499634742737, |
|
"rewards/format_reward/mean": 0.65625, |
|
"rewards/format_reward/std": 0.4787135720252991, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1866.0, |
|
"completions/mean_length": 1140.828125, |
|
"completions/mean_terminated_length": 972.8333129882812, |
|
"completions/min_length": 353.0, |
|
"completions/min_terminated_length": 353.0, |
|
"epoch": 0.09942857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14790062606334686, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0321, |
|
"num_tokens": 10249379.0, |
|
"reward": 0.429340660572052, |
|
"reward_std": 0.47173961997032166, |
|
"rewards/cosine_scaled_reward/mean": -0.207204669713974, |
|
"rewards/cosine_scaled_reward/std": 0.27721449732780457, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.328125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1519.0, |
|
"completions/mean_length": 1258.484375, |
|
"completions/mean_terminated_length": 872.906982421875, |
|
"completions/min_length": 246.0, |
|
"completions/min_terminated_length": 246.0, |
|
"epoch": 0.10057142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1598745733499527, |
|
"learning_rate": 7.258290078201731e-07, |
|
"loss": 0.0482, |
|
"num_tokens": 10340434.0, |
|
"reward": 0.8419445157051086, |
|
"reward_std": 0.7817317247390747, |
|
"rewards/cosine_scaled_reward/mean": 0.06940975040197372, |
|
"rewards/cosine_scaled_reward/std": 0.4935828149318695, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1919.0, |
|
"completions/mean_length": 1373.1875, |
|
"completions/mean_terminated_length": 1166.6121826171875, |
|
"completions/min_length": 675.0, |
|
"completions/min_terminated_length": 675.0, |
|
"epoch": 0.10171428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1521584838628769, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0547, |
|
"num_tokens": 10439318.0, |
|
"reward": 0.648002028465271, |
|
"reward_std": 0.6874127984046936, |
|
"rewards/cosine_scaled_reward/mean": -0.0978739783167839, |
|
"rewards/cosine_scaled_reward/std": 0.41632241010665894, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2023.0, |
|
"completions/mean_length": 1239.703125, |
|
"completions/mean_terminated_length": 992.2652587890625, |
|
"completions/min_length": 162.0, |
|
"completions/min_terminated_length": 162.0, |
|
"epoch": 0.10285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1843656599521637, |
|
"learning_rate": 7.11265577295385e-07, |
|
"loss": 0.0371, |
|
"num_tokens": 10528659.0, |
|
"reward": 0.4645897150039673, |
|
"reward_std": 0.6535974740982056, |
|
"rewards/cosine_scaled_reward/mean": -0.15833015739917755, |
|
"rewards/cosine_scaled_reward/std": 0.3457205295562744, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.453125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1961.0, |
|
"completions/mean_length": 1610.09375, |
|
"completions/mean_terminated_length": 1247.2572021484375, |
|
"completions/min_length": 205.0, |
|
"completions/min_terminated_length": 205.0, |
|
"epoch": 0.104, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17640981078147888, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0305, |
|
"num_tokens": 10642273.0, |
|
"reward": 0.5222002267837524, |
|
"reward_std": 0.9113218784332275, |
|
"rewards/cosine_scaled_reward/mean": -0.05139988660812378, |
|
"rewards/cosine_scaled_reward/std": 0.4710950553417206, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1320.09375, |
|
"completions/mean_terminated_length": 938.8095703125, |
|
"completions/min_length": 332.0, |
|
"completions/min_terminated_length": 332.0, |
|
"epoch": 0.10514285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15313342213630676, |
|
"learning_rate": 6.965056695057204e-07, |
|
"loss": 0.0055, |
|
"num_tokens": 10736751.0, |
|
"reward": 0.4166978597640991, |
|
"reward_std": 0.6364502310752869, |
|
"rewards/cosine_scaled_reward/mean": -0.13540107011795044, |
|
"rewards/cosine_scaled_reward/std": 0.3054071068763733, |
|
"rewards/format_reward/mean": 0.6875, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1964.0, |
|
"completions/mean_length": 1770.671875, |
|
"completions/mean_terminated_length": 1113.8421630859375, |
|
"completions/min_length": 632.0, |
|
"completions/min_terminated_length": 632.0, |
|
"epoch": 0.10628571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.21292737126350403, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0675, |
|
"num_tokens": 10861418.0, |
|
"reward": -0.15841422975063324, |
|
"reward_std": 0.4093279242515564, |
|
"rewards/cosine_scaled_reward/mean": -0.24326962232589722, |
|
"rewards/cosine_scaled_reward/std": 0.16840828955173492, |
|
"rewards/format_reward/mean": 0.328125, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.53125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2042.0, |
|
"completions/mean_length": 1532.65625, |
|
"completions/mean_terminated_length": 948.6000366210938, |
|
"completions/min_length": 511.0, |
|
"completions/min_terminated_length": 511.0, |
|
"epoch": 0.10742857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20122359693050385, |
|
"learning_rate": 6.815672671252315e-07, |
|
"loss": 0.0623, |
|
"num_tokens": 10969276.0, |
|
"reward": 0.20252148807048798, |
|
"reward_std": 0.345744788646698, |
|
"rewards/cosine_scaled_reward/mean": -0.1409267634153366, |
|
"rewards/cosine_scaled_reward/std": 0.4320366382598877, |
|
"rewards/format_reward/mean": 0.484375, |
|
"rewards/format_reward/std": 0.5037065148353577, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.453125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1745.0, |
|
"completions/mean_length": 1530.03125, |
|
"completions/mean_terminated_length": 1100.857177734375, |
|
"completions/min_length": 700.0, |
|
"completions/min_terminated_length": 700.0, |
|
"epoch": 0.10857142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16728746891021729, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0592, |
|
"num_tokens": 11077726.0, |
|
"reward": 0.05856095254421234, |
|
"reward_std": 0.5498154163360596, |
|
"rewards/cosine_scaled_reward/mean": -0.25196951627731323, |
|
"rewards/cosine_scaled_reward/std": 0.27556198835372925, |
|
"rewards/format_reward/mean": 0.5625, |
|
"rewards/format_reward/std": 0.5, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1633.0, |
|
"completions/mean_length": 1279.6875, |
|
"completions/mean_terminated_length": 1044.48974609375, |
|
"completions/min_length": 452.0, |
|
"completions/min_terminated_length": 452.0, |
|
"epoch": 0.10971428571428571, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1483285129070282, |
|
"learning_rate": 6.664685702961344e-07, |
|
"loss": 0.0161, |
|
"num_tokens": 11170762.0, |
|
"reward": 0.8373413681983948, |
|
"reward_std": 0.4410895109176636, |
|
"rewards/cosine_scaled_reward/mean": 0.01242067664861679, |
|
"rewards/cosine_scaled_reward/std": 0.46624863147735596, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.28125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1905.0, |
|
"completions/mean_length": 1312.640625, |
|
"completions/mean_terminated_length": 1024.891357421875, |
|
"completions/min_length": 343.0, |
|
"completions/min_terminated_length": 343.0, |
|
"epoch": 0.11085714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16424083709716797, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0535, |
|
"num_tokens": 11265035.0, |
|
"reward": 0.5586233139038086, |
|
"reward_std": 0.7126098871231079, |
|
"rewards/cosine_scaled_reward/mean": -0.11131332814693451, |
|
"rewards/cosine_scaled_reward/std": 0.3577263653278351, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1940.0, |
|
"completions/mean_length": 1376.53125, |
|
"completions/mean_terminated_length": 1024.8095703125, |
|
"completions/min_length": 372.0, |
|
"completions/min_terminated_length": 372.0, |
|
"epoch": 0.112, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17384155094623566, |
|
"learning_rate": 6.512279744547392e-07, |
|
"loss": 0.0164, |
|
"num_tokens": 11364197.0, |
|
"reward": 0.6794039607048035, |
|
"reward_std": 0.4869590997695923, |
|
"rewards/cosine_scaled_reward/mean": -0.02748553454875946, |
|
"rewards/cosine_scaled_reward/std": 0.45645180344581604, |
|
"rewards/format_reward/mean": 0.734375, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.296875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2048.0, |
|
"completions/mean_length": 1280.09375, |
|
"completions/mean_terminated_length": 955.86669921875, |
|
"completions/min_length": 415.0, |
|
"completions/min_terminated_length": 415.0, |
|
"epoch": 0.11314285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17342573404312134, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0816, |
|
"num_tokens": 11457291.0, |
|
"reward": 0.7432724237442017, |
|
"reward_std": 0.6722617745399475, |
|
"rewards/cosine_scaled_reward/mean": -0.003363795578479767, |
|
"rewards/cosine_scaled_reward/std": 0.4415356516838074, |
|
"rewards/format_reward/mean": 0.75, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1965.0, |
|
"completions/mean_length": 1247.765625, |
|
"completions/mean_terminated_length": 1063.09619140625, |
|
"completions/min_length": 520.0, |
|
"completions/min_terminated_length": 520.0, |
|
"epoch": 0.11428571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15301530063152313, |
|
"learning_rate": 6.358640479194451e-07, |
|
"loss": 0.0125, |
|
"num_tokens": 11546860.0, |
|
"reward": 0.803851306438446, |
|
"reward_std": 0.6947499513626099, |
|
"rewards/cosine_scaled_reward/mean": -0.019949357956647873, |
|
"rewards/cosine_scaled_reward/std": 0.4705973267555237, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1911.0, |
|
"completions/mean_length": 1269.671875, |
|
"completions/mean_terminated_length": 1125.5369873046875, |
|
"completions/min_length": 485.0, |
|
"completions/min_terminated_length": 485.0, |
|
"epoch": 0.11542857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1690932661294937, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.013, |
|
"num_tokens": 11639551.0, |
|
"reward": 0.6836185455322266, |
|
"reward_std": 0.5046678781509399, |
|
"rewards/cosine_scaled_reward/mean": -0.08787819743156433, |
|
"rewards/cosine_scaled_reward/std": 0.40181559324264526, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1872.0, |
|
"completions/mean_length": 1174.265625, |
|
"completions/mean_terminated_length": 1012.4629516601562, |
|
"completions/min_length": 340.0, |
|
"completions/min_terminated_length": 340.0, |
|
"epoch": 0.11657142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16043449938297272, |
|
"learning_rate": 6.203955092681039e-07, |
|
"loss": 0.032, |
|
"num_tokens": 11724856.0, |
|
"reward": 0.67606520652771, |
|
"reward_std": 0.6234960556030273, |
|
"rewards/cosine_scaled_reward/mean": -0.09165491163730621, |
|
"rewards/cosine_scaled_reward/std": 0.37837859988212585, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1953.0, |
|
"completions/mean_length": 1157.8125, |
|
"completions/mean_terminated_length": 930.9019775390625, |
|
"completions/min_length": 247.0, |
|
"completions/min_terminated_length": 247.0, |
|
"epoch": 0.11771428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1574372500181198, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0249, |
|
"num_tokens": 11809308.0, |
|
"reward": 0.4326379895210266, |
|
"reward_std": 0.5444109439849854, |
|
"rewards/cosine_scaled_reward/mean": -0.1977435052394867, |
|
"rewards/cosine_scaled_reward/std": 0.3261271119117737, |
|
"rewards/format_reward/mean": 0.828125, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1988.0, |
|
"completions/mean_length": 1330.484375, |
|
"completions/mean_terminated_length": 954.6428833007812, |
|
"completions/min_length": 371.0, |
|
"completions/min_terminated_length": 371.0, |
|
"epoch": 0.11885714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18211479485034943, |
|
"learning_rate": 6.048412045323164e-07, |
|
"loss": 0.0439, |
|
"num_tokens": 11904923.0, |
|
"reward": 0.4620264172554016, |
|
"reward_std": 0.5293800830841064, |
|
"rewards/cosine_scaled_reward/mean": -0.12054930627346039, |
|
"rewards/cosine_scaled_reward/std": 0.3497216999530792, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2003.0, |
|
"completions/mean_length": 1224.859375, |
|
"completions/mean_terminated_length": 994.3800048828125, |
|
"completions/min_length": 499.0, |
|
"completions/min_terminated_length": 499.0, |
|
"epoch": 0.12, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1528584063053131, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0233, |
|
"num_tokens": 11994602.0, |
|
"reward": 0.7569347620010376, |
|
"reward_std": 0.6899948120117188, |
|
"rewards/cosine_scaled_reward/mean": -0.027782641351222992, |
|
"rewards/cosine_scaled_reward/std": 0.5096075534820557, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.296875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2046.0, |
|
"completions/mean_length": 1239.625, |
|
"completions/mean_terminated_length": 898.3111572265625, |
|
"completions/min_length": 293.0, |
|
"completions/min_terminated_length": 293.0, |
|
"epoch": 0.12114285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1494080275297165, |
|
"learning_rate": 5.892200842364462e-07, |
|
"loss": 0.0226, |
|
"num_tokens": 12084770.0, |
|
"reward": 1.043992519378662, |
|
"reward_std": 0.7194849252700806, |
|
"rewards/cosine_scaled_reward/mean": 0.13918372988700867, |
|
"rewards/cosine_scaled_reward/std": 0.46339961886405945, |
|
"rewards/format_reward/mean": 0.765625, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2036.0, |
|
"completions/mean_length": 1139.515625, |
|
"completions/mean_terminated_length": 971.2777709960938, |
|
"completions/min_length": 401.0, |
|
"completions/min_terminated_length": 401.0, |
|
"epoch": 0.12228571428571429, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1769389808177948, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0041, |
|
"num_tokens": 12168851.0, |
|
"reward": 0.46204712986946106, |
|
"reward_std": 0.5935191512107849, |
|
"rewards/cosine_scaled_reward/mean": -0.18303894996643066, |
|
"rewards/cosine_scaled_reward/std": 0.30380427837371826, |
|
"rewards/format_reward/mean": 0.828125, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1791.0, |
|
"completions/mean_length": 1382.375, |
|
"completions/mean_terminated_length": 983.0, |
|
"completions/min_length": 348.0, |
|
"completions/min_terminated_length": 348.0, |
|
"epoch": 0.12342857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16175994277000427, |
|
"learning_rate": 5.735511803093248e-07, |
|
"loss": 0.0651, |
|
"num_tokens": 12267683.0, |
|
"reward": 0.3516117334365845, |
|
"reward_std": 0.7561339735984802, |
|
"rewards/cosine_scaled_reward/mean": -0.17575663328170776, |
|
"rewards/cosine_scaled_reward/std": 0.35719168186187744, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1965.0, |
|
"completions/mean_length": 1392.828125, |
|
"completions/mean_terminated_length": 1049.642822265625, |
|
"completions/min_length": 543.0, |
|
"completions/min_terminated_length": 543.0, |
|
"epoch": 0.12457142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16766144335269928, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0149, |
|
"num_tokens": 12368072.0, |
|
"reward": 0.7171763181686401, |
|
"reward_std": 0.4656876027584076, |
|
"rewards/cosine_scaled_reward/mean": 0.007025681436061859, |
|
"rewards/cosine_scaled_reward/std": 0.4227021336555481, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.28125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1881.0, |
|
"completions/mean_length": 1239.796875, |
|
"completions/mean_terminated_length": 923.5435180664062, |
|
"completions/min_length": 194.0, |
|
"completions/min_terminated_length": 194.0, |
|
"epoch": 0.12571428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14556895196437836, |
|
"learning_rate": 5.578535828967777e-07, |
|
"loss": 0.0102, |
|
"num_tokens": 12458195.0, |
|
"reward": 0.3774694800376892, |
|
"reward_std": 0.654548704624176, |
|
"rewards/cosine_scaled_reward/mean": -0.1784527748823166, |
|
"rewards/cosine_scaled_reward/std": 0.331076443195343, |
|
"rewards/format_reward/mean": 0.734375, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1808.0, |
|
"completions/mean_length": 1170.40625, |
|
"completions/mean_terminated_length": 1045.0357666015625, |
|
"completions/min_length": 508.0, |
|
"completions/min_terminated_length": 508.0, |
|
"epoch": 0.12685714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1637505292892456, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0741, |
|
"num_tokens": 12543221.0, |
|
"reward": 0.6489747762680054, |
|
"reward_std": 0.654654860496521, |
|
"rewards/cosine_scaled_reward/mean": -0.12082511186599731, |
|
"rewards/cosine_scaled_reward/std": 0.34212014079093933, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.421875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1831.0, |
|
"completions/mean_length": 1475.09375, |
|
"completions/mean_terminated_length": 1057.027099609375, |
|
"completions/min_length": 351.0, |
|
"completions/min_terminated_length": 351.0, |
|
"epoch": 0.128, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1809089183807373, |
|
"learning_rate": 5.421464171032224e-07, |
|
"loss": 0.0187, |
|
"num_tokens": 12648723.0, |
|
"reward": 0.6672303676605225, |
|
"reward_std": 0.7431913614273071, |
|
"rewards/cosine_scaled_reward/mean": 0.01330268383026123, |
|
"rewards/cosine_scaled_reward/std": 0.4883294403553009, |
|
"rewards/format_reward/mean": 0.640625, |
|
"rewards/format_reward/std": 0.4836103618144989, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1961.0, |
|
"completions/mean_length": 1056.4375, |
|
"completions/mean_terminated_length": 914.7857666015625, |
|
"completions/min_length": 340.0, |
|
"completions/min_terminated_length": 340.0, |
|
"epoch": 0.12914285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1637895107269287, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0644, |
|
"num_tokens": 12726631.0, |
|
"reward": 0.6515660881996155, |
|
"reward_std": 0.5848349332809448, |
|
"rewards/cosine_scaled_reward/mean": -0.11952944099903107, |
|
"rewards/cosine_scaled_reward/std": 0.4174686074256897, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1711.0, |
|
"completions/mean_length": 1097.859375, |
|
"completions/mean_terminated_length": 962.1250610351562, |
|
"completions/min_length": 141.0, |
|
"completions/min_terminated_length": 141.0, |
|
"epoch": 0.13028571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1319950371980667, |
|
"learning_rate": 5.264488196906752e-07, |
|
"loss": 0.0226, |
|
"num_tokens": 12806742.0, |
|
"reward": 0.6668691635131836, |
|
"reward_std": 0.6580501794815063, |
|
"rewards/cosine_scaled_reward/mean": -0.1431279182434082, |
|
"rewards/cosine_scaled_reward/std": 0.378142774105072, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1997.0, |
|
"completions/mean_length": 1380.0625, |
|
"completions/mean_terminated_length": 1076.45458984375, |
|
"completions/min_length": 322.0, |
|
"completions/min_terminated_length": 322.0, |
|
"epoch": 0.13142857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1882496327161789, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.04, |
|
"num_tokens": 12906282.0, |
|
"reward": 0.43996283411979675, |
|
"reward_std": 0.6503387093544006, |
|
"rewards/cosine_scaled_reward/mean": -0.13939358294010162, |
|
"rewards/cosine_scaled_reward/std": 0.3781909942626953, |
|
"rewards/format_reward/mean": 0.71875, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1852.0, |
|
"completions/mean_length": 1364.125, |
|
"completions/mean_terminated_length": 953.7999877929688, |
|
"completions/min_length": 343.0, |
|
"completions/min_terminated_length": 343.0, |
|
"epoch": 0.13257142857142856, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1886526346206665, |
|
"learning_rate": 5.107799157635538e-07, |
|
"loss": 0.1079, |
|
"num_tokens": 13004970.0, |
|
"reward": 0.5331847667694092, |
|
"reward_std": 0.7935209274291992, |
|
"rewards/cosine_scaled_reward/mean": -0.08497010916471481, |
|
"rewards/cosine_scaled_reward/std": 0.4501515328884125, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1944.0, |
|
"completions/mean_length": 1136.4375, |
|
"completions/mean_terminated_length": 1024.4912109375, |
|
"completions/min_length": 505.0, |
|
"completions/min_terminated_length": 505.0, |
|
"epoch": 0.1337142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1523984968662262, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0671, |
|
"num_tokens": 13088726.0, |
|
"reward": 0.7468037009239197, |
|
"reward_std": 0.7615803480148315, |
|
"rewards/cosine_scaled_reward/mean": -0.08753564208745956, |
|
"rewards/cosine_scaled_reward/std": 0.44001707434654236, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.28125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1918.0, |
|
"completions/mean_length": 1274.578125, |
|
"completions/mean_terminated_length": 971.934814453125, |
|
"completions/min_length": 474.0, |
|
"completions/min_terminated_length": 474.0, |
|
"epoch": 0.13485714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14875811338424683, |
|
"learning_rate": 4.951587954676837e-07, |
|
"loss": 0.0166, |
|
"num_tokens": 13180835.0, |
|
"reward": 0.6522707939147949, |
|
"reward_std": 0.589940071105957, |
|
"rewards/cosine_scaled_reward/mean": -0.041052110493183136, |
|
"rewards/cosine_scaled_reward/std": 0.5126345157623291, |
|
"rewards/format_reward/mean": 0.734375, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1962.0, |
|
"completions/mean_length": 1070.359375, |
|
"completions/mean_terminated_length": 844.7500610351562, |
|
"completions/min_length": 333.0, |
|
"completions/min_terminated_length": 333.0, |
|
"epoch": 0.136, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15418609976768494, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0135, |
|
"num_tokens": 13259746.0, |
|
"reward": 0.8924436569213867, |
|
"reward_std": 0.6925675272941589, |
|
"rewards/cosine_scaled_reward/mean": 0.00872182846069336, |
|
"rewards/cosine_scaled_reward/std": 0.49334391951560974, |
|
"rewards/format_reward/mean": 0.875, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1899.0, |
|
"completions/mean_length": 868.90625, |
|
"completions/mean_terminated_length": 850.1905517578125, |
|
"completions/min_length": 137.0, |
|
"completions/min_terminated_length": 137.0, |
|
"epoch": 0.13714285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12889909744262695, |
|
"learning_rate": 4.79604490731896e-07, |
|
"loss": -0.0038, |
|
"num_tokens": 13325812.0, |
|
"reward": 0.833016574382782, |
|
"reward_std": 0.6583147048950195, |
|
"rewards/cosine_scaled_reward/mean": -0.08349171280860901, |
|
"rewards/cosine_scaled_reward/std": 0.43434619903564453, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1890.0, |
|
"completions/mean_length": 792.359375, |
|
"completions/mean_terminated_length": 730.6065063476562, |
|
"completions/min_length": 246.0, |
|
"completions/min_terminated_length": 246.0, |
|
"epoch": 0.1382857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14095140993595123, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0479, |
|
"num_tokens": 13386219.0, |
|
"reward": 1.289149284362793, |
|
"reward_std": 0.6984070539474487, |
|
"rewards/cosine_scaled_reward/mean": 0.16801217198371887, |
|
"rewards/cosine_scaled_reward/std": 0.5607498288154602, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.328125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1805.0, |
|
"completions/mean_length": 1384.5, |
|
"completions/mean_terminated_length": 1060.465087890625, |
|
"completions/min_length": 334.0, |
|
"completions/min_terminated_length": 334.0, |
|
"epoch": 0.13942857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17925438284873962, |
|
"learning_rate": 4.641359520805548e-07, |
|
"loss": 0.0462, |
|
"num_tokens": 13486387.0, |
|
"reward": 0.4263126254081726, |
|
"reward_std": 0.6481289267539978, |
|
"rewards/cosine_scaled_reward/mean": -0.1462186872959137, |
|
"rewards/cosine_scaled_reward/std": 0.3027765154838562, |
|
"rewards/format_reward/mean": 0.71875, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1833.0, |
|
"completions/mean_length": 1208.140625, |
|
"completions/mean_terminated_length": 1014.3269653320312, |
|
"completions/min_length": 519.0, |
|
"completions/min_terminated_length": 519.0, |
|
"epoch": 0.14057142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14084899425506592, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": -0.0087, |
|
"num_tokens": 13573940.0, |
|
"reward": 0.5345523357391357, |
|
"reward_std": 0.35669955611228943, |
|
"rewards/cosine_scaled_reward/mean": -0.16241134703159332, |
|
"rewards/cosine_scaled_reward/std": 0.3877701759338379, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2019.0, |
|
"completions/mean_length": 1185.703125, |
|
"completions/mean_terminated_length": 1026.0185546875, |
|
"completions/min_length": 455.0, |
|
"completions/min_terminated_length": 455.0, |
|
"epoch": 0.1417142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13726963102817535, |
|
"learning_rate": 4.4877202554526084e-07, |
|
"loss": 0.0309, |
|
"num_tokens": 13660777.0, |
|
"reward": 0.802190363407135, |
|
"reward_std": 0.6432194709777832, |
|
"rewards/cosine_scaled_reward/mean": -0.044217295944690704, |
|
"rewards/cosine_scaled_reward/std": 0.4381820559501648, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2033.0, |
|
"completions/mean_length": 1272.40625, |
|
"completions/mean_terminated_length": 1074.7059326171875, |
|
"completions/min_length": 451.0, |
|
"completions/min_terminated_length": 451.0, |
|
"epoch": 0.14285714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1567695438861847, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": -0.0098, |
|
"num_tokens": 13753139.0, |
|
"reward": 0.8288029432296753, |
|
"reward_std": 0.6727226972579956, |
|
"rewards/cosine_scaled_reward/mean": 0.0003389418125152588, |
|
"rewards/cosine_scaled_reward/std": 0.501276433467865, |
|
"rewards/format_reward/mean": 0.828125, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1738.0, |
|
"completions/mean_length": 1239.6875, |
|
"completions/mean_terminated_length": 992.244873046875, |
|
"completions/min_length": 609.0, |
|
"completions/min_terminated_length": 609.0, |
|
"epoch": 0.144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1638517528772354, |
|
"learning_rate": 4.3353142970386557e-07, |
|
"loss": 0.0357, |
|
"num_tokens": 13843775.0, |
|
"reward": 0.8066681623458862, |
|
"reward_std": 0.8093670010566711, |
|
"rewards/cosine_scaled_reward/mean": -0.0029159002006053925, |
|
"rewards/cosine_scaled_reward/std": 0.40039899945259094, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1968.0, |
|
"completions/mean_length": 1259.078125, |
|
"completions/mean_terminated_length": 1038.179931640625, |
|
"completions/min_length": 430.0, |
|
"completions/min_terminated_length": 430.0, |
|
"epoch": 0.14514285714285713, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18749745190143585, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.1248, |
|
"num_tokens": 13935452.0, |
|
"reward": 0.3689166009426117, |
|
"reward_std": 0.5908951759338379, |
|
"rewards/cosine_scaled_reward/mean": -0.22960419952869415, |
|
"rewards/cosine_scaled_reward/std": 0.2868925929069519, |
|
"rewards/format_reward/mean": 0.828125, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1942.0, |
|
"completions/mean_length": 1131.578125, |
|
"completions/mean_terminated_length": 1070.4833984375, |
|
"completions/min_length": 449.0, |
|
"completions/min_terminated_length": 449.0, |
|
"epoch": 0.1462857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14950093626976013, |
|
"learning_rate": 4.1843273287476854e-07, |
|
"loss": 0.0065, |
|
"num_tokens": 14018225.0, |
|
"reward": 0.9214786291122437, |
|
"reward_std": 0.7154524922370911, |
|
"rewards/cosine_scaled_reward/mean": -0.01582319289445877, |
|
"rewards/cosine_scaled_reward/std": 0.47363659739494324, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1927.0, |
|
"completions/mean_length": 1407.71875, |
|
"completions/mean_terminated_length": 1211.7142333984375, |
|
"completions/min_length": 455.0, |
|
"completions/min_terminated_length": 455.0, |
|
"epoch": 0.14742857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16768500208854675, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0736, |
|
"num_tokens": 14119023.0, |
|
"reward": 0.5007042288780212, |
|
"reward_std": 0.6594030261039734, |
|
"rewards/cosine_scaled_reward/mean": -0.14808538556098938, |
|
"rewards/cosine_scaled_reward/std": 0.3597432076931, |
|
"rewards/format_reward/mean": 0.796875, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.359375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1705.0, |
|
"completions/mean_length": 1331.546875, |
|
"completions/mean_terminated_length": 929.6340942382812, |
|
"completions/min_length": 364.0, |
|
"completions/min_terminated_length": 364.0, |
|
"epoch": 0.14857142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1533743441104889, |
|
"learning_rate": 4.034943304942796e-07, |
|
"loss": 0.065, |
|
"num_tokens": 14214746.0, |
|
"reward": 0.18521776795387268, |
|
"reward_std": 0.4527278244495392, |
|
"rewards/cosine_scaled_reward/mean": -0.25895363092422485, |
|
"rewards/cosine_scaled_reward/std": 0.2297503650188446, |
|
"rewards/format_reward/mean": 0.703125, |
|
"rewards/format_reward/std": 0.4604927599430084, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1983.0, |
|
"completions/mean_length": 1240.703125, |
|
"completions/mean_terminated_length": 971.6041870117188, |
|
"completions/min_length": 348.0, |
|
"completions/min_terminated_length": 348.0, |
|
"epoch": 0.14971428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16382953524589539, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0716, |
|
"num_tokens": 14303887.0, |
|
"reward": 1.0216246843338013, |
|
"reward_std": 0.8127155303955078, |
|
"rewards/cosine_scaled_reward/mean": 0.10456232726573944, |
|
"rewards/cosine_scaled_reward/std": 0.48323893547058105, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.40625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1521.0, |
|
"completions/mean_length": 1340.046875, |
|
"completions/mean_terminated_length": 855.6578979492188, |
|
"completions/min_length": 297.0, |
|
"completions/min_terminated_length": 297.0, |
|
"epoch": 0.15085714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1577305793762207, |
|
"learning_rate": 3.8873442270461485e-07, |
|
"loss": 0.0288, |
|
"num_tokens": 14400714.0, |
|
"reward": 0.4232841432094574, |
|
"reward_std": 0.6519888639450073, |
|
"rewards/cosine_scaled_reward/mean": -0.1008579432964325, |
|
"rewards/cosine_scaled_reward/std": 0.42636433243751526, |
|
"rewards/format_reward/mean": 0.625, |
|
"rewards/format_reward/std": 0.48795005679130554, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1879.0, |
|
"completions/mean_length": 1292.671875, |
|
"completions/mean_terminated_length": 1061.448974609375, |
|
"completions/min_length": 459.0, |
|
"completions/min_terminated_length": 459.0, |
|
"epoch": 0.152, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1743871122598648, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0212, |
|
"num_tokens": 14494669.0, |
|
"reward": 0.6165566444396973, |
|
"reward_std": 0.5660312175750732, |
|
"rewards/cosine_scaled_reward/mean": -0.08234670013189316, |
|
"rewards/cosine_scaled_reward/std": 0.31525060534477234, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.09375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2039.0, |
|
"completions/mean_length": 1096.953125, |
|
"completions/mean_terminated_length": 998.5689697265625, |
|
"completions/min_length": 364.0, |
|
"completions/min_terminated_length": 364.0, |
|
"epoch": 0.15314285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16208074986934662, |
|
"learning_rate": 3.7417099217982686e-07, |
|
"loss": 0.0283, |
|
"num_tokens": 14575442.0, |
|
"reward": 1.0213682651519775, |
|
"reward_std": 0.6743905544281006, |
|
"rewards/cosine_scaled_reward/mean": 0.041934188455343246, |
|
"rewards/cosine_scaled_reward/std": 0.5223273038864136, |
|
"rewards/format_reward/mean": 0.9375, |
|
"rewards/format_reward/std": 0.24397502839565277, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.09375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1720.0, |
|
"completions/mean_length": 861.328125, |
|
"completions/mean_terminated_length": 738.5689697265625, |
|
"completions/min_length": 284.0, |
|
"completions/min_terminated_length": 284.0, |
|
"epoch": 0.15428571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15264089405536652, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.0412, |
|
"num_tokens": 14641039.0, |
|
"reward": 1.173776388168335, |
|
"reward_std": 0.741400957107544, |
|
"rewards/cosine_scaled_reward/mean": 0.12595069408416748, |
|
"rewards/cosine_scaled_reward/std": 0.5099307298660278, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2047.0, |
|
"completions/mean_length": 1234.375, |
|
"completions/mean_terminated_length": 1118.1429443359375, |
|
"completions/min_length": 429.0, |
|
"completions/min_terminated_length": 429.0, |
|
"epoch": 0.15542857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14350080490112305, |
|
"learning_rate": 3.5982178221668533e-07, |
|
"loss": 0.0484, |
|
"num_tokens": 14730711.0, |
|
"reward": 0.7637453675270081, |
|
"reward_std": 0.6790728569030762, |
|
"rewards/cosine_scaled_reward/mean": -0.10250230133533478, |
|
"rewards/cosine_scaled_reward/std": 0.4094173312187195, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1865.0, |
|
"completions/mean_length": 1222.53125, |
|
"completions/mean_terminated_length": 1152.5762939453125, |
|
"completions/min_length": 555.0, |
|
"completions/min_terminated_length": 555.0, |
|
"epoch": 0.15657142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1349533200263977, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": -0.0333, |
|
"num_tokens": 14819561.0, |
|
"reward": 0.6314640641212463, |
|
"reward_std": 0.6037685871124268, |
|
"rewards/cosine_scaled_reward/mean": -0.16083045303821564, |
|
"rewards/cosine_scaled_reward/std": 0.3636666238307953, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1955.0, |
|
"completions/mean_length": 1173.65625, |
|
"completions/mean_terminated_length": 1048.75, |
|
"completions/min_length": 320.0, |
|
"completions/min_terminated_length": 320.0, |
|
"epoch": 0.15771428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13684934377670288, |
|
"learning_rate": 3.45704275117204e-07, |
|
"loss": 0.0176, |
|
"num_tokens": 14905987.0, |
|
"reward": 0.8157724142074585, |
|
"reward_std": 0.7757042646408081, |
|
"rewards/cosine_scaled_reward/mean": -0.04523882642388344, |
|
"rewards/cosine_scaled_reward/std": 0.4742158055305481, |
|
"rewards/format_reward/mean": 0.90625, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2009.0, |
|
"completions/mean_length": 1268.453125, |
|
"completions/mean_terminated_length": 1124.0926513671875, |
|
"completions/min_length": 291.0, |
|
"completions/min_terminated_length": 291.0, |
|
"epoch": 0.15885714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14245833456516266, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": -0.0152, |
|
"num_tokens": 14997808.0, |
|
"reward": 0.7688822746276855, |
|
"reward_std": 0.5957136750221252, |
|
"rewards/cosine_scaled_reward/mean": -0.09212135523557663, |
|
"rewards/cosine_scaled_reward/std": 0.42672204971313477, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1748.0, |
|
"completions/mean_length": 1121.875, |
|
"completions/mean_terminated_length": 970.3272705078125, |
|
"completions/min_length": 471.0, |
|
"completions/min_terminated_length": 471.0, |
|
"epoch": 0.16, |
|
"frac_reward_zero_std": 0.125, |
|
"grad_norm": 0.14092370867729187, |
|
"learning_rate": 3.3183567088914833e-07, |
|
"loss": 0.0336, |
|
"num_tokens": 15079832.0, |
|
"reward": 0.6852799654006958, |
|
"reward_std": 0.412535697221756, |
|
"rewards/cosine_scaled_reward/mean": -0.0948600098490715, |
|
"rewards/cosine_scaled_reward/std": 0.46610429883003235, |
|
"rewards/format_reward/mean": 0.875, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1891.0, |
|
"completions/mean_length": 1039.921875, |
|
"completions/mean_terminated_length": 954.4915161132812, |
|
"completions/min_length": 442.0, |
|
"completions/min_terminated_length": 442.0, |
|
"epoch": 0.16114285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1313817948102951, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0563, |
|
"num_tokens": 15156947.0, |
|
"reward": 1.052842140197754, |
|
"reward_std": 0.7119845151901245, |
|
"rewards/cosine_scaled_reward/mean": 0.03423358500003815, |
|
"rewards/cosine_scaled_reward/std": 0.4524931311607361, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.296875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1770.0, |
|
"completions/mean_length": 1376.234375, |
|
"completions/mean_terminated_length": 1092.5999755859375, |
|
"completions/min_length": 326.0, |
|
"completions/min_terminated_length": 326.0, |
|
"epoch": 0.16228571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1682240515947342, |
|
"learning_rate": 3.182328662904756e-07, |
|
"loss": 0.0936, |
|
"num_tokens": 15255530.0, |
|
"reward": 0.44548213481903076, |
|
"reward_std": 0.7928640842437744, |
|
"rewards/cosine_scaled_reward/mean": -0.18350891768932343, |
|
"rewards/cosine_scaled_reward/std": 0.36820653080940247, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1989.0, |
|
"completions/mean_length": 1057.03125, |
|
"completions/mean_terminated_length": 1008.2950439453125, |
|
"completions/min_length": 416.0, |
|
"completions/min_terminated_length": 416.0, |
|
"epoch": 0.16342857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15701259672641754, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.032, |
|
"num_tokens": 15333996.0, |
|
"reward": 0.681940495967865, |
|
"reward_std": 0.6061316728591919, |
|
"rewards/cosine_scaled_reward/mean": -0.1434047520160675, |
|
"rewards/cosine_scaled_reward/std": 0.31647545099258423, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1879.0, |
|
"completions/mean_length": 1187.875, |
|
"completions/mean_terminated_length": 968.6275024414062, |
|
"completions/min_length": 316.0, |
|
"completions/min_terminated_length": 316.0, |
|
"epoch": 0.16457142857142856, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1424182802438736, |
|
"learning_rate": 3.0491243424323783e-07, |
|
"loss": 0.0431, |
|
"num_tokens": 15421508.0, |
|
"reward": 1.0751841068267822, |
|
"reward_std": 0.7788275480270386, |
|
"rewards/cosine_scaled_reward/mean": 0.12352952361106873, |
|
"rewards/cosine_scaled_reward/std": 0.5238592028617859, |
|
"rewards/format_reward/mean": 0.828125, |
|
"rewards/format_reward/std": 0.38025420904159546, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1908.0, |
|
"completions/mean_length": 908.546875, |
|
"completions/mean_terminated_length": 852.5081787109375, |
|
"completions/min_length": 261.0, |
|
"completions/min_terminated_length": 261.0, |
|
"epoch": 0.1657142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1289522349834442, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0264, |
|
"num_tokens": 15489599.0, |
|
"reward": 1.0159393548965454, |
|
"reward_std": 0.6956236958503723, |
|
"rewards/cosine_scaled_reward/mean": 0.023594655096530914, |
|
"rewards/cosine_scaled_reward/std": 0.472563236951828, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1742.0, |
|
"completions/mean_length": 1074.765625, |
|
"completions/mean_terminated_length": 1009.8833618164062, |
|
"completions/min_length": 457.0, |
|
"completions/min_terminated_length": 457.0, |
|
"epoch": 0.16685714285714287, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12770003080368042, |
|
"learning_rate": 2.918906036420294e-07, |
|
"loss": 0.0393, |
|
"num_tokens": 15569000.0, |
|
"reward": 0.5655175447463989, |
|
"reward_std": 0.5674481987953186, |
|
"rewards/cosine_scaled_reward/mean": -0.19380369782447815, |
|
"rewards/cosine_scaled_reward/std": 0.32235828042030334, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2029.0, |
|
"completions/mean_length": 1272.375, |
|
"completions/mean_terminated_length": 1111.396240234375, |
|
"completions/min_length": 399.0, |
|
"completions/min_terminated_length": 399.0, |
|
"epoch": 0.168, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.160521000623703, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.017, |
|
"num_tokens": 15661216.0, |
|
"reward": 0.5459345579147339, |
|
"reward_std": 0.7825783491134644, |
|
"rewards/cosine_scaled_reward/mean": -0.14890772104263306, |
|
"rewards/cosine_scaled_reward/std": 0.4268314838409424, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1729.0, |
|
"completions/mean_length": 1068.40625, |
|
"completions/mean_terminated_length": 948.1052856445312, |
|
"completions/min_length": 388.0, |
|
"completions/min_terminated_length": 388.0, |
|
"epoch": 0.16914285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13231147825717926, |
|
"learning_rate": 2.791832395815782e-07, |
|
"loss": 0.0355, |
|
"num_tokens": 15740778.0, |
|
"reward": 0.8093540668487549, |
|
"reward_std": 0.5906412601470947, |
|
"rewards/cosine_scaled_reward/mean": -0.08751046657562256, |
|
"rewards/cosine_scaled_reward/std": 0.40702494978904724, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1865.0, |
|
"completions/mean_length": 1392.28125, |
|
"completions/mean_terminated_length": 998.8500366210938, |
|
"completions/min_length": 559.0, |
|
"completions/min_terminated_length": 559.0, |
|
"epoch": 0.1702857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1449200063943863, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0033, |
|
"num_tokens": 15841780.0, |
|
"reward": 0.4228026866912842, |
|
"reward_std": 0.745114266872406, |
|
"rewards/cosine_scaled_reward/mean": -0.1323486566543579, |
|
"rewards/cosine_scaled_reward/std": 0.37805312871932983, |
|
"rewards/format_reward/mean": 0.6875, |
|
"rewards/format_reward/std": 0.467176616191864, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2032.0, |
|
"completions/mean_length": 1004.125, |
|
"completions/mean_terminated_length": 952.7868041992188, |
|
"completions/min_length": 378.0, |
|
"completions/min_terminated_length": 378.0, |
|
"epoch": 0.17142857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13021093606948853, |
|
"learning_rate": 2.6680582402757324e-07, |
|
"loss": 0.0355, |
|
"num_tokens": 15916548.0, |
|
"reward": 0.7599377632141113, |
|
"reward_std": 0.5821801424026489, |
|
"rewards/cosine_scaled_reward/mean": -0.11221860349178314, |
|
"rewards/cosine_scaled_reward/std": 0.3788122236728668, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1859.0, |
|
"completions/mean_length": 1107.015625, |
|
"completions/mean_terminated_length": 911.7169799804688, |
|
"completions/min_length": 179.0, |
|
"completions/min_terminated_length": 179.0, |
|
"epoch": 0.17257142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14420656859874725, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0295, |
|
"num_tokens": 15998077.0, |
|
"reward": 1.2211229801177979, |
|
"reward_std": 0.7430520057678223, |
|
"rewards/cosine_scaled_reward/mean": 0.18087396025657654, |
|
"rewards/cosine_scaled_reward/std": 0.5226595401763916, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.28125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2028.0, |
|
"completions/mean_length": 1256.234375, |
|
"completions/mean_terminated_length": 946.4130859375, |
|
"completions/min_length": 167.0, |
|
"completions/min_terminated_length": 167.0, |
|
"epoch": 0.1737142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15784548223018646, |
|
"learning_rate": 2.547734369542718e-07, |
|
"loss": 0.0658, |
|
"num_tokens": 16089140.0, |
|
"reward": 0.6517580151557922, |
|
"reward_std": 0.7057055830955505, |
|
"rewards/cosine_scaled_reward/mean": -0.056933484971523285, |
|
"rewards/cosine_scaled_reward/std": 0.403768390417099, |
|
"rewards/format_reward/mean": 0.765625, |
|
"rewards/format_reward/std": 0.42695629596710205, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2045.0, |
|
"completions/mean_length": 1169.125, |
|
"completions/mean_terminated_length": 1043.571533203125, |
|
"completions/min_length": 332.0, |
|
"completions/min_terminated_length": 332.0, |
|
"epoch": 0.17485714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13566994667053223, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0529, |
|
"num_tokens": 16175108.0, |
|
"reward": 0.4462122321128845, |
|
"reward_std": 0.4172056317329407, |
|
"rewards/cosine_scaled_reward/mean": -0.22220639884471893, |
|
"rewards/cosine_scaled_reward/std": 0.19565363228321075, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1909.0, |
|
"completions/mean_length": 1226.546875, |
|
"completions/mean_terminated_length": 975.0816040039062, |
|
"completions/min_length": 443.0, |
|
"completions/min_terminated_length": 443.0, |
|
"epoch": 0.176, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15715065598487854, |
|
"learning_rate": 2.4310073797187573e-07, |
|
"loss": 0.0615, |
|
"num_tokens": 16264671.0, |
|
"reward": 0.6308701038360596, |
|
"reward_std": 0.6271623373031616, |
|
"rewards/cosine_scaled_reward/mean": -0.11425244808197021, |
|
"rewards/cosine_scaled_reward/std": 0.37054499983787537, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2018.0, |
|
"completions/mean_length": 995.125, |
|
"completions/mean_terminated_length": 822.8363647460938, |
|
"completions/min_length": 273.0, |
|
"completions/min_terminated_length": 273.0, |
|
"epoch": 0.17714285714285713, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13968248665332794, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": -0.0003, |
|
"num_tokens": 16338983.0, |
|
"reward": 0.7562404870986938, |
|
"reward_std": 0.70821213722229, |
|
"rewards/cosine_scaled_reward/mean": -0.08281721919775009, |
|
"rewards/cosine_scaled_reward/std": 0.44696903228759766, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1915.0, |
|
"completions/mean_length": 1327.6875, |
|
"completions/mean_terminated_length": 950.3809814453125, |
|
"completions/min_length": 284.0, |
|
"completions/min_terminated_length": 284.0, |
|
"epoch": 0.1782857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15466168522834778, |
|
"learning_rate": 2.3180194846605364e-07, |
|
"loss": 0.0102, |
|
"num_tokens": 16434059.0, |
|
"reward": 0.6187171936035156, |
|
"reward_std": 0.7333636283874512, |
|
"rewards/cosine_scaled_reward/mean": -0.026578888297080994, |
|
"rewards/cosine_scaled_reward/std": 0.49515098333358765, |
|
"rewards/format_reward/mean": 0.671875, |
|
"rewards/format_reward/std": 0.4732423722743988, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2021.0, |
|
"completions/mean_length": 1259.796875, |
|
"completions/mean_terminated_length": 1077.9039306640625, |
|
"completions/min_length": 471.0, |
|
"completions/min_terminated_length": 471.0, |
|
"epoch": 0.17942857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15207421779632568, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.002, |
|
"num_tokens": 16524646.0, |
|
"reward": 0.48799604177474976, |
|
"reward_std": 0.5923628211021423, |
|
"rewards/cosine_scaled_reward/mean": -0.18568949401378632, |
|
"rewards/cosine_scaled_reward/std": 0.28887510299682617, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1980.0, |
|
"completions/mean_length": 1055.890625, |
|
"completions/mean_terminated_length": 971.8135375976562, |
|
"completions/min_length": 132.0, |
|
"completions/min_terminated_length": 132.0, |
|
"epoch": 0.18057142857142858, |
|
"frac_reward_zero_std": 0.125, |
|
"grad_norm": 0.13212887942790985, |
|
"learning_rate": 2.2089083427137329e-07, |
|
"loss": 0.0164, |
|
"num_tokens": 16602343.0, |
|
"reward": 0.9118403196334839, |
|
"reward_std": 0.5433474779129028, |
|
"rewards/cosine_scaled_reward/mean": -0.03626735508441925, |
|
"rewards/cosine_scaled_reward/std": 0.5205101370811462, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2008.0, |
|
"completions/mean_length": 1232.875, |
|
"completions/mean_terminated_length": 1099.4908447265625, |
|
"completions/min_length": 426.0, |
|
"completions/min_terminated_length": 426.0, |
|
"epoch": 0.18171428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1433304101228714, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.066, |
|
"num_tokens": 16692927.0, |
|
"reward": 0.4464070796966553, |
|
"reward_std": 0.5299515128135681, |
|
"rewards/cosine_scaled_reward/mean": -0.22210896015167236, |
|
"rewards/cosine_scaled_reward/std": 0.27688807249069214, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1549.0, |
|
"completions/mean_length": 1081.25, |
|
"completions/mean_terminated_length": 923.0545043945312, |
|
"completions/min_length": 361.0, |
|
"completions/min_terminated_length": 361.0, |
|
"epoch": 0.18285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15510301291942596, |
|
"learning_rate": 2.1038068889975259e-07, |
|
"loss": 0.071, |
|
"num_tokens": 16773711.0, |
|
"reward": 0.8579483032226562, |
|
"reward_std": 0.7331453561782837, |
|
"rewards/cosine_scaled_reward/mean": -0.024150855839252472, |
|
"rewards/cosine_scaled_reward/std": 0.43525949120521545, |
|
"rewards/format_reward/mean": 0.90625, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1896.0, |
|
"completions/mean_length": 1213.984375, |
|
"completions/mean_terminated_length": 1059.5369873046875, |
|
"completions/min_length": 421.0, |
|
"completions/min_terminated_length": 421.0, |
|
"epoch": 0.184, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1579800397157669, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0563, |
|
"num_tokens": 16861398.0, |
|
"reward": 0.790129542350769, |
|
"reward_std": 0.8190513849258423, |
|
"rewards/cosine_scaled_reward/mean": -0.04243520647287369, |
|
"rewards/cosine_scaled_reward/std": 0.4257972538471222, |
|
"rewards/format_reward/mean": 0.875, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1871.0, |
|
"completions/mean_length": 1173.828125, |
|
"completions/mean_terminated_length": 972.09619140625, |
|
"completions/min_length": 513.0, |
|
"completions/min_terminated_length": 513.0, |
|
"epoch": 0.18514285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14933519065380096, |
|
"learning_rate": 2.0028431734436308e-07, |
|
"loss": 0.0335, |
|
"num_tokens": 16946827.0, |
|
"reward": 0.6066349744796753, |
|
"reward_std": 0.807995080947876, |
|
"rewards/cosine_scaled_reward/mean": -0.11855749785900116, |
|
"rewards/cosine_scaled_reward/std": 0.40160706639289856, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1776.0, |
|
"completions/mean_length": 1202.625, |
|
"completions/mean_terminated_length": 920.8333740234375, |
|
"completions/min_length": 410.0, |
|
"completions/min_terminated_length": 410.0, |
|
"epoch": 0.18628571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17073573172092438, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.008, |
|
"num_tokens": 17034979.0, |
|
"reward": 1.164199709892273, |
|
"reward_std": 0.6732690930366516, |
|
"rewards/cosine_scaled_reward/mean": 0.22272484004497528, |
|
"rewards/cosine_scaled_reward/std": 0.5151689648628235, |
|
"rewards/format_reward/mean": 0.71875, |
|
"rewards/format_reward/std": 0.4531635046005249, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2032.0, |
|
"completions/mean_length": 1055.265625, |
|
"completions/mean_terminated_length": 933.3508911132812, |
|
"completions/min_length": 278.0, |
|
"completions/min_terminated_length": 278.0, |
|
"epoch": 0.18742857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14955352246761322, |
|
"learning_rate": 1.9061402047871833e-07, |
|
"loss": 0.0617, |
|
"num_tokens": 17113044.0, |
|
"reward": 1.0745567083358765, |
|
"reward_std": 0.44688692688941956, |
|
"rewards/cosine_scaled_reward/mean": 0.07634085416793823, |
|
"rewards/cosine_scaled_reward/std": 0.45942261815071106, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.09375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1950.0, |
|
"completions/mean_length": 1082.15625, |
|
"completions/mean_terminated_length": 982.2413940429688, |
|
"completions/min_length": 425.0, |
|
"completions/min_terminated_length": 425.0, |
|
"epoch": 0.18857142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15888644754886627, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0088, |
|
"num_tokens": 17193718.0, |
|
"reward": 0.9942861199378967, |
|
"reward_std": 0.6299077272415161, |
|
"rewards/cosine_scaled_reward/mean": 0.02058056741952896, |
|
"rewards/cosine_scaled_reward/std": 0.46080252528190613, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1826.0, |
|
"completions/mean_length": 1271.890625, |
|
"completions/mean_terminated_length": 1013.1875, |
|
"completions/min_length": 363.0, |
|
"completions/min_terminated_length": 363.0, |
|
"epoch": 0.18971428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14011530578136444, |
|
"learning_rate": 1.8138158006995363e-07, |
|
"loss": 0.0285, |
|
"num_tokens": 17286695.0, |
|
"reward": 0.5431326627731323, |
|
"reward_std": 0.6457577347755432, |
|
"rewards/cosine_scaled_reward/mean": -0.13468365371227264, |
|
"rewards/cosine_scaled_reward/std": 0.3553418219089508, |
|
"rewards/format_reward/mean": 0.8125, |
|
"rewards/format_reward/std": 0.39339789748191833, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.09375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1882.0, |
|
"completions/mean_length": 1100.046875, |
|
"completions/mean_terminated_length": 1001.9827270507812, |
|
"completions/min_length": 476.0, |
|
"completions/min_terminated_length": 476.0, |
|
"epoch": 0.19085714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14937180280685425, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0807, |
|
"num_tokens": 17368642.0, |
|
"reward": 0.6264936923980713, |
|
"reward_std": 0.5748982429504395, |
|
"rewards/cosine_scaled_reward/mean": -0.14769065380096436, |
|
"rewards/cosine_scaled_reward/std": 0.2645467221736908, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1992.0, |
|
"completions/mean_length": 1130.84375, |
|
"completions/mean_terminated_length": 1116.2857666015625, |
|
"completions/min_length": 437.0, |
|
"completions/min_terminated_length": 437.0, |
|
"epoch": 0.192, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13785859942436218, |
|
"learning_rate": 1.7259824442455923e-07, |
|
"loss": -0.0247, |
|
"num_tokens": 17451856.0, |
|
"reward": 1.0183875560760498, |
|
"reward_std": 0.7866266965866089, |
|
"rewards/cosine_scaled_reward/mean": 0.017006313428282738, |
|
"rewards/cosine_scaled_reward/std": 0.48554277420043945, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.03125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1877.0, |
|
"completions/mean_length": 963.734375, |
|
"completions/mean_terminated_length": 928.758056640625, |
|
"completions/min_length": 498.0, |
|
"completions/min_terminated_length": 498.0, |
|
"epoch": 0.19314285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.132929727435112, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0452, |
|
"num_tokens": 17524159.0, |
|
"reward": 1.5141942501068115, |
|
"reward_std": 0.7578620910644531, |
|
"rewards/cosine_scaled_reward/mean": 0.26490968465805054, |
|
"rewards/cosine_scaled_reward/std": 0.53211909532547, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1697.0, |
|
"completions/mean_length": 1058.453125, |
|
"completions/mean_terminated_length": 781.3800048828125, |
|
"completions/min_length": 337.0, |
|
"completions/min_terminated_length": 337.0, |
|
"epoch": 0.19428571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14723296463489532, |
|
"learning_rate": 1.6427471468404952e-07, |
|
"loss": 0.0659, |
|
"num_tokens": 17601684.0, |
|
"reward": 0.8584200739860535, |
|
"reward_std": 0.4904913902282715, |
|
"rewards/cosine_scaled_reward/mean": 0.007335059344768524, |
|
"rewards/cosine_scaled_reward/std": 0.44158241152763367, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1732.0, |
|
"completions/mean_length": 1232.28125, |
|
"completions/mean_terminated_length": 960.375, |
|
"completions/min_length": 414.0, |
|
"completions/min_terminated_length": 414.0, |
|
"epoch": 0.19542857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16656361520290375, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": -0.0264, |
|
"num_tokens": 17690942.0, |
|
"reward": 0.6898657083511353, |
|
"reward_std": 0.6278946399688721, |
|
"rewards/cosine_scaled_reward/mean": -0.030067168176174164, |
|
"rewards/cosine_scaled_reward/std": 0.45971429347991943, |
|
"rewards/format_reward/mean": 0.75, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1805.0, |
|
"completions/mean_length": 1040.625, |
|
"completions/mean_terminated_length": 875.7817993164062, |
|
"completions/min_length": 162.0, |
|
"completions/min_terminated_length": 162.0, |
|
"epoch": 0.19657142857142856, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15691731870174408, |
|
"learning_rate": 1.5642113178727193e-07, |
|
"loss": 0.0625, |
|
"num_tokens": 17768158.0, |
|
"reward": 1.2213534116744995, |
|
"reward_std": 0.6515992879867554, |
|
"rewards/cosine_scaled_reward/mean": 0.17317672073841095, |
|
"rewards/cosine_scaled_reward/std": 0.5265737771987915, |
|
"rewards/format_reward/mean": 0.875, |
|
"rewards/format_reward/std": 0.3333333432674408, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1873.0, |
|
"completions/mean_length": 899.28125, |
|
"completions/mean_terminated_length": 758.2105102539062, |
|
"completions/min_length": 292.0, |
|
"completions/min_terminated_length": 292.0, |
|
"epoch": 0.1977142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1264735609292984, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0532, |
|
"num_tokens": 17837024.0, |
|
"reward": 0.7364885210990906, |
|
"reward_std": 0.6678578853607178, |
|
"rewards/cosine_scaled_reward/mean": -0.0848807543516159, |
|
"rewards/cosine_scaled_reward/std": 0.4483066201210022, |
|
"rewards/format_reward/mean": 0.90625, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 1777.0, |
|
"completions/max_terminated_length": 1777.0, |
|
"completions/mean_length": 953.328125, |
|
"completions/mean_terminated_length": 953.328125, |
|
"completions/min_length": 508.0, |
|
"completions/min_terminated_length": 508.0, |
|
"epoch": 0.19885714285714284, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13611741364002228, |
|
"learning_rate": 1.4904706411523448e-07, |
|
"loss": 0.0037, |
|
"num_tokens": 17908373.0, |
|
"reward": 0.9751720428466797, |
|
"reward_std": 0.5935230255126953, |
|
"rewards/cosine_scaled_reward/mean": -0.012413978576660156, |
|
"rewards/cosine_scaled_reward/std": 0.4495556354522705, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1927.0, |
|
"completions/mean_length": 1108.25, |
|
"completions/mean_terminated_length": 974.0000610351562, |
|
"completions/min_length": 390.0, |
|
"completions/min_terminated_length": 390.0, |
|
"epoch": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15151762962341309, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0591, |
|
"num_tokens": 17990125.0, |
|
"reward": 0.881943941116333, |
|
"reward_std": 0.575822114944458, |
|
"rewards/cosine_scaled_reward/mean": -0.0121530219912529, |
|
"rewards/cosine_scaled_reward/std": 0.49256107211112976, |
|
"rewards/format_reward/mean": 0.90625, |
|
"rewards/format_reward/std": 0.29378482699394226, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1940.0, |
|
"completions/mean_length": 1168.84375, |
|
"completions/mean_terminated_length": 986.3773803710938, |
|
"completions/min_length": 257.0, |
|
"completions/min_terminated_length": 257.0, |
|
"epoch": 0.20114285714285715, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15197090804576874, |
|
"learning_rate": 1.4216149583350755e-07, |
|
"loss": 0.0193, |
|
"num_tokens": 18076099.0, |
|
"reward": 0.5906968712806702, |
|
"reward_std": 0.5817879438400269, |
|
"rewards/cosine_scaled_reward/mean": -0.12652656435966492, |
|
"rewards/cosine_scaled_reward/std": 0.3300129473209381, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1916.0, |
|
"completions/mean_length": 1038.0625, |
|
"completions/mean_terminated_length": 970.7333984375, |
|
"completions/min_length": 390.0, |
|
"completions/min_terminated_length": 390.0, |
|
"epoch": 0.2022857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15584589540958405, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0401, |
|
"num_tokens": 18152759.0, |
|
"reward": 1.083601474761963, |
|
"reward_std": 0.8219331502914429, |
|
"rewards/cosine_scaled_reward/mean": 0.08086325228214264, |
|
"rewards/cosine_scaled_reward/std": 0.47295841574668884, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1839.0, |
|
"completions/mean_length": 1048.5625, |
|
"completions/mean_terminated_length": 905.7857666015625, |
|
"completions/min_length": 300.0, |
|
"completions/min_terminated_length": 300.0, |
|
"epoch": 0.20342857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14128631353378296, |
|
"learning_rate": 1.3577281594640182e-07, |
|
"loss": 0.0298, |
|
"num_tokens": 18231403.0, |
|
"reward": 0.9733308553695679, |
|
"reward_std": 0.6629190444946289, |
|
"rewards/cosine_scaled_reward/mean": 0.02572791464626789, |
|
"rewards/cosine_scaled_reward/std": 0.47114452719688416, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.265625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1927.0, |
|
"completions/mean_length": 1293.578125, |
|
"completions/mean_terminated_length": 1020.7020874023438, |
|
"completions/min_length": 245.0, |
|
"completions/min_terminated_length": 245.0, |
|
"epoch": 0.20457142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16287265717983246, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0196, |
|
"num_tokens": 18325024.0, |
|
"reward": 0.5872488617897034, |
|
"reward_std": 0.6428846120834351, |
|
"rewards/cosine_scaled_reward/mean": -0.08137557655572891, |
|
"rewards/cosine_scaled_reward/std": 0.3453543484210968, |
|
"rewards/format_reward/mean": 0.75, |
|
"rewards/format_reward/std": 0.4364357888698578, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.09375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1951.0, |
|
"completions/mean_length": 1036.78125, |
|
"completions/mean_terminated_length": 932.1724243164062, |
|
"completions/min_length": 262.0, |
|
"completions/min_terminated_length": 262.0, |
|
"epoch": 0.2057142857142857, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14584973454475403, |
|
"learning_rate": 1.2988880807625927e-07, |
|
"loss": 0.0066, |
|
"num_tokens": 18402554.0, |
|
"reward": 1.347097396850586, |
|
"reward_std": 0.8030112385749817, |
|
"rewards/cosine_scaled_reward/mean": 0.19698619842529297, |
|
"rewards/cosine_scaled_reward/std": 0.48687708377838135, |
|
"rewards/format_reward/mean": 0.953125, |
|
"rewards/format_reward/std": 0.21304203569889069, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1869.0, |
|
"completions/mean_length": 1165.484375, |
|
"completions/mean_terminated_length": 940.5294189453125, |
|
"completions/min_length": 442.0, |
|
"completions/min_terminated_length": 442.0, |
|
"epoch": 0.20685714285714285, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1534472554922104, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0795, |
|
"num_tokens": 18488617.0, |
|
"reward": 0.6842443346977234, |
|
"reward_std": 0.6290575265884399, |
|
"rewards/cosine_scaled_reward/mean": -0.0563153512775898, |
|
"rewards/cosine_scaled_reward/std": 0.5009898543357849, |
|
"rewards/format_reward/mean": 0.796875, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.265625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2037.0, |
|
"completions/mean_length": 1240.1875, |
|
"completions/mean_terminated_length": 948.0, |
|
"completions/min_length": 264.0, |
|
"completions/min_terminated_length": 264.0, |
|
"epoch": 0.208, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13502204418182373, |
|
"learning_rate": 1.2451664098030743e-07, |
|
"loss": 0.0042, |
|
"num_tokens": 18577781.0, |
|
"reward": 0.5206961631774902, |
|
"reward_std": 0.6657352447509766, |
|
"rewards/cosine_scaled_reward/mean": -0.1380893886089325, |
|
"rewards/cosine_scaled_reward/std": 0.3631601333618164, |
|
"rewards/format_reward/mean": 0.796875, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1898.0, |
|
"completions/mean_length": 1029.140625, |
|
"completions/mean_terminated_length": 942.796630859375, |
|
"completions/min_length": 459.0, |
|
"completions/min_terminated_length": 459.0, |
|
"epoch": 0.20914285714285713, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12827463448047638, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0038, |
|
"num_tokens": 18654262.0, |
|
"reward": 1.1728923320770264, |
|
"reward_std": 0.6444723010063171, |
|
"rewards/cosine_scaled_reward/mean": 0.08644616603851318, |
|
"rewards/cosine_scaled_reward/std": 0.49451789259910583, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2041.0, |
|
"completions/mean_length": 1162.984375, |
|
"completions/mean_terminated_length": 979.3018798828125, |
|
"completions/min_length": 100.0, |
|
"completions/min_terminated_length": 100.0, |
|
"epoch": 0.2102857142857143, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1496909260749817, |
|
"learning_rate": 1.1966285981663407e-07, |
|
"loss": 0.0474, |
|
"num_tokens": 18740045.0, |
|
"reward": 0.738210916519165, |
|
"reward_std": 0.540239155292511, |
|
"rewards/cosine_scaled_reward/mean": -0.07620704174041748, |
|
"rewards/cosine_scaled_reward/std": 0.37467995285987854, |
|
"rewards/format_reward/mean": 0.890625, |
|
"rewards/format_reward/std": 0.3145764470100403, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1991.0, |
|
"completions/mean_length": 1057.15625, |
|
"completions/mean_terminated_length": 991.1000366210938, |
|
"completions/min_length": 290.0, |
|
"completions/min_terminated_length": 290.0, |
|
"epoch": 0.21142857142857144, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13016612827777863, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0175, |
|
"num_tokens": 18817887.0, |
|
"reward": 0.5949590802192688, |
|
"reward_std": 0.6293296813964844, |
|
"rewards/cosine_scaled_reward/mean": -0.1868954598903656, |
|
"rewards/cosine_scaled_reward/std": 0.4017287492752075, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1410.0, |
|
"completions/mean_length": 1108.890625, |
|
"completions/mean_terminated_length": 974.732177734375, |
|
"completions/min_length": 354.0, |
|
"completions/min_terminated_length": 354.0, |
|
"epoch": 0.21257142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14262138307094574, |
|
"learning_rate": 1.1533337816991931e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 18899552.0, |
|
"reward": 0.6897875070571899, |
|
"reward_std": 0.5968158841133118, |
|
"rewards/cosine_scaled_reward/mean": -0.08479373157024384, |
|
"rewards/cosine_scaled_reward/std": 0.4098339378833771, |
|
"rewards/format_reward/mean": 0.859375, |
|
"rewards/format_reward/std": 0.3503824472427368, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2022.0, |
|
"completions/mean_length": 1067.40625, |
|
"completions/mean_terminated_length": 1002.0333862304688, |
|
"completions/min_length": 408.0, |
|
"completions/min_terminated_length": 408.0, |
|
"epoch": 0.21371428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16053920984268188, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0076, |
|
"num_tokens": 18978290.0, |
|
"reward": 0.7425481677055359, |
|
"reward_std": 0.5081203579902649, |
|
"rewards/cosine_scaled_reward/mean": -0.12091340124607086, |
|
"rewards/cosine_scaled_reward/std": 0.43119898438453674, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1906.0, |
|
"completions/mean_length": 1119.453125, |
|
"completions/mean_terminated_length": 1005.4210815429688, |
|
"completions/min_length": 563.0, |
|
"completions/min_terminated_length": 563.0, |
|
"epoch": 0.21485714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1582225263118744, |
|
"learning_rate": 1.1153347084664419e-07, |
|
"loss": 0.0305, |
|
"num_tokens": 19061735.0, |
|
"reward": 0.5219712257385254, |
|
"reward_std": 0.5593596696853638, |
|
"rewards/cosine_scaled_reward/mean": -0.1999519169330597, |
|
"rewards/cosine_scaled_reward/std": 0.32119491696357727, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1796.0, |
|
"completions/mean_length": 872.5625, |
|
"completions/mean_terminated_length": 814.7540283203125, |
|
"completions/min_length": 287.0, |
|
"completions/min_terminated_length": 287.0, |
|
"epoch": 0.216, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13409367203712463, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0456, |
|
"num_tokens": 19126867.0, |
|
"reward": 0.7454105615615845, |
|
"reward_std": 0.605484127998352, |
|
"rewards/cosine_scaled_reward/mean": -0.11166971176862717, |
|
"rewards/cosine_scaled_reward/std": 0.4444236159324646, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2046.0, |
|
"completions/mean_length": 1080.25, |
|
"completions/mean_terminated_length": 998.2373046875, |
|
"completions/min_length": 423.0, |
|
"completions/min_terminated_length": 423.0, |
|
"epoch": 0.21714285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12813109159469604, |
|
"learning_rate": 1.0826776744855121e-07, |
|
"loss": -0.0287, |
|
"num_tokens": 19205771.0, |
|
"reward": 1.0522401332855225, |
|
"reward_std": 0.5290870070457458, |
|
"rewards/cosine_scaled_reward/mean": 0.026120096445083618, |
|
"rewards/cosine_scaled_reward/std": 0.4774343967437744, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.046875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1906.0, |
|
"completions/mean_length": 952.4375, |
|
"completions/mean_terminated_length": 898.5573120117188, |
|
"completions/min_length": 284.0, |
|
"completions/min_terminated_length": 284.0, |
|
"epoch": 0.21828571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13046617805957794, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0316, |
|
"num_tokens": 19277015.0, |
|
"reward": 1.01558518409729, |
|
"reward_std": 0.6485674381256104, |
|
"rewards/cosine_scaled_reward/mean": 0.023417577147483826, |
|
"rewards/cosine_scaled_reward/std": 0.4800501763820648, |
|
"rewards/format_reward/mean": 0.96875, |
|
"rewards/format_reward/std": 0.17536810040473938, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.328125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1859.0, |
|
"completions/mean_length": 1371.515625, |
|
"completions/mean_terminated_length": 1041.1395263671875, |
|
"completions/min_length": 382.0, |
|
"completions/min_terminated_length": 382.0, |
|
"epoch": 0.21942857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.14968900382518768, |
|
"learning_rate": 1.0554024673218806e-07, |
|
"loss": 0.0953, |
|
"num_tokens": 19376088.0, |
|
"reward": 0.3939949572086334, |
|
"reward_std": 0.577399730682373, |
|
"rewards/cosine_scaled_reward/mean": -0.19362753629684448, |
|
"rewards/cosine_scaled_reward/std": 0.30269211530685425, |
|
"rewards/format_reward/mean": 0.78125, |
|
"rewards/format_reward/std": 0.4166666865348816, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1859.0, |
|
"completions/mean_length": 1186.921875, |
|
"completions/mean_terminated_length": 945.8200073242188, |
|
"completions/min_length": 493.0, |
|
"completions/min_terminated_length": 493.0, |
|
"epoch": 0.22057142857142858, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16263115406036377, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0846, |
|
"num_tokens": 19463195.0, |
|
"reward": 0.6804449558258057, |
|
"reward_std": 0.794600248336792, |
|
"rewards/cosine_scaled_reward/mean": -0.058215029537677765, |
|
"rewards/cosine_scaled_reward/std": 0.45185160636901855, |
|
"rewards/format_reward/mean": 0.796875, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1870.0, |
|
"completions/mean_length": 1393.921875, |
|
"completions/mean_terminated_length": 1051.3095703125, |
|
"completions/min_length": 483.0, |
|
"completions/min_terminated_length": 483.0, |
|
"epoch": 0.22171428571428572, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1629265695810318, |
|
"learning_rate": 1.0335423176140511e-07, |
|
"loss": -0.0049, |
|
"num_tokens": 19563766.0, |
|
"reward": 0.7986553907394409, |
|
"reward_std": 0.874267578125, |
|
"rewards/cosine_scaled_reward/mean": 0.03214021399617195, |
|
"rewards/cosine_scaled_reward/std": 0.47694674134254456, |
|
"rewards/format_reward/mean": 0.734375, |
|
"rewards/format_reward/std": 0.44515693187713623, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.078125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1993.0, |
|
"completions/mean_length": 1115.015625, |
|
"completions/mean_terminated_length": 1035.9490966796875, |
|
"completions/min_length": 458.0, |
|
"completions/min_terminated_length": 458.0, |
|
"epoch": 0.22285714285714286, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.139028862118721, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": -0.0035, |
|
"num_tokens": 19646271.0, |
|
"reward": 0.7042949795722961, |
|
"reward_std": 0.5829262733459473, |
|
"rewards/cosine_scaled_reward/mean": -0.10879002511501312, |
|
"rewards/cosine_scaled_reward/std": 0.38450202345848083, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1984.0, |
|
"completions/mean_length": 1338.078125, |
|
"completions/mean_terminated_length": 1139.2999267578125, |
|
"completions/min_length": 390.0, |
|
"completions/min_terminated_length": 390.0, |
|
"epoch": 0.224, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.17117220163345337, |
|
"learning_rate": 1.017123858587145e-07, |
|
"loss": 0.0298, |
|
"num_tokens": 19743500.0, |
|
"reward": 0.3932352066040039, |
|
"reward_std": 0.6573115587234497, |
|
"rewards/cosine_scaled_reward/mean": -0.20181991159915924, |
|
"rewards/cosine_scaled_reward/std": 0.3404424488544464, |
|
"rewards/format_reward/mean": 0.796875, |
|
"rewards/format_reward/std": 0.40550529956817627, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1562.0, |
|
"completions/mean_length": 872.078125, |
|
"completions/mean_terminated_length": 853.4127197265625, |
|
"completions/min_length": 416.0, |
|
"completions/min_terminated_length": 416.0, |
|
"epoch": 0.22514285714285714, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12287131696939468, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": -0.0059, |
|
"num_tokens": 19809681.0, |
|
"reward": 1.2395715713500977, |
|
"reward_std": 0.6934706568717957, |
|
"rewards/cosine_scaled_reward/mean": 0.11978581547737122, |
|
"rewards/cosine_scaled_reward/std": 0.5448962450027466, |
|
"rewards/format_reward/mean": 1.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2024.0, |
|
"completions/mean_length": 1120.28125, |
|
"completions/mean_terminated_length": 987.7500610351562, |
|
"completions/min_length": 320.0, |
|
"completions/min_terminated_length": 320.0, |
|
"epoch": 0.22628571428571428, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15039725601673126, |
|
"learning_rate": 1.0061670936044178e-07, |
|
"loss": 0.0362, |
|
"num_tokens": 19892883.0, |
|
"reward": 1.0277272462844849, |
|
"reward_std": 0.74528968334198, |
|
"rewards/cosine_scaled_reward/mean": 0.021676115691661835, |
|
"rewards/cosine_scaled_reward/std": 0.5368949174880981, |
|
"rewards/format_reward/mean": 0.984375, |
|
"rewards/format_reward/std": 0.125, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1958.0, |
|
"completions/mean_length": 1294.71875, |
|
"completions/mean_terminated_length": 1187.107177734375, |
|
"completions/min_length": 577.0, |
|
"completions/min_terminated_length": 577.0, |
|
"epoch": 0.22742857142857142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15263773500919342, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0084, |
|
"num_tokens": 19987249.0, |
|
"reward": 0.6131043434143066, |
|
"reward_std": 0.7018917798995972, |
|
"rewards/cosine_scaled_reward/mean": -0.1543852984905243, |
|
"rewards/cosine_scaled_reward/std": 0.35418131947517395, |
|
"rewards/format_reward/mean": 0.921875, |
|
"rewards/format_reward/std": 0.27048972249031067, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 1991.0, |
|
"completions/mean_length": 1115.796875, |
|
"completions/mean_terminated_length": 922.3207397460938, |
|
"completions/min_length": 509.0, |
|
"completions/min_terminated_length": 509.0, |
|
"epoch": 0.22857142857142856, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16787739098072052, |
|
"learning_rate": 1.0006853717962393e-07, |
|
"loss": 0.0407, |
|
"num_tokens": 20068780.0, |
|
"reward": 0.9602231979370117, |
|
"reward_std": 0.8039394617080688, |
|
"rewards/cosine_scaled_reward/mean": 0.05823659524321556, |
|
"rewards/cosine_scaled_reward/std": 0.5022075772285461, |
|
"rewards/format_reward/mean": 0.84375, |
|
"rewards/format_reward/std": 0.36596253514289856, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"step": 200, |
|
"total_flos": 0.0, |
|
"train_loss": 0.03711814505979419, |
|
"train_runtime": 10340.5912, |
|
"train_samples_per_second": 1.238, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 20068780, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|