|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.997867803837953, |
|
"eval_steps": 100, |
|
"global_step": 117, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.259765625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1016.0, |
|
"completions/mean_length": 622.638671875, |
|
"completions/mean_terminated_length": 481.79156494140625, |
|
"completions/min_length": 6.0, |
|
"completions/min_terminated_length": 6.0, |
|
"epoch": 0.008528784648187633, |
|
"grad_norm": 0.8145178930391833, |
|
"kl": 0.00037097930908203125, |
|
"learning_rate": 0.0, |
|
"loss": 0.0196, |
|
"num_tokens": 392023.0, |
|
"reward": 0.634765625, |
|
"reward_std": 0.33141425251960754, |
|
"rewards/accuracy_reward/mean": 0.634765625, |
|
"rewards/accuracy_reward/std": 0.4819667339324951, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2314453125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.75, |
|
"completions/mean_length": 595.36865234375, |
|
"completions/mean_terminated_length": 466.6201705932617, |
|
"completions/min_length": 7.25, |
|
"completions/min_terminated_length": 7.25, |
|
"epoch": 0.042643923240938165, |
|
"grad_norm": 0.4773668247144155, |
|
"kl": 0.00041562318801879883, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0477, |
|
"num_tokens": 1910946.0, |
|
"reward": 0.64501953125, |
|
"reward_std": 0.37129127234220505, |
|
"rewards/accuracy_reward/mean": 0.64501953125, |
|
"rewards/accuracy_reward/std": 0.47847116738557816, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.2, |
|
"completions/mean_length": 618.060546875, |
|
"completions/mean_terminated_length": 491.431787109375, |
|
"completions/min_length": 10.8, |
|
"completions/min_terminated_length": 10.8, |
|
"epoch": 0.08528784648187633, |
|
"grad_norm": 0.3298661973983867, |
|
"kl": 0.0007388591766357422, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.0181, |
|
"num_tokens": 3860773.0, |
|
"reward": 0.6421875, |
|
"reward_std": 0.33047744631767273, |
|
"rewards/accuracy_reward/mean": 0.6421875, |
|
"rewards/accuracy_reward/std": 0.4783483386039734, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1017.6, |
|
"completions/mean_length": 603.215625, |
|
"completions/mean_terminated_length": 495.9770202636719, |
|
"completions/min_length": 45.2, |
|
"completions/min_terminated_length": 45.2, |
|
"epoch": 0.1279317697228145, |
|
"grad_norm": 0.25911245136387523, |
|
"kl": 0.0020538330078125, |
|
"learning_rate": 2.9973151946516025e-06, |
|
"loss": 0.0687, |
|
"num_tokens": 5749941.0, |
|
"reward": 0.734375, |
|
"reward_std": 0.2392146944999695, |
|
"rewards/accuracy_reward/mean": 0.734375, |
|
"rewards/accuracy_reward/std": 0.44130025506019593, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.15078125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1018.2, |
|
"completions/mean_length": 585.84765625, |
|
"completions/mean_terminated_length": 508.5999755859375, |
|
"completions/min_length": 148.8, |
|
"completions/min_terminated_length": 148.8, |
|
"epoch": 0.17057569296375266, |
|
"grad_norm": 0.19464089548857677, |
|
"kl": 0.0031925201416015624, |
|
"learning_rate": 2.9672214011007086e-06, |
|
"loss": 0.0434, |
|
"num_tokens": 7604799.0, |
|
"reward": 0.76953125, |
|
"reward_std": 0.181648451089859, |
|
"rewards/accuracy_reward/mean": 0.76953125, |
|
"rewards/accuracy_reward/std": 0.41696075201034544, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1265625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1015.4, |
|
"completions/mean_length": 569.831640625, |
|
"completions/mean_terminated_length": 505.2327087402344, |
|
"completions/min_length": 126.2, |
|
"completions/min_terminated_length": 126.2, |
|
"epoch": 0.21321961620469082, |
|
"grad_norm": 0.11280922307838523, |
|
"kl": 0.003424835205078125, |
|
"learning_rate": 2.904352305959606e-06, |
|
"loss": 0.0414, |
|
"num_tokens": 9406560.0, |
|
"reward": 0.800390625, |
|
"reward_std": 0.1668152093887329, |
|
"rewards/accuracy_reward/mean": 0.800390625, |
|
"rewards/accuracy_reward/std": 0.3962217092514038, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.123828125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.4, |
|
"completions/mean_length": 572.0875, |
|
"completions/mean_terminated_length": 508.3062744140625, |
|
"completions/min_length": 119.0, |
|
"completions/min_terminated_length": 119.0, |
|
"epoch": 0.255863539445629, |
|
"grad_norm": 0.17292784666143843, |
|
"kl": 0.0038557052612304688, |
|
"learning_rate": 2.8101123009256945e-06, |
|
"loss": 0.0444, |
|
"num_tokens": 11235280.0, |
|
"reward": 0.779296875, |
|
"reward_std": 0.18240008652210235, |
|
"rewards/accuracy_reward/mean": 0.779296875, |
|
"rewards/accuracy_reward/std": 0.4136933445930481, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.10546875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1020.2, |
|
"completions/mean_length": 563.6625, |
|
"completions/mean_terminated_length": 509.4048156738281, |
|
"completions/min_length": 91.2, |
|
"completions/min_terminated_length": 91.2, |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.1417198452884755, |
|
"kl": 0.004206085205078125, |
|
"learning_rate": 2.6866065519845123e-06, |
|
"loss": 0.0393, |
|
"num_tokens": 13029192.0, |
|
"reward": 0.796875, |
|
"reward_std": 0.1687217354774475, |
|
"rewards/accuracy_reward/mean": 0.796875, |
|
"rewards/accuracy_reward/std": 0.40032246708869934, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.11015625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.8, |
|
"completions/mean_length": 555.61328125, |
|
"completions/mean_terminated_length": 498.0840698242188, |
|
"completions/min_length": 122.6, |
|
"completions/min_terminated_length": 122.6, |
|
"epoch": 0.3411513859275053, |
|
"grad_norm": 0.2464658893666007, |
|
"kl": 0.003921127319335938, |
|
"learning_rate": 2.5365939734802974e-06, |
|
"loss": 0.0384, |
|
"num_tokens": 14806018.0, |
|
"reward": 0.796484375, |
|
"reward_std": 0.16222528517246246, |
|
"rewards/accuracy_reward/mean": 0.796484375, |
|
"rewards/accuracy_reward/std": 0.40181103348731995, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.121875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.4, |
|
"completions/mean_length": 568.50703125, |
|
"completions/mean_terminated_length": 505.3303649902344, |
|
"completions/min_length": 146.8, |
|
"completions/min_terminated_length": 146.8, |
|
"epoch": 0.3837953091684435, |
|
"grad_norm": 0.229924275356646, |
|
"kl": 0.004242706298828125, |
|
"learning_rate": 2.3634255985285104e-06, |
|
"loss": 0.0377, |
|
"num_tokens": 16620316.0, |
|
"reward": 0.81171875, |
|
"reward_std": 0.16615791916847228, |
|
"rewards/accuracy_reward/mean": 0.81171875, |
|
"rewards/accuracy_reward/std": 0.39103145599365235, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.122265625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1014.2, |
|
"completions/mean_length": 565.183984375, |
|
"completions/mean_terminated_length": 501.5477783203125, |
|
"completions/min_length": 119.6, |
|
"completions/min_terminated_length": 119.6, |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 0.2797579550491506, |
|
"kl": 0.00439910888671875, |
|
"learning_rate": 2.1709697224734488e-06, |
|
"loss": 0.0355, |
|
"num_tokens": 18421059.0, |
|
"reward": 0.766015625, |
|
"reward_std": 0.1700347661972046, |
|
"rewards/accuracy_reward/mean": 0.766015625, |
|
"rewards/accuracy_reward/std": 0.41948946118354796, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.111328125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1014.0, |
|
"completions/mean_length": 558.638671875, |
|
"completions/mean_terminated_length": 500.31190795898436, |
|
"completions/min_length": 114.4, |
|
"completions/min_terminated_length": 114.4, |
|
"epoch": 0.4690831556503198, |
|
"grad_norm": 0.2632961965405764, |
|
"kl": 0.0050487518310546875, |
|
"learning_rate": 1.963525491562421e-06, |
|
"loss": 0.0222, |
|
"num_tokens": 20196126.0, |
|
"reward": 0.7828125, |
|
"reward_std": 0.1633252739906311, |
|
"rewards/accuracy_reward/mean": 0.7828125, |
|
"rewards/accuracy_reward/std": 0.41108490228652955, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.117578125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1020.4, |
|
"completions/mean_length": 581.921875, |
|
"completions/mean_terminated_length": 522.9397216796875, |
|
"completions/min_length": 131.6, |
|
"completions/min_terminated_length": 131.6, |
|
"epoch": 0.511727078891258, |
|
"grad_norm": 0.0965979552879068, |
|
"kl": 0.004494857788085937, |
|
"learning_rate": 1.7457268671227065e-06, |
|
"loss": 0.0396, |
|
"num_tokens": 22047438.0, |
|
"reward": 0.77734375, |
|
"reward_std": 0.16624618470668792, |
|
"rewards/accuracy_reward/mean": 0.77734375, |
|
"rewards/accuracy_reward/std": 0.41559609174728396, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.116015625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1014.8, |
|
"completions/mean_length": 552.442578125, |
|
"completions/mean_terminated_length": 490.71544189453124, |
|
"completions/min_length": 101.6, |
|
"completions/min_terminated_length": 101.6, |
|
"epoch": 0.5543710021321961, |
|
"grad_norm": 0.17017618787674732, |
|
"kl": 0.004914474487304687, |
|
"learning_rate": 1.5224391105228955e-06, |
|
"loss": 0.0323, |
|
"num_tokens": 23838739.0, |
|
"reward": 0.774609375, |
|
"reward_std": 0.1651999369263649, |
|
"rewards/accuracy_reward/mean": 0.774609375, |
|
"rewards/accuracy_reward/std": 0.4162830650806427, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.122265625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1019.2, |
|
"completions/mean_length": 572.6484375, |
|
"completions/mean_terminated_length": 510.603759765625, |
|
"completions/min_length": 98.0, |
|
"completions/min_terminated_length": 98.0, |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.12570880852383362, |
|
"kl": 0.004351425170898438, |
|
"learning_rate": 1.2986501012735172e-06, |
|
"loss": 0.0361, |
|
"num_tokens": 25673351.0, |
|
"reward": 0.75234375, |
|
"reward_std": 0.18619680404663086, |
|
"rewards/accuracy_reward/mean": 0.75234375, |
|
"rewards/accuracy_reward/std": 0.42559565901756286, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.105078125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1013.8, |
|
"completions/mean_length": 539.875, |
|
"completions/mean_terminated_length": 483.08538818359375, |
|
"completions/min_length": 105.2, |
|
"completions/min_terminated_length": 105.2, |
|
"epoch": 0.6396588486140725, |
|
"grad_norm": 0.19371389016243035, |
|
"kl": 0.005275344848632813, |
|
"learning_rate": 1.079358916040996e-06, |
|
"loss": 0.0372, |
|
"num_tokens": 27420743.0, |
|
"reward": 0.78046875, |
|
"reward_std": 0.17587369978427886, |
|
"rewards/accuracy_reward/mean": 0.78046875, |
|
"rewards/accuracy_reward/std": 0.41278024911880495, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.107421875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1014.6, |
|
"completions/mean_length": 551.752734375, |
|
"completions/mean_terminated_length": 495.41151733398436, |
|
"completions/min_length": 92.4, |
|
"completions/min_terminated_length": 92.4, |
|
"epoch": 0.6823027718550106, |
|
"grad_norm": 0.12351401523997702, |
|
"kl": 0.004492950439453125, |
|
"learning_rate": 8.69464157535652e-07, |
|
"loss": 0.0355, |
|
"num_tokens": 29190430.0, |
|
"reward": 0.775390625, |
|
"reward_std": 0.16248664855957032, |
|
"rewards/accuracy_reward/mean": 0.775390625, |
|
"rewards/accuracy_reward/std": 0.4170423984527588, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.10546875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1017.8, |
|
"completions/mean_length": 564.80859375, |
|
"completions/mean_terminated_length": 510.5605102539063, |
|
"completions/min_length": 99.2, |
|
"completions/min_terminated_length": 99.2, |
|
"epoch": 0.7249466950959488, |
|
"grad_norm": 0.11800011260860456, |
|
"kl": 0.004195022583007813, |
|
"learning_rate": 6.736545278218464e-07, |
|
"loss": 0.0232, |
|
"num_tokens": 30987452.0, |
|
"reward": 0.792578125, |
|
"reward_std": 0.17066446840763091, |
|
"rewards/accuracy_reward/mean": 0.7951016902923584, |
|
"rewards/accuracy_reward/std": 0.40229706168174745, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.108203125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1020.4, |
|
"completions/mean_length": 555.931640625, |
|
"completions/mean_terminated_length": 499.55921630859376, |
|
"completions/min_length": 100.0, |
|
"completions/min_terminated_length": 100.0, |
|
"epoch": 0.767590618336887, |
|
"grad_norm": 0.1766286378559812, |
|
"kl": 0.004351806640625, |
|
"learning_rate": 4.963040904617131e-07, |
|
"loss": 0.0305, |
|
"num_tokens": 32791613.0, |
|
"reward": 0.762890625, |
|
"reward_std": 0.16622219383716583, |
|
"rewards/accuracy_reward/mean": 0.762890625, |
|
"rewards/accuracy_reward/std": 0.42507564425468447, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.110546875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1020.4, |
|
"completions/mean_length": 567.742578125, |
|
"completions/mean_terminated_length": 511.2064636230469, |
|
"completions/min_length": 89.6, |
|
"completions/min_terminated_length": 89.6, |
|
"epoch": 0.8102345415778252, |
|
"grad_norm": 0.13097760910471143, |
|
"kl": 0.004387664794921875, |
|
"learning_rate": 3.4137456116310087e-07, |
|
"loss": 0.0181, |
|
"num_tokens": 34613034.0, |
|
"reward": 0.76953125, |
|
"reward_std": 0.16518225371837617, |
|
"rewards/accuracy_reward/mean": 0.76953125, |
|
"rewards/accuracy_reward/std": 0.42095457315444945, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.10234375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1016.8, |
|
"completions/mean_length": 570.152734375, |
|
"completions/mean_terminated_length": 518.7150634765625, |
|
"completions/min_length": 120.8, |
|
"completions/min_terminated_length": 120.8, |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 0.1265224718572913, |
|
"kl": 0.004515457153320313, |
|
"learning_rate": 2.1232680959720086e-07, |
|
"loss": 0.0369, |
|
"num_tokens": 36434305.0, |
|
"reward": 0.77890625, |
|
"reward_std": 0.16256613433361053, |
|
"rewards/accuracy_reward/mean": 0.7815476179122924, |
|
"rewards/accuracy_reward/std": 0.41042966246604917, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"eval_clip_ratio/high_max": 0.0, |
|
"eval_clip_ratio/high_mean": 0.0, |
|
"eval_clip_ratio/low_mean": 0.0, |
|
"eval_clip_ratio/low_min": 0.0, |
|
"eval_clip_ratio/region_mean": 0.0, |
|
"eval_completions/clipped_ratio": 0.093025, |
|
"eval_completions/max_length": 1017.8096, |
|
"eval_completions/max_terminated_length": 956.9056, |
|
"eval_completions/mean_length": 544.63845, |
|
"eval_completions/mean_terminated_length": 497.0462825683594, |
|
"eval_completions/min_length": 184.3936, |
|
"eval_completions/min_terminated_length": 184.3936, |
|
"eval_kl": 0.00679013671875, |
|
"eval_loss": 0.01577703095972538, |
|
"eval_num_tokens": 36434305.0, |
|
"eval_reward": 0.694925, |
|
"eval_reward_std": 0.20438497549295426, |
|
"eval_rewards/accuracy_reward/mean": 0.694925, |
|
"eval_rewards/accuracy_reward/std": 0.44015464553833006, |
|
"eval_rewards/format_reward/mean": 0.0, |
|
"eval_rewards/format_reward/std": 0.0, |
|
"eval_runtime": 4105.356, |
|
"eval_samples_per_second": 1.218, |
|
"eval_steps_per_second": 0.019, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.093359375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1006.6, |
|
"completions/mean_length": 551.971875, |
|
"completions/mean_terminated_length": 503.73955688476565, |
|
"completions/min_length": 124.4, |
|
"completions/min_terminated_length": 124.4, |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.11809578314174589, |
|
"kl": 0.004218292236328125, |
|
"learning_rate": 1.1204354928900495e-07, |
|
"loss": 0.0265, |
|
"num_tokens": 38198833.0, |
|
"reward": 0.803125, |
|
"reward_std": 0.1608460694551468, |
|
"rewards/accuracy_reward/mean": 0.803125, |
|
"rewards/accuracy_reward/std": 0.39661717414855957, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.08984375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1018.0, |
|
"completions/mean_length": 559.37890625, |
|
"completions/mean_terminated_length": 514.0422424316406, |
|
"completions/min_length": 139.2, |
|
"completions/min_terminated_length": 139.2, |
|
"epoch": 0.9381663113006397, |
|
"grad_norm": 0.1543236952292962, |
|
"kl": 0.0042022705078125, |
|
"learning_rate": 4.276494256069874e-08, |
|
"loss": 0.0293, |
|
"num_tokens": 39998331.0, |
|
"reward": 0.817578125, |
|
"reward_std": 0.17398287057876588, |
|
"rewards/accuracy_reward/mean": 0.817578125, |
|
"rewards/accuracy_reward/std": 0.3809166610240936, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.093359375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1012.8, |
|
"completions/mean_length": 548.9046875, |
|
"completions/mean_terminated_length": 499.97105102539064, |
|
"completions/min_length": 94.4, |
|
"completions/min_terminated_length": 94.4, |
|
"epoch": 0.9808102345415778, |
|
"grad_norm": 0.15606339132567215, |
|
"kl": 0.00447540283203125, |
|
"learning_rate": 6.038559007141398e-09, |
|
"loss": 0.0288, |
|
"num_tokens": 41769847.0, |
|
"reward": 0.787890625, |
|
"reward_std": 0.1641687572002411, |
|
"rewards/accuracy_reward/mean": 0.787890625, |
|
"rewards/accuracy_reward/std": 0.4066555380821228, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.08664344879518071, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 1010.0, |
|
"completions/mean_length": 558.7720031738281, |
|
"completions/mean_terminated_length": 514.6327362060547, |
|
"completions/min_length": 124.0, |
|
"completions/min_terminated_length": 124.0, |
|
"epoch": 0.997867803837953, |
|
"kl": 0.003989219665527344, |
|
"num_tokens": 42497058.0, |
|
"reward": 0.7802734375, |
|
"reward_std": 0.19359246641397476, |
|
"rewards/accuracy_reward/mean": 0.7802734375, |
|
"rewards/accuracy_reward/std": 0.41445116698741913, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 117, |
|
"total_flos": 0.0, |
|
"train_loss": 0.03524629572708892, |
|
"train_runtime": 9378.8625, |
|
"train_samples_per_second": 0.8, |
|
"train_steps_per_second": 0.012 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 117, |
|
"num_input_tokens_seen": 42497058, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|