|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.4605543710021323, |
|
"eval_steps": 10, |
|
"global_step": 201, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 605.8180999755859, |
|
"epoch": 0.017057569296375266, |
|
"grad_norm": 11.068035125732422, |
|
"kl": 0.0, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.6227678880095482, |
|
"reward_std": 0.34988817013800144, |
|
"rewards/accuracy_reward": 0.6227678880095482, |
|
"rewards/format_reward": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 606.5859651565552, |
|
"epoch": 0.08528784648187633, |
|
"grad_norm": 10.528549194335938, |
|
"kl": 0.0009853541851043701, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.5948660997673869, |
|
"reward_std": 0.3529963130131364, |
|
"rewards/accuracy_reward": 0.5948660997673869, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 623.1681053161622, |
|
"epoch": 0.17057569296375266, |
|
"grad_norm": 0.528905987739563, |
|
"kl": 0.008502006530761719, |
|
"learning_rate": 2.9095389311788626e-06, |
|
"loss": 0.0003, |
|
"reward": 0.689285746216774, |
|
"reward_std": 0.29115155190229414, |
|
"rewards/accuracy_reward": 0.689285746216774, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.17057569296375266, |
|
"eval_completion_length": 573.9749799455915, |
|
"eval_kl": 1.3873639787946428, |
|
"eval_loss": 0.056397709995508194, |
|
"eval_reward": 0.7653061492102486, |
|
"eval_reward_std": 0.2849572343485696, |
|
"eval_rewards/accuracy_reward": 0.7653061492102486, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 229.3061, |
|
"eval_samples_per_second": 0.436, |
|
"eval_steps_per_second": 0.004, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 624.6886390686035, |
|
"epoch": 0.255863539445629, |
|
"grad_norm": 0.8065505623817444, |
|
"kl": 0.014313316345214844, |
|
"learning_rate": 2.649066664678467e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7506696745753288, |
|
"reward_std": 0.23205664344131946, |
|
"rewards/accuracy_reward": 0.7506696745753288, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 613.3145324707032, |
|
"epoch": 0.3411513859275053, |
|
"grad_norm": 0.18300795555114746, |
|
"kl": 0.38666706085205077, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.0154, |
|
"reward": 0.7591518208384513, |
|
"reward_std": 0.21772289499640465, |
|
"rewards/accuracy_reward": 0.7591518208384513, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3411513859275053, |
|
"eval_completion_length": 533.1733834402902, |
|
"eval_kl": 0.005168369838169643, |
|
"eval_loss": 0.0009401756688021123, |
|
"eval_reward": 0.812500034059797, |
|
"eval_reward_std": 0.22666969469615392, |
|
"eval_rewards/accuracy_reward": 0.812500034059797, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 225.084, |
|
"eval_samples_per_second": 0.444, |
|
"eval_steps_per_second": 0.004, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 611.4100685119629, |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 0.2118988335132599, |
|
"kl": 0.003771209716796875, |
|
"learning_rate": 1.7604722665003958e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7513393223285675, |
|
"reward_std": 0.19753552060574292, |
|
"rewards/accuracy_reward": 0.7513393223285675, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 612.1899833679199, |
|
"epoch": 0.511727078891258, |
|
"grad_norm": 0.2857199013233185, |
|
"kl": 0.003532218933105469, |
|
"learning_rate": 1.2395277334996047e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7535714626312255, |
|
"reward_std": 0.17617593705654144, |
|
"rewards/accuracy_reward": 0.7535714626312255, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.511727078891258, |
|
"eval_completion_length": 524.0120588030134, |
|
"eval_kl": 0.004640851702008929, |
|
"eval_loss": 0.000920308637432754, |
|
"eval_reward": 0.8431122728756496, |
|
"eval_reward_std": 0.16344326840979712, |
|
"eval_rewards/accuracy_reward": 0.8431122728756496, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 223.9002, |
|
"eval_samples_per_second": 0.447, |
|
"eval_steps_per_second": 0.004, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 600.6205627441407, |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.10232926905155182, |
|
"kl": 0.00328369140625, |
|
"learning_rate": 7.500000000000003e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7500000350177288, |
|
"reward_std": 0.18227138370275497, |
|
"rewards/accuracy_reward": 0.7500000350177288, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 588.3980171203614, |
|
"epoch": 0.6823027718550106, |
|
"grad_norm": 0.12149345874786377, |
|
"kl": 0.013450050354003906, |
|
"learning_rate": 3.5093333532153313e-07, |
|
"loss": 0.0005, |
|
"reward": 0.7700893267989158, |
|
"reward_std": 0.18259718772023917, |
|
"rewards/accuracy_reward": 0.7700893267989158, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6823027718550106, |
|
"eval_completion_length": 515.6179722377232, |
|
"eval_kl": 0.005268641880580357, |
|
"eval_loss": 0.00021133186237420887, |
|
"eval_reward": 0.8303571854318891, |
|
"eval_reward_std": 0.1784549355506897, |
|
"eval_rewards/accuracy_reward": 0.8303571854318891, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 223.7869, |
|
"eval_samples_per_second": 0.447, |
|
"eval_steps_per_second": 0.004, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 608.0940032958985, |
|
"epoch": 0.767590618336887, |
|
"grad_norm": 0.11995360255241394, |
|
"kl": 0.0032606124877929688, |
|
"learning_rate": 9.046106882113752e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7495535999536515, |
|
"reward_std": 0.19272922482341528, |
|
"rewards/accuracy_reward": 0.7495535999536515, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 614.151146697998, |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 2.694963216781616, |
|
"kl": 0.004784202575683594, |
|
"learning_rate": 0.0, |
|
"loss": 0.0002, |
|
"reward": 0.7437500387430191, |
|
"reward_std": 0.19476121049374343, |
|
"rewards/accuracy_reward": 0.7437500387430191, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"eval_completion_length": 521.0712454659598, |
|
"eval_kl": 0.005817958286830357, |
|
"eval_loss": 0.00023309080279432237, |
|
"eval_reward": 0.8392857568604606, |
|
"eval_reward_std": 0.2051741225378854, |
|
"eval_rewards/accuracy_reward": 0.8392857568604606, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 225.9, |
|
"eval_samples_per_second": 0.443, |
|
"eval_steps_per_second": 0.004, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 588.8913177490234, |
|
"epoch": 0.9381663113006397, |
|
"grad_norm": 0.08098618686199188, |
|
"kl": 0.0032186508178710938, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7845982491970063, |
|
"reward_std": 0.1832274880260229, |
|
"rewards/accuracy_reward": 0.7845982491970063, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 579.3964538574219, |
|
"epoch": 1.0341151385927505, |
|
"grad_norm": 0.12469803541898727, |
|
"kl": 0.003044976128472222, |
|
"learning_rate": 1.2395277334996047e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7807540045844183, |
|
"reward_std": 0.16871399796671338, |
|
"rewards/accuracy_reward": 0.7807540045844183, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0341151385927505, |
|
"eval_completion_length": 504.27440970284596, |
|
"eval_kl": 0.005137852260044643, |
|
"eval_loss": 0.00020576055976562202, |
|
"eval_reward": 0.8545918720109122, |
|
"eval_reward_std": 0.15936485332037723, |
|
"eval_rewards/accuracy_reward": 0.8545918720109122, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 218.3536, |
|
"eval_samples_per_second": 0.458, |
|
"eval_steps_per_second": 0.005, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 566.6837295532226, |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.08429302275180817, |
|
"kl": 0.0031642913818359375, |
|
"learning_rate": 9.86969785011497e-07, |
|
"loss": 0.0001, |
|
"reward": 0.783928607404232, |
|
"reward_std": 0.16070863213390113, |
|
"rewards/accuracy_reward": 0.783928607404232, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 565.8973487854004, |
|
"epoch": 1.2046908315565032, |
|
"grad_norm": 0.1427806168794632, |
|
"kl": 0.0034132003784179688, |
|
"learning_rate": 7.500000000000003e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7837053954601287, |
|
"reward_std": 0.14835985810495914, |
|
"rewards/accuracy_reward": 0.7837053954601287, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.2046908315565032, |
|
"eval_completion_length": 487.16007341657365, |
|
"eval_kl": 0.006696428571428571, |
|
"eval_loss": 0.0002668871602509171, |
|
"eval_reward": 0.8686224818229675, |
|
"eval_reward_std": 0.14786859175988606, |
|
"eval_rewards/accuracy_reward": 0.8686224818229675, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 220.7164, |
|
"eval_samples_per_second": 0.453, |
|
"eval_steps_per_second": 0.005, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 578.0143165588379, |
|
"epoch": 1.2899786780383795, |
|
"grad_norm": 0.16407515108585358, |
|
"kl": 0.0034801483154296873, |
|
"learning_rate": 5.358185854701909e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7758928909897804, |
|
"reward_std": 0.17325164508074523, |
|
"rewards/accuracy_reward": 0.7758928909897804, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 567.1640846252442, |
|
"epoch": 1.375266524520256, |
|
"grad_norm": 0.06801605224609375, |
|
"kl": 0.0032863616943359375, |
|
"learning_rate": 3.5093333532153313e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7915178939700127, |
|
"reward_std": 0.14882502863183616, |
|
"rewards/accuracy_reward": 0.7915178939700127, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.375266524520256, |
|
"eval_completion_length": 490.3266034807478, |
|
"eval_kl": 0.004799979073660714, |
|
"eval_loss": 0.0001912058360176161, |
|
"eval_reward": 0.8494898336274284, |
|
"eval_reward_std": 0.165963847722326, |
|
"eval_rewards/accuracy_reward": 0.8494898336274284, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 221.7278, |
|
"eval_samples_per_second": 0.451, |
|
"eval_steps_per_second": 0.005, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 568.004940032959, |
|
"epoch": 1.4605543710021323, |
|
"grad_norm": 0.11520688235759735, |
|
"kl": 0.00350799560546875, |
|
"learning_rate": 2.0096189432334195e-07, |
|
"loss": 0.0001, |
|
"reward": 0.791741105914116, |
|
"reward_std": 0.15448834607377648, |
|
"rewards/accuracy_reward": 0.791741105914116, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 579.3893096923828, |
|
"epoch": 1.5458422174840085, |
|
"grad_norm": 0.096395343542099, |
|
"kl": 0.0031429290771484374, |
|
"learning_rate": 9.046106882113752e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7698661059141159, |
|
"reward_std": 0.16837037783116102, |
|
"rewards/accuracy_reward": 0.7698661059141159, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.5458422174840085, |
|
"eval_completion_length": 489.6377737862723, |
|
"eval_kl": 0.005275181361607143, |
|
"eval_loss": 0.00021086516790091991, |
|
"eval_reward": 0.8596939103943961, |
|
"eval_reward_std": 0.16273953127009527, |
|
"eval_rewards/accuracy_reward": 0.8596939103943961, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 220.5157, |
|
"eval_samples_per_second": 0.453, |
|
"eval_steps_per_second": 0.005, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 579.5056037902832, |
|
"epoch": 1.6311300639658848, |
|
"grad_norm": 0.08153486996889114, |
|
"kl": 0.003292083740234375, |
|
"learning_rate": 2.278837048168797e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7850446745753288, |
|
"reward_std": 0.17225408796221017, |
|
"rewards/accuracy_reward": 0.7850446745753288, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 569.5910942077637, |
|
"epoch": 1.716417910447761, |
|
"grad_norm": 0.07987989485263824, |
|
"kl": 0.0033542633056640623, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"reward": 0.797544676065445, |
|
"reward_std": 0.15488467887043952, |
|
"rewards/accuracy_reward": 0.797544676065445, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.716417910447761, |
|
"eval_completion_length": 488.6938912527902, |
|
"eval_kl": 0.005329677036830357, |
|
"eval_loss": 0.0009469491196796298, |
|
"eval_reward": 0.8507653389658246, |
|
"eval_reward_std": 0.1611656959035567, |
|
"eval_rewards/accuracy_reward": 0.8507653389658246, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 224.3175, |
|
"eval_samples_per_second": 0.446, |
|
"eval_steps_per_second": 0.004, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 583.1759201049805, |
|
"epoch": 1.8017057569296375, |
|
"grad_norm": 0.10411060601472855, |
|
"kl": 0.003289031982421875, |
|
"learning_rate": 7.500000000000003e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7611607491970063, |
|
"reward_std": 0.16414006650447846, |
|
"rewards/accuracy_reward": 0.7611607491970063, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 582.3049369812012, |
|
"epoch": 1.886993603411514, |
|
"grad_norm": 0.15299023687839508, |
|
"kl": 0.003238677978515625, |
|
"learning_rate": 6.04262112445821e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7812500387430191, |
|
"reward_std": 0.16288615837693216, |
|
"rewards/accuracy_reward": 0.7812500387430191, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.886993603411514, |
|
"eval_completion_length": 482.53951154436385, |
|
"eval_kl": 0.005589076450892857, |
|
"eval_loss": 0.00022271877969615161, |
|
"eval_reward": 0.8698980041912624, |
|
"eval_reward_std": 0.148396335542202, |
|
"eval_rewards/accuracy_reward": 0.8698980041912624, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 218.9173, |
|
"eval_samples_per_second": 0.457, |
|
"eval_steps_per_second": 0.005, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 569.8573867797852, |
|
"epoch": 1.9722814498933903, |
|
"grad_norm": 0.06015090271830559, |
|
"kl": 0.0032745361328125, |
|
"learning_rate": 4.7063754319689976e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7970982477068901, |
|
"reward_std": 0.16446643033996225, |
|
"rewards/accuracy_reward": 0.7970982477068901, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 567.0639133029514, |
|
"epoch": 2.068230277185501, |
|
"grad_norm": 0.17577879130840302, |
|
"kl": 0.003118896484375, |
|
"learning_rate": 3.5093333532153313e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7855159123738606, |
|
"reward_std": 0.16644797714220153, |
|
"rewards/accuracy_reward": 0.7855159123738606, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.068230277185501, |
|
"eval_completion_length": 480.4123011997768, |
|
"eval_kl": 0.006166730608258929, |
|
"eval_loss": 0.00024634136934764683, |
|
"eval_reward": 0.848214328289032, |
|
"eval_reward_std": 0.1697287346635546, |
|
"eval_rewards/accuracy_reward": 0.848214328289032, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 217.3304, |
|
"eval_samples_per_second": 0.46, |
|
"eval_steps_per_second": 0.005, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 545.7962287902832, |
|
"epoch": 2.1535181236673773, |
|
"grad_norm": 0.15464283525943756, |
|
"kl": 0.00330963134765625, |
|
"learning_rate": 2.467682828805956e-07, |
|
"loss": 0.0001, |
|
"reward": 0.796428607404232, |
|
"reward_std": 0.15349247655831277, |
|
"rewards/accuracy_reward": 0.796428607404232, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 574.9627479553222, |
|
"epoch": 2.2388059701492535, |
|
"grad_norm": 0.11914752423763275, |
|
"kl": 0.0033355712890625, |
|
"learning_rate": 1.5955103951488177e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7667411088943481, |
|
"reward_std": 0.15913840509019792, |
|
"rewards/accuracy_reward": 0.7667411088943481, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.2388059701492535, |
|
"eval_completion_length": 475.7303728376116, |
|
"eval_kl": 0.005048479352678571, |
|
"eval_loss": 0.00020054024935234338, |
|
"eval_reward": 0.8596939189093453, |
|
"eval_reward_std": 0.14824596978724003, |
|
"eval_rewards/accuracy_reward": 0.8596939189093453, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 220.4076, |
|
"eval_samples_per_second": 0.454, |
|
"eval_steps_per_second": 0.005, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 557.3685493469238, |
|
"epoch": 2.3240938166311302, |
|
"grad_norm": 0.07406862825155258, |
|
"kl": 0.0036487579345703125, |
|
"learning_rate": 9.046106882113752e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7955357536673546, |
|
"reward_std": 0.15597790591418742, |
|
"rewards/accuracy_reward": 0.7955357536673546, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 564.0457908630372, |
|
"epoch": 2.4093816631130065, |
|
"grad_norm": 0.14250978827476501, |
|
"kl": 0.003148651123046875, |
|
"learning_rate": 4.0432694130264294e-08, |
|
"loss": 0.0001, |
|
"reward": 0.8013393253087997, |
|
"reward_std": 0.1507680752314627, |
|
"rewards/accuracy_reward": 0.8013393253087997, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.4093816631130065, |
|
"eval_completion_length": 482.3264944893973, |
|
"eval_kl": 0.005327497209821429, |
|
"eval_loss": 0.00020964255963917822, |
|
"eval_reward": 0.8584184050559998, |
|
"eval_reward_std": 0.15163347657237733, |
|
"eval_rewards/accuracy_reward": 0.8584184050559998, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 216.7121, |
|
"eval_samples_per_second": 0.461, |
|
"eval_steps_per_second": 0.005, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 573.7694519042968, |
|
"epoch": 2.4946695095948828, |
|
"grad_norm": 0.11774393916130066, |
|
"kl": 0.0032398223876953123, |
|
"learning_rate": 1.0142463387085465e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7705357506871223, |
|
"reward_std": 0.16194519642740487, |
|
"rewards/accuracy_reward": 0.7705357506871223, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 554.9973442077637, |
|
"epoch": 2.579957356076759, |
|
"grad_norm": 0.10069162398576736, |
|
"kl": 0.003186798095703125, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"reward": 0.7941964656114578, |
|
"reward_std": 0.15693624839186668, |
|
"rewards/accuracy_reward": 0.7941964656114578, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.579957356076759, |
|
"eval_completion_length": 478.3360900878906, |
|
"eval_kl": 0.005617414202008929, |
|
"eval_loss": 0.00022248993627727032, |
|
"eval_reward": 0.8380102430071149, |
|
"eval_reward_std": 0.17689391651323863, |
|
"eval_rewards/accuracy_reward": 0.8380102430071149, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 216.3428, |
|
"eval_samples_per_second": 0.462, |
|
"eval_steps_per_second": 0.005, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 560.3859672546387, |
|
"epoch": 2.6652452025586353, |
|
"grad_norm": 0.08840309083461761, |
|
"kl": 0.0033924102783203123, |
|
"learning_rate": 4.3933982822017883e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7861607491970062, |
|
"reward_std": 0.15671849967911838, |
|
"rewards/accuracy_reward": 0.7861607491970062, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 585.1167678833008, |
|
"epoch": 2.750533049040512, |
|
"grad_norm": 0.07753895968198776, |
|
"kl": 0.003240966796875, |
|
"learning_rate": 3.5093333532153313e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7700893253087997, |
|
"reward_std": 0.17175142988562583, |
|
"rewards/accuracy_reward": 0.7700893253087997, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.750533049040512, |
|
"eval_completion_length": 480.5063912527902, |
|
"eval_kl": 0.005467006138392857, |
|
"eval_loss": 0.00021828452008776367, |
|
"eval_reward": 0.8647959572928292, |
|
"eval_reward_std": 0.14701131811099394, |
|
"eval_rewards/accuracy_reward": 0.8647959572928292, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 220.7103, |
|
"eval_samples_per_second": 0.453, |
|
"eval_steps_per_second": 0.005, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 578.4064949035644, |
|
"epoch": 2.835820895522388, |
|
"grad_norm": 0.09659363329410553, |
|
"kl": 0.00327301025390625, |
|
"learning_rate": 2.7127193356651214e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7720982506871223, |
|
"reward_std": 0.1718197108246386, |
|
"rewards/accuracy_reward": 0.7720982506871223, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 568.9587310791015, |
|
"epoch": 2.9211087420042645, |
|
"grad_norm": 0.13311569392681122, |
|
"kl": 0.0035663604736328124, |
|
"learning_rate": 2.0096189432334195e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7866071820259094, |
|
"reward_std": 0.16021770192310214, |
|
"rewards/accuracy_reward": 0.7866071820259094, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.9211087420042645, |
|
"eval_completion_length": 485.0064479282924, |
|
"eval_kl": 0.005305698939732143, |
|
"eval_loss": 0.000213223320315592, |
|
"eval_reward": 0.8584183965410505, |
|
"eval_reward_std": 0.156809002161026, |
|
"eval_rewards/accuracy_reward": 0.8584183965410505, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 215.4697, |
|
"eval_samples_per_second": 0.464, |
|
"eval_steps_per_second": 0.005, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 568.291493733724, |
|
"epoch": 3.0170575692963753, |
|
"grad_norm": 0.1568661630153656, |
|
"kl": 0.0034328884548611113, |
|
"learning_rate": 1.405383194450251e-07, |
|
"loss": 0.0002, |
|
"reward": 0.798611146873898, |
|
"reward_std": 0.1725065532657835, |
|
"rewards/accuracy_reward": 0.798611146873898, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 563.8743576049804, |
|
"epoch": 3.1023454157782515, |
|
"grad_norm": 0.09700077027082443, |
|
"kl": 0.00376434326171875, |
|
"learning_rate": 9.046106882113752e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7857143208384514, |
|
"reward_std": 0.15767460195347666, |
|
"rewards/accuracy_reward": 0.7857143208384514, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.1023454157782515, |
|
"eval_completion_length": 484.9557364327567, |
|
"eval_kl": 0.005445207868303571, |
|
"eval_loss": 0.00021743674005847424, |
|
"eval_reward": 0.8622449381010873, |
|
"eval_reward_std": 0.14529993225421226, |
|
"eval_rewards/accuracy_reward": 0.8622449381010873, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 222.0031, |
|
"eval_samples_per_second": 0.45, |
|
"eval_steps_per_second": 0.005, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 537.6498031616211, |
|
"epoch": 3.1876332622601278, |
|
"grad_norm": 0.09764347225427628, |
|
"kl": 0.003476715087890625, |
|
"learning_rate": 5.11112605663977e-08, |
|
"loss": 0.0001, |
|
"reward": 0.8029018208384514, |
|
"reward_std": 0.15667821522802114, |
|
"rewards/accuracy_reward": 0.8029018208384514, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 580.5053825378418, |
|
"epoch": 3.272921108742004, |
|
"grad_norm": 0.10233303159475327, |
|
"kl": 0.0035114288330078125, |
|
"learning_rate": 2.278837048168797e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7638393223285675, |
|
"reward_std": 0.18133262265473604, |
|
"rewards/accuracy_reward": 0.7638393223285675, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.272921108742004, |
|
"eval_completion_length": 475.9809875488281, |
|
"eval_kl": 0.0054931640625, |
|
"eval_loss": 0.00022025364160072058, |
|
"eval_reward": 0.8609694242477417, |
|
"eval_reward_std": 0.14643230502094542, |
|
"eval_rewards/accuracy_reward": 0.8609694242477417, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 217.2878, |
|
"eval_samples_per_second": 0.46, |
|
"eval_steps_per_second": 0.005, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 554.5051574707031, |
|
"epoch": 3.3582089552238807, |
|
"grad_norm": 0.11225981265306473, |
|
"kl": 0.003415489196777344, |
|
"learning_rate": 5.707952862381682e-09, |
|
"loss": 0.0001, |
|
"reward": 0.802678607404232, |
|
"reward_std": 0.1451864595990628, |
|
"rewards/accuracy_reward": 0.802678607404232, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 560.3868598937988, |
|
"epoch": 3.443496801705757, |
|
"grad_norm": 0.1300434023141861, |
|
"kl": 0.0034725189208984373, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"reward": 0.799553605914116, |
|
"reward_std": 0.14306436143815518, |
|
"rewards/accuracy_reward": 0.799553605914116, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.443496801705757, |
|
"eval_completion_length": 486.1337367466518, |
|
"eval_kl": 0.005728585379464286, |
|
"eval_loss": 0.00022739818086847663, |
|
"eval_reward": 0.871173518044608, |
|
"eval_reward_std": 0.13813474109130247, |
|
"eval_rewards/accuracy_reward": 0.871173518044608, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 216.9308, |
|
"eval_samples_per_second": 0.461, |
|
"eval_steps_per_second": 0.005, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 571.4754753112793, |
|
"epoch": 3.4605543710021323, |
|
"kl": 0.0033359527587890625, |
|
"reward": 0.7745536044239998, |
|
"reward_std": 0.1426069471053779, |
|
"rewards/accuracy_reward": 0.7745536044239998, |
|
"rewards/format_reward": 0.0, |
|
"step": 201, |
|
"total_flos": 0.0, |
|
"train_loss": 3.630263656748468e-05, |
|
"train_runtime": 619.0643, |
|
"train_samples_per_second": 289.469, |
|
"train_steps_per_second": 0.323 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|