Qwen-2.5-7B-Simple-RL-0222 / trainer_state.json
kekema19's picture
Model save
64edc27 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.4605543710021323,
"eval_steps": 10,
"global_step": 201,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 605.8180999755859,
"epoch": 0.017057569296375266,
"grad_norm": 11.068035125732422,
"kl": 0.0,
"learning_rate": 6.000000000000001e-07,
"loss": 0.0,
"reward": 0.6227678880095482,
"reward_std": 0.34988817013800144,
"rewards/accuracy_reward": 0.6227678880095482,
"rewards/format_reward": 0.0,
"step": 1
},
{
"completion_length": 606.5859651565552,
"epoch": 0.08528784648187633,
"grad_norm": 10.528549194335938,
"kl": 0.0009853541851043701,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.5948660997673869,
"reward_std": 0.3529963130131364,
"rewards/accuracy_reward": 0.5948660997673869,
"rewards/format_reward": 0.0,
"step": 5
},
{
"completion_length": 623.1681053161622,
"epoch": 0.17057569296375266,
"grad_norm": 0.528905987739563,
"kl": 0.008502006530761719,
"learning_rate": 2.9095389311788626e-06,
"loss": 0.0003,
"reward": 0.689285746216774,
"reward_std": 0.29115155190229414,
"rewards/accuracy_reward": 0.689285746216774,
"rewards/format_reward": 0.0,
"step": 10
},
{
"epoch": 0.17057569296375266,
"eval_completion_length": 573.9749799455915,
"eval_kl": 1.3873639787946428,
"eval_loss": 0.056397709995508194,
"eval_reward": 0.7653061492102486,
"eval_reward_std": 0.2849572343485696,
"eval_rewards/accuracy_reward": 0.7653061492102486,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 229.3061,
"eval_samples_per_second": 0.436,
"eval_steps_per_second": 0.004,
"step": 10
},
{
"completion_length": 624.6886390686035,
"epoch": 0.255863539445629,
"grad_norm": 0.8065505623817444,
"kl": 0.014313316345214844,
"learning_rate": 2.649066664678467e-06,
"loss": 0.0006,
"reward": 0.7506696745753288,
"reward_std": 0.23205664344131946,
"rewards/accuracy_reward": 0.7506696745753288,
"rewards/format_reward": 0.0,
"step": 15
},
{
"completion_length": 613.3145324707032,
"epoch": 0.3411513859275053,
"grad_norm": 0.18300795555114746,
"kl": 0.38666706085205077,
"learning_rate": 2.25e-06,
"loss": 0.0154,
"reward": 0.7591518208384513,
"reward_std": 0.21772289499640465,
"rewards/accuracy_reward": 0.7591518208384513,
"rewards/format_reward": 0.0,
"step": 20
},
{
"epoch": 0.3411513859275053,
"eval_completion_length": 533.1733834402902,
"eval_kl": 0.005168369838169643,
"eval_loss": 0.0009401756688021123,
"eval_reward": 0.812500034059797,
"eval_reward_std": 0.22666969469615392,
"eval_rewards/accuracy_reward": 0.812500034059797,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 225.084,
"eval_samples_per_second": 0.444,
"eval_steps_per_second": 0.004,
"step": 20
},
{
"completion_length": 611.4100685119629,
"epoch": 0.42643923240938164,
"grad_norm": 0.2118988335132599,
"kl": 0.003771209716796875,
"learning_rate": 1.7604722665003958e-06,
"loss": 0.0002,
"reward": 0.7513393223285675,
"reward_std": 0.19753552060574292,
"rewards/accuracy_reward": 0.7513393223285675,
"rewards/format_reward": 0.0,
"step": 25
},
{
"completion_length": 612.1899833679199,
"epoch": 0.511727078891258,
"grad_norm": 0.2857199013233185,
"kl": 0.003532218933105469,
"learning_rate": 1.2395277334996047e-06,
"loss": 0.0001,
"reward": 0.7535714626312255,
"reward_std": 0.17617593705654144,
"rewards/accuracy_reward": 0.7535714626312255,
"rewards/format_reward": 0.0,
"step": 30
},
{
"epoch": 0.511727078891258,
"eval_completion_length": 524.0120588030134,
"eval_kl": 0.004640851702008929,
"eval_loss": 0.000920308637432754,
"eval_reward": 0.8431122728756496,
"eval_reward_std": 0.16344326840979712,
"eval_rewards/accuracy_reward": 0.8431122728756496,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 223.9002,
"eval_samples_per_second": 0.447,
"eval_steps_per_second": 0.004,
"step": 30
},
{
"completion_length": 600.6205627441407,
"epoch": 0.5970149253731343,
"grad_norm": 0.10232926905155182,
"kl": 0.00328369140625,
"learning_rate": 7.500000000000003e-07,
"loss": 0.0001,
"reward": 0.7500000350177288,
"reward_std": 0.18227138370275497,
"rewards/accuracy_reward": 0.7500000350177288,
"rewards/format_reward": 0.0,
"step": 35
},
{
"completion_length": 588.3980171203614,
"epoch": 0.6823027718550106,
"grad_norm": 0.12149345874786377,
"kl": 0.013450050354003906,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.0005,
"reward": 0.7700893267989158,
"reward_std": 0.18259718772023917,
"rewards/accuracy_reward": 0.7700893267989158,
"rewards/format_reward": 0.0,
"step": 40
},
{
"epoch": 0.6823027718550106,
"eval_completion_length": 515.6179722377232,
"eval_kl": 0.005268641880580357,
"eval_loss": 0.00021133186237420887,
"eval_reward": 0.8303571854318891,
"eval_reward_std": 0.1784549355506897,
"eval_rewards/accuracy_reward": 0.8303571854318891,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 223.7869,
"eval_samples_per_second": 0.447,
"eval_steps_per_second": 0.004,
"step": 40
},
{
"completion_length": 608.0940032958985,
"epoch": 0.767590618336887,
"grad_norm": 0.11995360255241394,
"kl": 0.0032606124877929688,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0001,
"reward": 0.7495535999536515,
"reward_std": 0.19272922482341528,
"rewards/accuracy_reward": 0.7495535999536515,
"rewards/format_reward": 0.0,
"step": 45
},
{
"completion_length": 614.151146697998,
"epoch": 0.8528784648187633,
"grad_norm": 2.694963216781616,
"kl": 0.004784202575683594,
"learning_rate": 0.0,
"loss": 0.0002,
"reward": 0.7437500387430191,
"reward_std": 0.19476121049374343,
"rewards/accuracy_reward": 0.7437500387430191,
"rewards/format_reward": 0.0,
"step": 50
},
{
"epoch": 0.8528784648187633,
"eval_completion_length": 521.0712454659598,
"eval_kl": 0.005817958286830357,
"eval_loss": 0.00023309080279432237,
"eval_reward": 0.8392857568604606,
"eval_reward_std": 0.2051741225378854,
"eval_rewards/accuracy_reward": 0.8392857568604606,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 225.9,
"eval_samples_per_second": 0.443,
"eval_steps_per_second": 0.004,
"step": 50
},
{
"completion_length": 588.8913177490234,
"epoch": 0.9381663113006397,
"grad_norm": 0.08098618686199188,
"kl": 0.0032186508178710938,
"learning_rate": 1.5e-06,
"loss": 0.0001,
"reward": 0.7845982491970063,
"reward_std": 0.1832274880260229,
"rewards/accuracy_reward": 0.7845982491970063,
"rewards/format_reward": 0.0,
"step": 55
},
{
"completion_length": 579.3964538574219,
"epoch": 1.0341151385927505,
"grad_norm": 0.12469803541898727,
"kl": 0.003044976128472222,
"learning_rate": 1.2395277334996047e-06,
"loss": 0.0001,
"reward": 0.7807540045844183,
"reward_std": 0.16871399796671338,
"rewards/accuracy_reward": 0.7807540045844183,
"rewards/format_reward": 0.0,
"step": 60
},
{
"epoch": 1.0341151385927505,
"eval_completion_length": 504.27440970284596,
"eval_kl": 0.005137852260044643,
"eval_loss": 0.00020576055976562202,
"eval_reward": 0.8545918720109122,
"eval_reward_std": 0.15936485332037723,
"eval_rewards/accuracy_reward": 0.8545918720109122,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 218.3536,
"eval_samples_per_second": 0.458,
"eval_steps_per_second": 0.005,
"step": 60
},
{
"completion_length": 566.6837295532226,
"epoch": 1.1194029850746268,
"grad_norm": 0.08429302275180817,
"kl": 0.0031642913818359375,
"learning_rate": 9.86969785011497e-07,
"loss": 0.0001,
"reward": 0.783928607404232,
"reward_std": 0.16070863213390113,
"rewards/accuracy_reward": 0.783928607404232,
"rewards/format_reward": 0.0,
"step": 65
},
{
"completion_length": 565.8973487854004,
"epoch": 1.2046908315565032,
"grad_norm": 0.1427806168794632,
"kl": 0.0034132003784179688,
"learning_rate": 7.500000000000003e-07,
"loss": 0.0001,
"reward": 0.7837053954601287,
"reward_std": 0.14835985810495914,
"rewards/accuracy_reward": 0.7837053954601287,
"rewards/format_reward": 0.0,
"step": 70
},
{
"epoch": 1.2046908315565032,
"eval_completion_length": 487.16007341657365,
"eval_kl": 0.006696428571428571,
"eval_loss": 0.0002668871602509171,
"eval_reward": 0.8686224818229675,
"eval_reward_std": 0.14786859175988606,
"eval_rewards/accuracy_reward": 0.8686224818229675,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 220.7164,
"eval_samples_per_second": 0.453,
"eval_steps_per_second": 0.005,
"step": 70
},
{
"completion_length": 578.0143165588379,
"epoch": 1.2899786780383795,
"grad_norm": 0.16407515108585358,
"kl": 0.0034801483154296873,
"learning_rate": 5.358185854701909e-07,
"loss": 0.0001,
"reward": 0.7758928909897804,
"reward_std": 0.17325164508074523,
"rewards/accuracy_reward": 0.7758928909897804,
"rewards/format_reward": 0.0,
"step": 75
},
{
"completion_length": 567.1640846252442,
"epoch": 1.375266524520256,
"grad_norm": 0.06801605224609375,
"kl": 0.0032863616943359375,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.0001,
"reward": 0.7915178939700127,
"reward_std": 0.14882502863183616,
"rewards/accuracy_reward": 0.7915178939700127,
"rewards/format_reward": 0.0,
"step": 80
},
{
"epoch": 1.375266524520256,
"eval_completion_length": 490.3266034807478,
"eval_kl": 0.004799979073660714,
"eval_loss": 0.0001912058360176161,
"eval_reward": 0.8494898336274284,
"eval_reward_std": 0.165963847722326,
"eval_rewards/accuracy_reward": 0.8494898336274284,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 221.7278,
"eval_samples_per_second": 0.451,
"eval_steps_per_second": 0.005,
"step": 80
},
{
"completion_length": 568.004940032959,
"epoch": 1.4605543710021323,
"grad_norm": 0.11520688235759735,
"kl": 0.00350799560546875,
"learning_rate": 2.0096189432334195e-07,
"loss": 0.0001,
"reward": 0.791741105914116,
"reward_std": 0.15448834607377648,
"rewards/accuracy_reward": 0.791741105914116,
"rewards/format_reward": 0.0,
"step": 85
},
{
"completion_length": 579.3893096923828,
"epoch": 1.5458422174840085,
"grad_norm": 0.096395343542099,
"kl": 0.0031429290771484374,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0001,
"reward": 0.7698661059141159,
"reward_std": 0.16837037783116102,
"rewards/accuracy_reward": 0.7698661059141159,
"rewards/format_reward": 0.0,
"step": 90
},
{
"epoch": 1.5458422174840085,
"eval_completion_length": 489.6377737862723,
"eval_kl": 0.005275181361607143,
"eval_loss": 0.00021086516790091991,
"eval_reward": 0.8596939103943961,
"eval_reward_std": 0.16273953127009527,
"eval_rewards/accuracy_reward": 0.8596939103943961,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 220.5157,
"eval_samples_per_second": 0.453,
"eval_steps_per_second": 0.005,
"step": 90
},
{
"completion_length": 579.5056037902832,
"epoch": 1.6311300639658848,
"grad_norm": 0.08153486996889114,
"kl": 0.003292083740234375,
"learning_rate": 2.278837048168797e-08,
"loss": 0.0001,
"reward": 0.7850446745753288,
"reward_std": 0.17225408796221017,
"rewards/accuracy_reward": 0.7850446745753288,
"rewards/format_reward": 0.0,
"step": 95
},
{
"completion_length": 569.5910942077637,
"epoch": 1.716417910447761,
"grad_norm": 0.07987989485263824,
"kl": 0.0033542633056640623,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.797544676065445,
"reward_std": 0.15488467887043952,
"rewards/accuracy_reward": 0.797544676065445,
"rewards/format_reward": 0.0,
"step": 100
},
{
"epoch": 1.716417910447761,
"eval_completion_length": 488.6938912527902,
"eval_kl": 0.005329677036830357,
"eval_loss": 0.0009469491196796298,
"eval_reward": 0.8507653389658246,
"eval_reward_std": 0.1611656959035567,
"eval_rewards/accuracy_reward": 0.8507653389658246,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 224.3175,
"eval_samples_per_second": 0.446,
"eval_steps_per_second": 0.004,
"step": 100
},
{
"completion_length": 583.1759201049805,
"epoch": 1.8017057569296375,
"grad_norm": 0.10411060601472855,
"kl": 0.003289031982421875,
"learning_rate": 7.500000000000003e-07,
"loss": 0.0001,
"reward": 0.7611607491970063,
"reward_std": 0.16414006650447846,
"rewards/accuracy_reward": 0.7611607491970063,
"rewards/format_reward": 0.0,
"step": 105
},
{
"completion_length": 582.3049369812012,
"epoch": 1.886993603411514,
"grad_norm": 0.15299023687839508,
"kl": 0.003238677978515625,
"learning_rate": 6.04262112445821e-07,
"loss": 0.0001,
"reward": 0.7812500387430191,
"reward_std": 0.16288615837693216,
"rewards/accuracy_reward": 0.7812500387430191,
"rewards/format_reward": 0.0,
"step": 110
},
{
"epoch": 1.886993603411514,
"eval_completion_length": 482.53951154436385,
"eval_kl": 0.005589076450892857,
"eval_loss": 0.00022271877969615161,
"eval_reward": 0.8698980041912624,
"eval_reward_std": 0.148396335542202,
"eval_rewards/accuracy_reward": 0.8698980041912624,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 218.9173,
"eval_samples_per_second": 0.457,
"eval_steps_per_second": 0.005,
"step": 110
},
{
"completion_length": 569.8573867797852,
"epoch": 1.9722814498933903,
"grad_norm": 0.06015090271830559,
"kl": 0.0032745361328125,
"learning_rate": 4.7063754319689976e-07,
"loss": 0.0001,
"reward": 0.7970982477068901,
"reward_std": 0.16446643033996225,
"rewards/accuracy_reward": 0.7970982477068901,
"rewards/format_reward": 0.0,
"step": 115
},
{
"completion_length": 567.0639133029514,
"epoch": 2.068230277185501,
"grad_norm": 0.17577879130840302,
"kl": 0.003118896484375,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.0001,
"reward": 0.7855159123738606,
"reward_std": 0.16644797714220153,
"rewards/accuracy_reward": 0.7855159123738606,
"rewards/format_reward": 0.0,
"step": 120
},
{
"epoch": 2.068230277185501,
"eval_completion_length": 480.4123011997768,
"eval_kl": 0.006166730608258929,
"eval_loss": 0.00024634136934764683,
"eval_reward": 0.848214328289032,
"eval_reward_std": 0.1697287346635546,
"eval_rewards/accuracy_reward": 0.848214328289032,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 217.3304,
"eval_samples_per_second": 0.46,
"eval_steps_per_second": 0.005,
"step": 120
},
{
"completion_length": 545.7962287902832,
"epoch": 2.1535181236673773,
"grad_norm": 0.15464283525943756,
"kl": 0.00330963134765625,
"learning_rate": 2.467682828805956e-07,
"loss": 0.0001,
"reward": 0.796428607404232,
"reward_std": 0.15349247655831277,
"rewards/accuracy_reward": 0.796428607404232,
"rewards/format_reward": 0.0,
"step": 125
},
{
"completion_length": 574.9627479553222,
"epoch": 2.2388059701492535,
"grad_norm": 0.11914752423763275,
"kl": 0.0033355712890625,
"learning_rate": 1.5955103951488177e-07,
"loss": 0.0001,
"reward": 0.7667411088943481,
"reward_std": 0.15913840509019792,
"rewards/accuracy_reward": 0.7667411088943481,
"rewards/format_reward": 0.0,
"step": 130
},
{
"epoch": 2.2388059701492535,
"eval_completion_length": 475.7303728376116,
"eval_kl": 0.005048479352678571,
"eval_loss": 0.00020054024935234338,
"eval_reward": 0.8596939189093453,
"eval_reward_std": 0.14824596978724003,
"eval_rewards/accuracy_reward": 0.8596939189093453,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 220.4076,
"eval_samples_per_second": 0.454,
"eval_steps_per_second": 0.005,
"step": 130
},
{
"completion_length": 557.3685493469238,
"epoch": 2.3240938166311302,
"grad_norm": 0.07406862825155258,
"kl": 0.0036487579345703125,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0001,
"reward": 0.7955357536673546,
"reward_std": 0.15597790591418742,
"rewards/accuracy_reward": 0.7955357536673546,
"rewards/format_reward": 0.0,
"step": 135
},
{
"completion_length": 564.0457908630372,
"epoch": 2.4093816631130065,
"grad_norm": 0.14250978827476501,
"kl": 0.003148651123046875,
"learning_rate": 4.0432694130264294e-08,
"loss": 0.0001,
"reward": 0.8013393253087997,
"reward_std": 0.1507680752314627,
"rewards/accuracy_reward": 0.8013393253087997,
"rewards/format_reward": 0.0,
"step": 140
},
{
"epoch": 2.4093816631130065,
"eval_completion_length": 482.3264944893973,
"eval_kl": 0.005327497209821429,
"eval_loss": 0.00020964255963917822,
"eval_reward": 0.8584184050559998,
"eval_reward_std": 0.15163347657237733,
"eval_rewards/accuracy_reward": 0.8584184050559998,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 216.7121,
"eval_samples_per_second": 0.461,
"eval_steps_per_second": 0.005,
"step": 140
},
{
"completion_length": 573.7694519042968,
"epoch": 2.4946695095948828,
"grad_norm": 0.11774393916130066,
"kl": 0.0032398223876953123,
"learning_rate": 1.0142463387085465e-08,
"loss": 0.0001,
"reward": 0.7705357506871223,
"reward_std": 0.16194519642740487,
"rewards/accuracy_reward": 0.7705357506871223,
"rewards/format_reward": 0.0,
"step": 145
},
{
"completion_length": 554.9973442077637,
"epoch": 2.579957356076759,
"grad_norm": 0.10069162398576736,
"kl": 0.003186798095703125,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.7941964656114578,
"reward_std": 0.15693624839186668,
"rewards/accuracy_reward": 0.7941964656114578,
"rewards/format_reward": 0.0,
"step": 150
},
{
"epoch": 2.579957356076759,
"eval_completion_length": 478.3360900878906,
"eval_kl": 0.005617414202008929,
"eval_loss": 0.00022248993627727032,
"eval_reward": 0.8380102430071149,
"eval_reward_std": 0.17689391651323863,
"eval_rewards/accuracy_reward": 0.8380102430071149,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 216.3428,
"eval_samples_per_second": 0.462,
"eval_steps_per_second": 0.005,
"step": 150
},
{
"completion_length": 560.3859672546387,
"epoch": 2.6652452025586353,
"grad_norm": 0.08840309083461761,
"kl": 0.0033924102783203123,
"learning_rate": 4.3933982822017883e-07,
"loss": 0.0001,
"reward": 0.7861607491970062,
"reward_std": 0.15671849967911838,
"rewards/accuracy_reward": 0.7861607491970062,
"rewards/format_reward": 0.0,
"step": 155
},
{
"completion_length": 585.1167678833008,
"epoch": 2.750533049040512,
"grad_norm": 0.07753895968198776,
"kl": 0.003240966796875,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.0001,
"reward": 0.7700893253087997,
"reward_std": 0.17175142988562583,
"rewards/accuracy_reward": 0.7700893253087997,
"rewards/format_reward": 0.0,
"step": 160
},
{
"epoch": 2.750533049040512,
"eval_completion_length": 480.5063912527902,
"eval_kl": 0.005467006138392857,
"eval_loss": 0.00021828452008776367,
"eval_reward": 0.8647959572928292,
"eval_reward_std": 0.14701131811099394,
"eval_rewards/accuracy_reward": 0.8647959572928292,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 220.7103,
"eval_samples_per_second": 0.453,
"eval_steps_per_second": 0.005,
"step": 160
},
{
"completion_length": 578.4064949035644,
"epoch": 2.835820895522388,
"grad_norm": 0.09659363329410553,
"kl": 0.00327301025390625,
"learning_rate": 2.7127193356651214e-07,
"loss": 0.0001,
"reward": 0.7720982506871223,
"reward_std": 0.1718197108246386,
"rewards/accuracy_reward": 0.7720982506871223,
"rewards/format_reward": 0.0,
"step": 165
},
{
"completion_length": 568.9587310791015,
"epoch": 2.9211087420042645,
"grad_norm": 0.13311569392681122,
"kl": 0.0035663604736328124,
"learning_rate": 2.0096189432334195e-07,
"loss": 0.0001,
"reward": 0.7866071820259094,
"reward_std": 0.16021770192310214,
"rewards/accuracy_reward": 0.7866071820259094,
"rewards/format_reward": 0.0,
"step": 170
},
{
"epoch": 2.9211087420042645,
"eval_completion_length": 485.0064479282924,
"eval_kl": 0.005305698939732143,
"eval_loss": 0.000213223320315592,
"eval_reward": 0.8584183965410505,
"eval_reward_std": 0.156809002161026,
"eval_rewards/accuracy_reward": 0.8584183965410505,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 215.4697,
"eval_samples_per_second": 0.464,
"eval_steps_per_second": 0.005,
"step": 170
},
{
"completion_length": 568.291493733724,
"epoch": 3.0170575692963753,
"grad_norm": 0.1568661630153656,
"kl": 0.0034328884548611113,
"learning_rate": 1.405383194450251e-07,
"loss": 0.0002,
"reward": 0.798611146873898,
"reward_std": 0.1725065532657835,
"rewards/accuracy_reward": 0.798611146873898,
"rewards/format_reward": 0.0,
"step": 175
},
{
"completion_length": 563.8743576049804,
"epoch": 3.1023454157782515,
"grad_norm": 0.09700077027082443,
"kl": 0.00376434326171875,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0002,
"reward": 0.7857143208384514,
"reward_std": 0.15767460195347666,
"rewards/accuracy_reward": 0.7857143208384514,
"rewards/format_reward": 0.0,
"step": 180
},
{
"epoch": 3.1023454157782515,
"eval_completion_length": 484.9557364327567,
"eval_kl": 0.005445207868303571,
"eval_loss": 0.00021743674005847424,
"eval_reward": 0.8622449381010873,
"eval_reward_std": 0.14529993225421226,
"eval_rewards/accuracy_reward": 0.8622449381010873,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 222.0031,
"eval_samples_per_second": 0.45,
"eval_steps_per_second": 0.005,
"step": 180
},
{
"completion_length": 537.6498031616211,
"epoch": 3.1876332622601278,
"grad_norm": 0.09764347225427628,
"kl": 0.003476715087890625,
"learning_rate": 5.11112605663977e-08,
"loss": 0.0001,
"reward": 0.8029018208384514,
"reward_std": 0.15667821522802114,
"rewards/accuracy_reward": 0.8029018208384514,
"rewards/format_reward": 0.0,
"step": 185
},
{
"completion_length": 580.5053825378418,
"epoch": 3.272921108742004,
"grad_norm": 0.10233303159475327,
"kl": 0.0035114288330078125,
"learning_rate": 2.278837048168797e-08,
"loss": 0.0001,
"reward": 0.7638393223285675,
"reward_std": 0.18133262265473604,
"rewards/accuracy_reward": 0.7638393223285675,
"rewards/format_reward": 0.0,
"step": 190
},
{
"epoch": 3.272921108742004,
"eval_completion_length": 475.9809875488281,
"eval_kl": 0.0054931640625,
"eval_loss": 0.00022025364160072058,
"eval_reward": 0.8609694242477417,
"eval_reward_std": 0.14643230502094542,
"eval_rewards/accuracy_reward": 0.8609694242477417,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 217.2878,
"eval_samples_per_second": 0.46,
"eval_steps_per_second": 0.005,
"step": 190
},
{
"completion_length": 554.5051574707031,
"epoch": 3.3582089552238807,
"grad_norm": 0.11225981265306473,
"kl": 0.003415489196777344,
"learning_rate": 5.707952862381682e-09,
"loss": 0.0001,
"reward": 0.802678607404232,
"reward_std": 0.1451864595990628,
"rewards/accuracy_reward": 0.802678607404232,
"rewards/format_reward": 0.0,
"step": 195
},
{
"completion_length": 560.3868598937988,
"epoch": 3.443496801705757,
"grad_norm": 0.1300434023141861,
"kl": 0.0034725189208984373,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.799553605914116,
"reward_std": 0.14306436143815518,
"rewards/accuracy_reward": 0.799553605914116,
"rewards/format_reward": 0.0,
"step": 200
},
{
"epoch": 3.443496801705757,
"eval_completion_length": 486.1337367466518,
"eval_kl": 0.005728585379464286,
"eval_loss": 0.00022739818086847663,
"eval_reward": 0.871173518044608,
"eval_reward_std": 0.13813474109130247,
"eval_rewards/accuracy_reward": 0.871173518044608,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 216.9308,
"eval_samples_per_second": 0.461,
"eval_steps_per_second": 0.005,
"step": 200
},
{
"completion_length": 571.4754753112793,
"epoch": 3.4605543710021323,
"kl": 0.0033359527587890625,
"reward": 0.7745536044239998,
"reward_std": 0.1426069471053779,
"rewards/accuracy_reward": 0.7745536044239998,
"rewards/format_reward": 0.0,
"step": 201,
"total_flos": 0.0,
"train_loss": 3.630263656748468e-05,
"train_runtime": 619.0643,
"train_samples_per_second": 289.469,
"train_steps_per_second": 0.323
}
],
"logging_steps": 5,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}