Qwen-2.5-7B-Simple-RL / trainer_state.json
kekema19's picture
Model save
f169e5b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9211087420042645,
"eval_steps": 100,
"global_step": 170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 613.281721496582,
"epoch": 0.08528784648187633,
"grad_norm": 4.2232208251953125,
"kl": 0.00017681121826171876,
"learning_rate": 2.5e-06,
"loss": 0.0,
"reward": 0.6537946701049805,
"reward_std": 0.3218739811331034,
"rewards/accuracy_reward": 0.6537946701049805,
"rewards/format_reward2": 0.0,
"step": 5
},
{
"completion_length": 618.741323852539,
"epoch": 0.17057569296375266,
"grad_norm": 0.7694841027259827,
"kl": 0.23766450881958007,
"learning_rate": 2.956412726139078e-06,
"loss": 0.0095,
"reward": 0.7125000342726707,
"reward_std": 0.26975345350801944,
"rewards/accuracy_reward": 0.7125000342726707,
"rewards/format_reward2": 0.0,
"step": 10
},
{
"completion_length": 625.0417678833007,
"epoch": 0.255863539445629,
"grad_norm": 0.11407410353422165,
"kl": 0.00294036865234375,
"learning_rate": 2.7836719084521715e-06,
"loss": 0.0001,
"reward": 0.7500000342726707,
"reward_std": 0.2328610809519887,
"rewards/accuracy_reward": 0.7500000342726707,
"rewards/format_reward2": 0.0,
"step": 15
},
{
"completion_length": 604.2779296875,
"epoch": 0.3411513859275053,
"grad_norm": 0.10821383446455002,
"kl": 0.0032527923583984377,
"learning_rate": 2.4946839873611927e-06,
"loss": 0.0001,
"reward": 0.7553571820259094,
"reward_std": 0.21391915678977966,
"rewards/accuracy_reward": 0.7553571820259094,
"rewards/format_reward2": 0.0,
"step": 20
},
{
"completion_length": 611.8196701049804,
"epoch": 0.42643923240938164,
"grad_norm": 0.1587938815355301,
"kl": 0.003980827331542969,
"learning_rate": 2.1156192081791355e-06,
"loss": 0.0002,
"reward": 0.755357176065445,
"reward_std": 0.1927801643498242,
"rewards/accuracy_reward": 0.755357176065445,
"rewards/format_reward2": 0.0,
"step": 25
},
{
"completion_length": 614.662525177002,
"epoch": 0.511727078891258,
"grad_norm": 0.1258433312177658,
"kl": 0.0038448333740234374,
"learning_rate": 1.6808050203829845e-06,
"loss": 0.0002,
"reward": 0.7575893223285675,
"reward_std": 0.18457154743373394,
"rewards/accuracy_reward": 0.7575893223285675,
"rewards/format_reward2": 0.0,
"step": 30
},
{
"completion_length": 599.8576171875,
"epoch": 0.5970149253731343,
"grad_norm": 0.088959701359272,
"kl": 0.004636383056640625,
"learning_rate": 1.2296174432791415e-06,
"loss": 0.0002,
"reward": 0.7524553865194321,
"reward_std": 0.1772271953523159,
"rewards/accuracy_reward": 0.7524553865194321,
"rewards/format_reward2": 0.0,
"step": 35
},
{
"completion_length": 587.4435562133789,
"epoch": 0.6823027718550106,
"grad_norm": 0.08021266013383865,
"kl": 0.004290008544921875,
"learning_rate": 8.029152419343472e-07,
"loss": 0.0002,
"reward": 0.7700893208384514,
"reward_std": 0.1737084028776735,
"rewards/accuracy_reward": 0.7700893208384514,
"rewards/format_reward2": 0.0,
"step": 40
},
{
"completion_length": 605.4605155944824,
"epoch": 0.767590618336887,
"grad_norm": 0.0632205456495285,
"kl": 0.003778839111328125,
"learning_rate": 4.3933982822017883e-07,
"loss": 0.0002,
"reward": 0.7537946745753288,
"reward_std": 0.1909334436058998,
"rewards/accuracy_reward": 0.7537946745753288,
"rewards/format_reward2": 0.0,
"step": 45
},
{
"completion_length": 606.9638671875,
"epoch": 0.8528784648187633,
"grad_norm": 0.08492382615804672,
"kl": 0.003963851928710937,
"learning_rate": 1.718159615201853e-07,
"loss": 0.0002,
"reward": 0.7578125342726707,
"reward_std": 0.1673258093651384,
"rewards/accuracy_reward": 0.7578125342726707,
"rewards/format_reward2": 0.0,
"step": 50
},
{
"completion_length": 599.2339523315429,
"epoch": 0.9381663113006397,
"grad_norm": 0.07253226637840271,
"kl": 0.011330032348632812,
"learning_rate": 2.4570139579284723e-08,
"loss": 0.0005,
"reward": 0.7895089671015739,
"reward_std": 0.17561167925596238,
"rewards/accuracy_reward": 0.7895089671015739,
"rewards/format_reward2": 0.0,
"step": 55
},
{
"completion_length": 578.6864166259766,
"epoch": 1.0341151385927505,
"grad_norm": 0.12633396685123444,
"kl": 0.003711700439453125,
"learning_rate": 5.358185854701909e-07,
"loss": 0.0001,
"reward": 0.7918527200818062,
"reward_std": 0.17118105152621865,
"rewards/accuracy_reward": 0.7918527200818062,
"rewards/format_reward4": 0.0,
"step": 60
},
{
"completion_length": 579.6618591308594,
"epoch": 1.1194029850746268,
"grad_norm": 0.0748809352517128,
"kl": 0.00350341796875,
"learning_rate": 3.0996998956314745e-07,
"loss": 0.0001,
"reward": 0.7732143223285675,
"reward_std": 0.1733042700216174,
"rewards/accuracy_reward": 0.7732143223285675,
"rewards/format_reward4": 0.0,
"step": 65
},
{
"completion_length": 564.5884208679199,
"epoch": 1.2046908315565032,
"grad_norm": 0.17819823324680328,
"kl": 0.003855133056640625,
"learning_rate": 1.405383194450251e-07,
"loss": 0.0002,
"reward": 0.8015625357627869,
"reward_std": 0.16864687129855155,
"rewards/accuracy_reward": 0.8015625357627869,
"rewards/format_reward4": 0.0,
"step": 70
},
{
"completion_length": 561.2647575378418,
"epoch": 1.2899786780383795,
"grad_norm": 0.08583667129278183,
"kl": 0.0035480499267578126,
"learning_rate": 3.5555989320099955e-08,
"loss": 0.0001,
"reward": 0.7986607506871224,
"reward_std": 0.15244121365249158,
"rewards/accuracy_reward": 0.7986607506871224,
"rewards/format_reward4": 0.0,
"step": 75
},
{
"completion_length": 572.7236892700196,
"epoch": 1.375266524520256,
"grad_norm": 0.13771697878837585,
"kl": 0.0034299850463867187,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.8002232491970063,
"reward_std": 0.15689483480527996,
"rewards/accuracy_reward": 0.8002232491970063,
"rewards/format_reward4": 0.0,
"step": 80
},
{
"completion_length": 578.9810539245606,
"epoch": 1.4605543710021323,
"grad_norm": 2.014723777770996,
"kl": 0.010892486572265625,
"learning_rate": 2.0096189432334195e-07,
"loss": 0.0004,
"reward": 0.7611607536673546,
"reward_std": 0.18238836526870728,
"rewards/accuracy_reward": 0.7611607536673546,
"rewards/format_reward4": 0.0,
"step": 85
},
{
"completion_length": 589.071681213379,
"epoch": 1.5458422174840085,
"grad_norm": 0.07733023166656494,
"kl": 0.00322265625,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0001,
"reward": 0.7718750327825546,
"reward_std": 0.16937124980613588,
"rewards/accuracy_reward": 0.7718750327825546,
"rewards/format_reward4": 0.0,
"step": 90
},
{
"completion_length": 584.4857421875,
"epoch": 1.6311300639658848,
"grad_norm": 0.08772465586662292,
"kl": 0.003212738037109375,
"learning_rate": 2.278837048168797e-08,
"loss": 0.0001,
"reward": 0.765625037252903,
"reward_std": 0.17046672012656927,
"rewards/accuracy_reward": 0.765625037252903,
"rewards/format_reward4": 0.0,
"step": 95
},
{
"completion_length": 584.8821662902832,
"epoch": 1.716417910447761,
"grad_norm": 0.09350935369729996,
"kl": 0.0032810211181640626,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.7743303939700127,
"reward_std": 0.17159467502497136,
"rewards/accuracy_reward": 0.7743303939700127,
"rewards/format_reward4": 0.0,
"step": 100
},
{
"epoch": 1.716417910447761,
"eval_completion_length": 572.5716213433507,
"eval_kl": 0.004271193434255192,
"eval_loss": 0.00017078800010494888,
"eval_reward": 0.6897820954124767,
"eval_reward_std": 0.20645368708589207,
"eval_rewards/accuracy_reward": 0.6897820954124767,
"eval_rewards/format_reward4": 0.0,
"eval_runtime": 10542.252,
"eval_samples_per_second": 0.474,
"eval_steps_per_second": 0.004,
"step": 100
},
{
"completion_length": 593.587084197998,
"epoch": 1.8017057569296375,
"grad_norm": 0.07390806823968887,
"kl": 0.00316009521484375,
"learning_rate": 3.2546120637356677e-07,
"loss": 0.0001,
"reward": 0.7497768238186836,
"reward_std": 0.18519118977710605,
"rewards/accuracy_reward": 0.7497768238186836,
"rewards/format_reward4": 0.0,
"step": 105
},
{
"completion_length": 589.4265899658203,
"epoch": 1.886993603411514,
"grad_norm": 0.07380172610282898,
"kl": 0.0030284881591796874,
"learning_rate": 2.1114787115667477e-07,
"loss": 0.0001,
"reward": 0.7691964611411095,
"reward_std": 0.1665174851194024,
"rewards/accuracy_reward": 0.7691964611411095,
"rewards/format_reward4": 0.0,
"step": 110
},
{
"completion_length": 581.2207862854004,
"epoch": 1.9722814498933903,
"grad_norm": 0.06949003785848618,
"kl": 0.003309822082519531,
"learning_rate": 1.2003083451176365e-07,
"loss": 0.0001,
"reward": 0.7843750312924385,
"reward_std": 0.17008549151942134,
"rewards/accuracy_reward": 0.7843750312924385,
"rewards/format_reward4": 0.0,
"step": 115
},
{
"completion_length": 576.8970560709636,
"epoch": 2.068230277185501,
"grad_norm": 0.0717669427394867,
"kl": 0.0029771592881944445,
"learning_rate": 5.374998819965654e-08,
"loss": 0.0001,
"reward": 0.7791667024294535,
"reward_std": 0.1727727702094449,
"rewards/accuracy_reward": 0.7791667024294535,
"rewards/format_reward4": 0.0,
"step": 120
},
{
"completion_length": 552.3944480895996,
"epoch": 2.1535181236673773,
"grad_norm": 3.9083805084228516,
"kl": 0.00341339111328125,
"learning_rate": 1.3498231131137295e-08,
"loss": 0.0001,
"reward": 0.7910714641213417,
"reward_std": 0.15105125531554223,
"rewards/accuracy_reward": 0.7910714641213417,
"rewards/format_reward4": 0.0,
"step": 125
},
{
"completion_length": 586.4814987182617,
"epoch": 2.2388059701492535,
"grad_norm": 0.08841477334499359,
"kl": 0.0033771514892578123,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.7645089626312256,
"reward_std": 0.16334964451380074,
"rewards/accuracy_reward": 0.7645089626312256,
"rewards/format_reward4": 0.0,
"step": 130
},
{
"completion_length": 562.0841804504395,
"epoch": 2.3240938166311302,
"grad_norm": 0.0749669224023819,
"kl": 0.003040122985839844,
"learning_rate": 3.709719800782133e-07,
"loss": 0.0001,
"reward": 0.8017857521772385,
"reward_std": 0.1651908096857369,
"rewards/accuracy_reward": 0.8017857521772385,
"rewards/format_reward4": 0.0,
"step": 135
},
{
"completion_length": 568.1428817749023,
"epoch": 2.4093816631130065,
"grad_norm": 0.06772708147764206,
"kl": 0.0032039642333984374,
"learning_rate": 2.757046314656676e-07,
"loss": 0.0001,
"reward": 0.799776816368103,
"reward_std": 0.15835139667615294,
"rewards/accuracy_reward": 0.799776816368103,
"rewards/format_reward4": 0.0,
"step": 140
},
{
"completion_length": 576.2410957336426,
"epoch": 2.4946695095948828,
"grad_norm": 0.07914450764656067,
"kl": 0.0030412673950195312,
"learning_rate": 1.9333050887001336e-07,
"loss": 0.0001,
"reward": 0.772544676065445,
"reward_std": 0.1686524854041636,
"rewards/accuracy_reward": 0.772544676065445,
"rewards/format_reward4": 0.0,
"step": 145
},
{
"completion_length": 580.0102951049805,
"epoch": 2.579957356076759,
"grad_norm": 0.09259422868490219,
"kl": 0.0027721405029296877,
"learning_rate": 1.2471710571470578e-07,
"loss": 0.0001,
"reward": 0.7595982506871224,
"reward_std": 0.15487177977338434,
"rewards/accuracy_reward": 0.7595982506871224,
"rewards/format_reward4": 0.0,
"step": 150
},
{
"completion_length": 568.7931098937988,
"epoch": 2.6652452025586353,
"grad_norm": 0.33623555302619934,
"kl": 0.003169822692871094,
"learning_rate": 7.058699935926527e-08,
"loss": 0.0001,
"reward": 0.7776786103844643,
"reward_std": 0.1605815477669239,
"rewards/accuracy_reward": 0.7776786103844643,
"rewards/format_reward4": 0.0,
"step": 155
},
{
"completion_length": 597.3964576721191,
"epoch": 2.750533049040512,
"grad_norm": 0.07021904736757278,
"kl": 0.0028789520263671877,
"learning_rate": 3.151024153589321e-08,
"loss": 0.0001,
"reward": 0.7546875327825546,
"reward_std": 0.18458664841018618,
"rewards/accuracy_reward": 0.7546875327825546,
"rewards/format_reward4": 0.0,
"step": 160
},
{
"completion_length": 582.8522552490234,
"epoch": 2.835820895522388,
"grad_norm": 0.10628537088632584,
"kl": 0.0029039382934570312,
"learning_rate": 7.898355054830719e-09,
"loss": 0.0001,
"reward": 0.7602678939700127,
"reward_std": 0.17046223413199185,
"rewards/accuracy_reward": 0.7602678939700127,
"rewards/format_reward4": 0.0,
"step": 165
},
{
"completion_length": 577.0710075378418,
"epoch": 2.9211087420042645,
"grad_norm": 0.09094743430614471,
"kl": 0.0032497406005859374,
"learning_rate": 0.0,
"loss": 0.0001,
"reward": 0.779241107404232,
"reward_std": 0.16634787572547793,
"rewards/accuracy_reward": 0.779241107404232,
"rewards/format_reward4": 0.0,
"step": 170
},
{
"epoch": 2.9211087420042645,
"step": 170,
"total_flos": 0.0,
"train_loss": 2.8551620540811734e-05,
"train_runtime": 11823.9785,
"train_samples_per_second": 12.882,
"train_steps_per_second": 0.014
}
],
"logging_steps": 5,
"max_steps": 170,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}