Qwen-2.5-7B-Simple-RL / trainer_state.json
PrinceOfDryRice's picture
Model save
832ba80 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9893390191897654,
"eval_steps": 100,
"global_step": 174,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 606.0714492797852,
"epoch": 0.005685856432125089,
"grad_norm": 0.4518553614616394,
"kl": 0.0,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0324,
"reward": 0.6272321715950966,
"reward_std": 0.3481696490198374,
"rewards/accuracy_reward": 0.6272321715950966,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.00032178488891076995,
"completion_length": 595.1618576049805,
"epoch": 0.028429282160625444,
"grad_norm": 0.3228553831577301,
"kl": 0.00016799569129943848,
"learning_rate": 8.333333333333334e-07,
"loss": 0.0257,
"reward": 0.616071455180645,
"reward_std": 0.3413039892911911,
"rewards/accuracy_reward": 0.616071455180645,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0005511968294740655,
"completion_length": 622.5971336364746,
"epoch": 0.05685856432125089,
"grad_norm": 0.5083388686180115,
"kl": 0.0019717693328857424,
"learning_rate": 1.6666666666666669e-06,
"loss": 0.0222,
"reward": 0.6210937760770321,
"reward_std": 0.34381177742034197,
"rewards/accuracy_reward": 0.6210937760770321,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0008204746780393179,
"completion_length": 605.372802734375,
"epoch": 0.08528784648187633,
"grad_norm": 3.3226027488708496,
"kl": 0.05862216949462891,
"learning_rate": 2.5e-06,
"loss": 0.0525,
"reward": 0.678571455180645,
"reward_std": 0.267782362177968,
"rewards/accuracy_reward": 0.678571455180645,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0004576380066282582,
"completion_length": 613.8236846923828,
"epoch": 0.11371712864250177,
"grad_norm": 0.32018226385116577,
"kl": 0.01548614501953125,
"learning_rate": 2.9987989426927397e-06,
"loss": 0.0725,
"reward": 0.7137277089059353,
"reward_std": 0.24586139619350433,
"rewards/accuracy_reward": 0.7137277089059353,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.00030219302170735317,
"completion_length": 607.8783798217773,
"epoch": 0.14214641080312723,
"grad_norm": 0.25089940428733826,
"kl": 0.004609870910644531,
"learning_rate": 2.9853091271324575e-06,
"loss": 0.0621,
"reward": 0.7572545073926449,
"reward_std": 0.22519285418093204,
"rewards/accuracy_reward": 0.7572545073926449,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0003304076311906101,
"completion_length": 593.8370819091797,
"epoch": 0.17057569296375266,
"grad_norm": 0.0912187322974205,
"kl": 0.004130172729492188,
"learning_rate": 2.9569635476423816e-06,
"loss": 0.0519,
"reward": 0.7421875298023224,
"reward_std": 0.22574844770133495,
"rewards/accuracy_reward": 0.7421875298023224,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.00020269225624360842,
"completion_length": 576.3767032623291,
"epoch": 0.19900497512437812,
"grad_norm": 0.2136966437101364,
"kl": 0.006443023681640625,
"learning_rate": 2.9140457110223196e-06,
"loss": 0.0436,
"reward": 0.7890625335276127,
"reward_std": 0.18770856643095613,
"rewards/accuracy_reward": 0.7890625335276127,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.00019782203207796556,
"completion_length": 575.7327270507812,
"epoch": 0.22743425728500355,
"grad_norm": 0.20037101209163666,
"kl": 0.005139541625976562,
"learning_rate": 2.8569848728646314e-06,
"loss": 0.0472,
"reward": 0.7767857424914837,
"reward_std": 0.17583139101043344,
"rewards/accuracy_reward": 0.7767857424914837,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.00027204428533877945,
"completion_length": 569.5413284301758,
"epoch": 0.255863539445629,
"grad_norm": 0.14864350855350494,
"kl": 0.00561370849609375,
"learning_rate": 2.786351744225906e-06,
"loss": 0.028,
"reward": 0.7868304029107094,
"reward_std": 0.1821051500737667,
"rewards/accuracy_reward": 0.7868304029107094,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.00019200436654500663,
"completion_length": 561.1612930297852,
"epoch": 0.28429282160625446,
"grad_norm": 0.18077002465724945,
"kl": 0.0052585601806640625,
"learning_rate": 2.70285278348946e-06,
"loss": 0.0254,
"reward": 0.8041295073926449,
"reward_std": 0.151426344178617,
"rewards/accuracy_reward": 0.8041295073926449,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0002560705053838319,
"completion_length": 578.782392501831,
"epoch": 0.31272210376687987,
"grad_norm": 0.13589029014110565,
"kl": 0.037276077270507815,
"learning_rate": 2.607323130510307e-06,
"loss": 0.0773,
"reward": 0.7767857499420643,
"reward_std": 0.17133072740398347,
"rewards/accuracy_reward": 0.7767857499420643,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.00025125542724708795,
"completion_length": 534.5167694091797,
"epoch": 0.3411513859275053,
"grad_norm": 0.13941466808319092,
"kl": 0.006060791015625,
"learning_rate": 2.5007182537138604e-06,
"loss": 0.0176,
"reward": 0.8069196790456772,
"reward_std": 0.14778494741767645,
"rewards/accuracy_reward": 0.8069196790456772,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.00017752129824657458,
"completion_length": 572.3789253234863,
"epoch": 0.3695806680881308,
"grad_norm": 0.12366141378879547,
"kl": 0.008144378662109375,
"learning_rate": 2.3841043936924138e-06,
"loss": 0.0264,
"reward": 0.7974330671131611,
"reward_std": 0.16408017370849848,
"rewards/accuracy_reward": 0.7974330671131611,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0002240494894067524,
"completion_length": 573.7991371154785,
"epoch": 0.39800995024875624,
"grad_norm": 0.3389054536819458,
"kl": 0.0106170654296875,
"learning_rate": 2.2586478988806294e-06,
"loss": 0.0254,
"reward": 0.7712053991854191,
"reward_std": 0.1784203890711069,
"rewards/accuracy_reward": 0.7712053991854191,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.00038792909363110083,
"completion_length": 561.3504638671875,
"epoch": 0.42643923240938164,
"grad_norm": 408230.84375,
"kl": 369.6084808349609,
"learning_rate": 2.1256035599724704e-06,
"loss": 20.6357,
"reward": 0.733258955180645,
"reward_std": 0.1863593803718686,
"rewards/accuracy_reward": 0.733258955180645,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0003454070749285165,
"completion_length": 570.4168720245361,
"epoch": 0.4548685145700071,
"grad_norm": 0.29222372174263,
"kl": 0.011145782470703126,
"learning_rate": 1.986302059756393e-06,
"loss": 0.0441,
"reward": 0.7550223544239998,
"reward_std": 0.19908471265807748,
"rewards/accuracy_reward": 0.7550223544239998,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0004956771896104329,
"completion_length": 582.5736961364746,
"epoch": 0.48329779673063256,
"grad_norm": 0.44193872809410095,
"kl": 0.020770263671875,
"learning_rate": 1.8421366638930446e-06,
"loss": 0.0448,
"reward": 0.7533482499420643,
"reward_std": 0.21024669613689184,
"rewards/accuracy_reward": 0.7533482499420643,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0008349075542355422,
"completion_length": 619.9062728881836,
"epoch": 0.511727078891258,
"grad_norm": 0.3540225625038147,
"kl": 0.032684326171875,
"learning_rate": 1.6945492857516126e-06,
"loss": 0.7814,
"reward": 0.6886161044239998,
"reward_std": 0.24419322237372398,
"rewards/accuracy_reward": 0.6886161044239998,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0012230878186528572,
"completion_length": 597.3794937133789,
"epoch": 0.5401563610518835,
"grad_norm": 0.8603457808494568,
"kl": 12.069256591796876,
"learning_rate": 1.545016064681567e-06,
"loss": 0.4244,
"reward": 0.6863839533179998,
"reward_std": 0.27123062824830413,
"rewards/accuracy_reward": 0.6863839533179998,
"rewards/format_reward": 0.0,
"step": 95
},
{
"epoch": 0.5685856432125089,
"grad_norm": 3.7414331436157227,
"learning_rate": 1.3950326019630044e-06,
"loss": 0.1093,
"step": 100
},
{
"epoch": 0.5685856432125089,
"eval_clip_ratio": 0.0,
"eval_completion_length": 597.2208096927728,
"eval_kl": 0.3533604046026358,
"eval_loss": 0.09554016590118408,
"eval_reward": 0.6108797613424234,
"eval_reward_std": 0.2930047449926599,
"eval_rewards/accuracy_reward": 0.6108797613424234,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 4704.6499,
"eval_samples_per_second": 1.063,
"eval_steps_per_second": 0.01,
"step": 100
},
{
"clip_ratio": 0.0036425826241611504,
"completion_length": 612.2809066772461,
"epoch": 0.5970149253731343,
"grad_norm": 3.4328973293304443,
"kl": 0.23067626953125,
"learning_rate": 1.246099002102669e-06,
"loss": 0.0648,
"reward": 0.6726190770665804,
"reward_std": 0.2866488757232825,
"rewards/accuracy_reward": 0.6726190770665804,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.002448439208092168,
"completion_length": 575.006721496582,
"epoch": 0.6254442075337597,
"grad_norm": 6.977348327636719,
"kl": 0.39283447265625,
"learning_rate": 1.0997048690896047e-06,
"loss": 0.1102,
"reward": 0.7215402089059353,
"reward_std": 0.2666306449100375,
"rewards/accuracy_reward": 0.7215402089059353,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0015252021345077082,
"completion_length": 571.7087326049805,
"epoch": 0.6538734896943852,
"grad_norm": 18.035612106323242,
"kl": 0.4377197265625,
"learning_rate": 9.57314407674877e-07,
"loss": 0.1127,
"reward": 0.726004496216774,
"reward_std": 0.2556505873799324,
"rewards/accuracy_reward": 0.726004496216774,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0011677921371301635,
"completion_length": 625.1451110839844,
"epoch": 0.6823027718550106,
"grad_norm": 11.473644256591797,
"kl": 0.5058837890625,
"learning_rate": 8.203517786893802e-07,
"loss": 0.1134,
"reward": 0.6819196790456772,
"reward_std": 0.2964627929031849,
"rewards/accuracy_reward": 0.6819196790456772,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0015482078888453543,
"completion_length": 606.9152030944824,
"epoch": 0.7107320540156361,
"grad_norm": 15.141037940979004,
"kl": 0.71937255859375,
"learning_rate": 6.901868548728988e-07,
"loss": 0.1873,
"reward": 0.7176339589059353,
"reward_std": 0.26815181598067284,
"rewards/accuracy_reward": 0.7176339589059353,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0011899313576577697,
"completion_length": 614.9598484039307,
"epoch": 0.7391613361762616,
"grad_norm": 19.52332878112793,
"kl": 0.57347412109375,
"learning_rate": 5.681215196817688e-07,
"loss": 0.1139,
"reward": 0.6986607536673546,
"reward_std": 0.26849195174872875,
"rewards/accuracy_reward": 0.6986607536673546,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.000972740968427388,
"completion_length": 574.955379486084,
"epoch": 0.767590618336887,
"grad_norm": 84.11812591552734,
"kl": 6.191357421875,
"learning_rate": 4.553766461117076e-07,
"loss": 0.5015,
"reward": 0.6919643133878708,
"reward_std": 0.27763173170387745,
"rewards/accuracy_reward": 0.6919643133878708,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0004989029925127397,
"completion_length": 600.3588371276855,
"epoch": 0.7960199004975125,
"grad_norm": 7.396132469177246,
"kl": 0.93154296875,
"learning_rate": 3.5307988577102803e-07,
"loss": 0.1247,
"reward": 0.6891741380095482,
"reward_std": 0.26927561219781637,
"rewards/accuracy_reward": 0.6891741380095482,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0012033914652420207,
"completion_length": 599.4185523986816,
"epoch": 0.8244491826581379,
"grad_norm": 229.0263214111328,
"kl": 1.950244140625,
"learning_rate": 2.6225439033546046e-07,
"loss": 0.1983,
"reward": 0.7025670036673546,
"reward_std": 0.2759845983237028,
"rewards/accuracy_reward": 0.7025670036673546,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.002918653353117406,
"completion_length": 614.3248138427734,
"epoch": 0.8528784648187633,
"grad_norm": 32.9469108581543,
"kl": 1.07236328125,
"learning_rate": 1.838085781903433e-07,
"loss": 0.1503,
"reward": 0.6852678880095482,
"reward_std": 0.2656104937195778,
"rewards/accuracy_reward": 0.6852678880095482,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0016865622681507375,
"completion_length": 595.2756938934326,
"epoch": 0.8813077469793887,
"grad_norm": 29.517351150512695,
"kl": 2.5845703125,
"learning_rate": 1.1852704861216351e-07,
"loss": 0.2295,
"reward": 0.7081473581492901,
"reward_std": 0.28944313153624535,
"rewards/accuracy_reward": 0.7081473581492901,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0005666015515089385,
"completion_length": 587.8806037902832,
"epoch": 0.9097370291400142,
"grad_norm": 34.70378494262695,
"kl": 0.88984375,
"learning_rate": 6.706273436399024e-08,
"loss": 0.1321,
"reward": 0.6880580671131611,
"reward_std": 0.3093922510743141,
"rewards/accuracy_reward": 0.6880580671131611,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.00019017228769371285,
"completion_length": 611.3214645385742,
"epoch": 0.9381663113006397,
"grad_norm": 20.277708053588867,
"kl": 0.973486328125,
"learning_rate": 2.993037119295694e-08,
"loss": 0.1165,
"reward": 0.746651828289032,
"reward_std": 0.2960765212774277,
"rewards/accuracy_reward": 0.746651828289032,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0001261272067495156,
"completion_length": 583.6373023986816,
"epoch": 0.9665955934612651,
"grad_norm": 26.07307243347168,
"kl": 3.51806640625,
"learning_rate": 7.501349546579329e-09,
"loss": 0.3474,
"reward": 0.7053571753203869,
"reward_std": 0.27511973679065704,
"rewards/accuracy_reward": 0.7053571753203869,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.00015486263464481453,
"completion_length": 589.4955673217773,
"epoch": 0.9893390191897654,
"kl": 0.896728515625,
"reward": 0.6707589626312256,
"reward_std": 0.2826360985636711,
"rewards/accuracy_reward": 0.6707589626312256,
"rewards/format_reward": 0.0,
"step": 174,
"total_flos": 0.0,
"train_loss": 0.7242226951338094,
"train_runtime": 14838.9521,
"train_samples_per_second": 0.505,
"train_steps_per_second": 0.012
}
],
"logging_steps": 5,
"max_steps": 175,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}