Qwen-2.5-7B-Simple-RL-E4 / trainer_state.json
chenggong1995's picture
Model save
97449fc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.992,
"eval_steps": 100,
"global_step": 372,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 603.5946578979492,
"epoch": 0.010666666666666666,
"grad_norm": 0.5514267683029175,
"kl": 0.0,
"learning_rate": 7.894736842105262e-08,
"loss": 0.02,
"reward": 0.6678571403026581,
"reward_std": 0.3216256983578205,
"rewards/accuracy_reward": 0.6571428701281548,
"rewards/format_reward": 0.010714285774156451,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 601.3660755157471,
"epoch": 0.05333333333333334,
"grad_norm": 0.26260125637054443,
"kl": 0.00015875697135925293,
"learning_rate": 3.9473684210526315e-07,
"loss": 0.0497,
"reward": 0.6678571440279484,
"reward_std": 0.31925761327147484,
"rewards/accuracy_reward": 0.6620535738766193,
"rewards/format_reward": 0.0058035714901052415,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 580.2717948913574,
"epoch": 0.10666666666666667,
"grad_norm": 0.41621726751327515,
"kl": 0.00022451877593994142,
"learning_rate": 7.894736842105263e-07,
"loss": 0.04,
"reward": 0.6660714320838451,
"reward_std": 0.3105974230915308,
"rewards/accuracy_reward": 0.6582142896950245,
"rewards/format_reward": 0.007857142924331128,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 601.5649993896484,
"epoch": 0.16,
"grad_norm": 0.8033285737037659,
"kl": 0.0008542776107788086,
"learning_rate": 1.1842105263157894e-06,
"loss": 0.0401,
"reward": 0.6753571435809136,
"reward_std": 0.3171094346791506,
"rewards/accuracy_reward": 0.6692857146263123,
"rewards/format_reward": 0.006071428628638386,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 592.3267883300781,
"epoch": 0.21333333333333335,
"grad_norm": 5.2608418464660645,
"kl": 0.002477073669433594,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.0603,
"reward": 0.724999999254942,
"reward_std": 0.28942007161676886,
"rewards/accuracy_reward": 0.7196428619325161,
"rewards/format_reward": 0.005357142887078226,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 595.3885772705078,
"epoch": 0.26666666666666666,
"grad_norm": 1.9151082038879395,
"kl": 0.004088020324707032,
"learning_rate": 1.973684210526316e-06,
"loss": 0.0597,
"reward": 0.736785712838173,
"reward_std": 0.25792951658368113,
"rewards/accuracy_reward": 0.7317857101559639,
"rewards/format_reward": 0.005000000027939678,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 617.260718536377,
"epoch": 0.32,
"grad_norm": 0.24098791182041168,
"kl": 0.007413291931152343,
"learning_rate": 2.368421052631579e-06,
"loss": 0.0587,
"reward": 0.7442857131361962,
"reward_std": 0.22607315629720687,
"rewards/accuracy_reward": 0.7432142853736877,
"rewards/format_reward": 0.0010714285774156452,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 582.1746444702148,
"epoch": 0.37333333333333335,
"grad_norm": 0.15359072387218475,
"kl": 0.002556610107421875,
"learning_rate": 2.763157894736842e-06,
"loss": 0.0581,
"reward": 0.7835714235901833,
"reward_std": 0.20161447767168283,
"rewards/accuracy_reward": 0.7803571403026581,
"rewards/format_reward": 0.0032142857555299996,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 600.7846466064453,
"epoch": 0.4266666666666667,
"grad_norm": 0.26131102442741394,
"kl": 0.0032720565795898438,
"learning_rate": 2.9997345912364375e-06,
"loss": 0.0606,
"reward": 0.7528571419417858,
"reward_std": 0.21564750857651233,
"rewards/accuracy_reward": 0.7450000010430813,
"rewards/format_reward": 0.007857142924331128,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 576.6367889404297,
"epoch": 0.48,
"grad_norm": 0.3388746976852417,
"kl": 0.006422805786132813,
"learning_rate": 2.996749821181634e-06,
"loss": 0.0272,
"reward": 0.8014285683631897,
"reward_std": 0.2300501298159361,
"rewards/accuracy_reward": 0.7671428561210633,
"rewards/format_reward": 0.034285714849829674,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 597.5764289855957,
"epoch": 0.5333333333333333,
"grad_norm": 0.31223493814468384,
"kl": 0.011295318603515625,
"learning_rate": 2.9904551426434754e-06,
"loss": 0.0452,
"reward": 0.856071425974369,
"reward_std": 0.3176830269396305,
"rewards/accuracy_reward": 0.746785718202591,
"rewards/format_reward": 0.10928571680560709,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 574.4592880249023,
"epoch": 0.5866666666666667,
"grad_norm": 0.28976941108703613,
"kl": 0.01858978271484375,
"learning_rate": 2.980864475656959e-06,
"loss": 0.0357,
"reward": 0.9096428632736206,
"reward_std": 0.35986471064388753,
"rewards/accuracy_reward": 0.7403571456670761,
"rewards/format_reward": 0.16928571905009449,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 565.7932189941406,
"epoch": 0.64,
"grad_norm": 0.5990632176399231,
"kl": 0.06176605224609375,
"learning_rate": 2.9679990289969723e-06,
"loss": 0.026,
"reward": 0.9946428626775742,
"reward_std": 0.41035427935421465,
"rewards/accuracy_reward": 0.7496428593993187,
"rewards/format_reward": 0.24500000569969416,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 594.3525047302246,
"epoch": 0.6933333333333334,
"grad_norm": 0.7594552636146545,
"kl": 0.041387939453125,
"learning_rate": 2.951887253277264e-06,
"loss": 0.0353,
"reward": 0.9875000134110451,
"reward_std": 0.40259603410959244,
"rewards/accuracy_reward": 0.7228571429848671,
"rewards/format_reward": 0.2646428645588458,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 618.348575592041,
"epoch": 0.7466666666666667,
"grad_norm": 0.4860813319683075,
"kl": 0.0418487548828125,
"learning_rate": 2.9325647780348364e-06,
"loss": 0.0408,
"reward": 0.9564285814762116,
"reward_std": 0.4505664937198162,
"rewards/accuracy_reward": 0.7017857164144516,
"rewards/format_reward": 0.25464286394417285,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 629.8578582763672,
"epoch": 0.8,
"grad_norm": 1.0445483922958374,
"kl": 0.095233154296875,
"learning_rate": 2.9100743329388826e-06,
"loss": 0.0442,
"reward": 0.8553571477532387,
"reward_std": 0.5062661200761795,
"rewards/accuracy_reward": 0.6053571484982967,
"rewards/format_reward": 0.25000000689178703,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 614.864640045166,
"epoch": 0.8533333333333334,
"grad_norm": 7.832869052886963,
"kl": 0.3074462890625,
"learning_rate": 2.884465653298514e-06,
"loss": 0.0442,
"reward": 0.6000000044703484,
"reward_std": 0.5314710065722466,
"rewards/accuracy_reward": 0.45214286893606187,
"rewards/format_reward": 0.1478571461746469,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 607.6428573608398,
"epoch": 0.9066666666666666,
"grad_norm": 2.0023958683013916,
"kl": 0.2650146484375,
"learning_rate": 2.8557953700782305e-06,
"loss": 0.0802,
"reward": 0.4507142949849367,
"reward_std": 0.39981199279427526,
"rewards/accuracy_reward": 0.44285715110599994,
"rewards/format_reward": 0.007857142924331128,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 638.4660736083985,
"epoch": 0.96,
"grad_norm": 12.028424263000488,
"kl": 2.06630859375,
"learning_rate": 2.8241268846643613e-06,
"loss": 0.2324,
"reward": 0.5167857263237238,
"reward_std": 0.38161189407110213,
"rewards/accuracy_reward": 0.5167857263237238,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 645.2714347839355,
"epoch": 1.0213333333333334,
"grad_norm": 12.778322219848633,
"kl": 2.92353515625,
"learning_rate": 2.789530228659411e-06,
"loss": 0.32,
"reward": 0.5471428662538529,
"reward_std": 0.3661020591855049,
"rewards/accuracy_reward": 0.5467857226729393,
"rewards/format_reward": 0.00035714285913854835,
"step": 95
},
{
"epoch": 1.0746666666666667,
"grad_norm": 103.01477813720703,
"learning_rate": 2.7520819090143655e-06,
"loss": 0.3999,
"step": 100
},
{
"epoch": 1.0746666666666667,
"eval_clip_ratio": 0.0,
"eval_completion_length": 641.2306123046875,
"eval_kl": 2.0040703125,
"eval_loss": 0.2535090744495392,
"eval_reward": 0.5025428644418717,
"eval_reward_std": 0.32912875032424926,
"eval_rewards/accuracy_reward": 0.5025142929553985,
"eval_rewards/format_reward": 2.857142873108387e-05,
"eval_runtime": 6399.9846,
"eval_samples_per_second": 0.781,
"eval_steps_per_second": 0.011,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 653.1387554168701,
"epoch": 1.1280000000000001,
"grad_norm": 10.29504108428955,
"kl": 3.306494140625,
"learning_rate": 2.711864738841427e-06,
"loss": 0.313,
"reward": 0.5321428634226322,
"reward_std": 0.36240698825567963,
"rewards/accuracy_reward": 0.5319642923772335,
"rewards/format_reward": 0.00017857142956927418,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 622.9542854309082,
"epoch": 1.1813333333333333,
"grad_norm": 4.739567756652832,
"kl": 2.80986328125,
"learning_rate": 2.668967654281324e-06,
"loss": 0.3163,
"reward": 0.5210714336484671,
"reward_std": 0.3654427368193865,
"rewards/accuracy_reward": 0.5210714336484671,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 633.2053619384766,
"epoch": 1.2346666666666666,
"grad_norm": 7.334438800811768,
"kl": 2.38583984375,
"learning_rate": 2.6234855178301717e-06,
"loss": 0.2608,
"reward": 0.5717857219278812,
"reward_std": 0.34984773173928263,
"rewards/accuracy_reward": 0.5717857219278812,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 622.6389305114747,
"epoch": 1.288,
"grad_norm": 5.251643657684326,
"kl": 7.957421875,
"learning_rate": 2.5755189085608046e-06,
"loss": 0.6151,
"reward": 0.56142857670784,
"reward_std": 0.36079162210226057,
"rewards/accuracy_reward": 0.56142857670784,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 608.3346481323242,
"epoch": 1.3413333333333333,
"grad_norm": 6.392853736877441,
"kl": 3.24140625,
"learning_rate": 2.5251738997024913e-06,
"loss": 0.3223,
"reward": 0.6085714355111123,
"reward_std": 0.31923084184527395,
"rewards/accuracy_reward": 0.6085714355111123,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 579.453929901123,
"epoch": 1.3946666666666667,
"grad_norm": 4.198450088500977,
"kl": 2.5896484375,
"learning_rate": 2.4725618240708804e-06,
"loss": 0.2742,
"reward": 0.699642863869667,
"reward_std": 0.297719220072031,
"rewards/accuracy_reward": 0.699642863869667,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 581.9385719299316,
"epoch": 1.448,
"grad_norm": 12.581628799438477,
"kl": 3.10703125,
"learning_rate": 2.417799027866917e-06,
"loss": 0.277,
"reward": 0.6610714331269264,
"reward_std": 0.3010447319597006,
"rewards/accuracy_reward": 0.6610714331269264,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 623.4474975585938,
"epoch": 1.5013333333333332,
"grad_norm": 16.22210121154785,
"kl": 3.727734375,
"learning_rate": 2.3610066133891706e-06,
"loss": 0.3426,
"reward": 0.6021428607404232,
"reward_std": 0.3193087562918663,
"rewards/accuracy_reward": 0.6021428607404232,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 604.1760726928711,
"epoch": 1.5546666666666666,
"grad_norm": 7.088801383972168,
"kl": 2.53134765625,
"learning_rate": 2.3023101712285398e-06,
"loss": 0.2652,
"reward": 0.620357146114111,
"reward_std": 0.31110015958547593,
"rewards/accuracy_reward": 0.620357146114111,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 598.1307167053222,
"epoch": 1.608,
"grad_norm": 6.647467136383057,
"kl": 3.04794921875,
"learning_rate": 2.241839502537563e-06,
"loss": 0.2696,
"reward": 0.6471428565680981,
"reward_std": 0.32793208621442316,
"rewards/accuracy_reward": 0.6471428565680981,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 591.0500015258789,
"epoch": 1.6613333333333333,
"grad_norm": 3.569319009780884,
"kl": 3.45087890625,
"learning_rate": 2.179728331988501e-06,
"loss": 0.2897,
"reward": 0.6439285717904568,
"reward_std": 0.3045485220849514,
"rewards/accuracy_reward": 0.6439285717904568,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 592.1617858886718,
"epoch": 1.7146666666666666,
"grad_norm": 6.636693954467773,
"kl": 3.13828125,
"learning_rate": 2.116114012054961e-06,
"loss": 0.2941,
"reward": 0.6228571489453316,
"reward_std": 0.3313132245093584,
"rewards/accuracy_reward": 0.6228571489453316,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 593.5107170104981,
"epoch": 1.768,
"grad_norm": 4.461601257324219,
"kl": 2.5765625,
"learning_rate": 2.0511372192710126e-06,
"loss": 0.2504,
"reward": 0.6128571487963199,
"reward_std": 0.3229567937552929,
"rewards/accuracy_reward": 0.6128571487963199,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 573.807861328125,
"epoch": 1.8213333333333335,
"grad_norm": 9.668325424194336,
"kl": 2.962890625,
"learning_rate": 1.984941643139478e-06,
"loss": 0.2452,
"reward": 0.665357144922018,
"reward_std": 0.28491050042212007,
"rewards/accuracy_reward": 0.665357144922018,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 593.6103607177735,
"epoch": 1.8746666666666667,
"grad_norm": 7.393840312957764,
"kl": 2.665625,
"learning_rate": 1.9176736683773613e-06,
"loss": 0.2629,
"reward": 0.6667857177555561,
"reward_std": 0.29318542815744875,
"rewards/accuracy_reward": 0.6667857177555561,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 574.5892898559571,
"epoch": 1.928,
"grad_norm": 4.1402411460876465,
"kl": 2.0453125,
"learning_rate": 1.8494820512010797e-06,
"loss": 0.2006,
"reward": 0.651785721629858,
"reward_std": 0.29290417619049547,
"rewards/accuracy_reward": 0.651785721629858,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 589.5732147216797,
"epoch": 1.9813333333333332,
"grad_norm": 8.444381713867188,
"kl": 3.840625,
"learning_rate": 1.780517590367375e-06,
"loss": 0.2613,
"reward": 0.6317857228219509,
"reward_std": 0.3020774323493242,
"rewards/accuracy_reward": 0.6317857228219509,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 583.2778633117675,
"epoch": 2.042666666666667,
"grad_norm": 6.335790157318115,
"kl": 2.2416015625,
"learning_rate": 1.7109327936973479e-06,
"loss": 0.2096,
"reward": 0.6082142978906632,
"reward_std": 0.31343335174024106,
"rewards/accuracy_reward": 0.6082142978906632,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 544.5621482849122,
"epoch": 2.096,
"grad_norm": 5.383208274841309,
"kl": 2.64853515625,
"learning_rate": 1.6408815408210818e-06,
"loss": 0.1812,
"reward": 0.6771428674459458,
"reward_std": 0.29982126969844103,
"rewards/accuracy_reward": 0.6771428674459458,
"rewards/format_reward": 0.0,
"step": 195
},
{
"epoch": 2.1493333333333333,
"grad_norm": 2.0144202709198,
"learning_rate": 1.5705187428886465e-06,
"loss": 0.1637,
"step": 200
},
{
"epoch": 2.1493333333333333,
"eval_clip_ratio": 0.0,
"eval_completion_length": 549.7326699829101,
"eval_kl": 1.38025830078125,
"eval_loss": 0.1164696142077446,
"eval_reward": 0.6539142909049988,
"eval_reward_std": 0.24548907431960107,
"eval_rewards/accuracy_reward": 0.6538857194185257,
"eval_rewards/format_reward": 2.857142873108387e-05,
"eval_runtime": 6160.5151,
"eval_samples_per_second": 0.812,
"eval_steps_per_second": 0.012,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 559.5107177734375,
"epoch": 2.2026666666666666,
"grad_norm": 6.055785655975342,
"kl": 1.8238037109375,
"learning_rate": 1.5e-06,
"loss": 0.1698,
"reward": 0.7205357164144516,
"reward_std": 0.25796807631850244,
"rewards/accuracy_reward": 0.7205357164144516,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 565.1600074768066,
"epoch": 2.2560000000000002,
"grad_norm": 3.3561553955078125,
"kl": 1.442333984375,
"learning_rate": 1.429481257111354e-06,
"loss": 0.1369,
"reward": 0.7003571465611458,
"reward_std": 0.24939093235880136,
"rewards/accuracy_reward": 0.7003571465611458,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 566.3482208251953,
"epoch": 2.3093333333333335,
"grad_norm": 3.942612648010254,
"kl": 1.7908203125,
"learning_rate": 1.3591184591789185e-06,
"loss": 0.1523,
"reward": 0.6507142879068851,
"reward_std": 0.2614848371595144,
"rewards/accuracy_reward": 0.6507142879068851,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 568.5274993896485,
"epoch": 2.3626666666666667,
"grad_norm": 2.1146202087402344,
"kl": 1.714697265625,
"learning_rate": 1.289067206302653e-06,
"loss": 0.1557,
"reward": 0.7114285722374916,
"reward_std": 0.27491063699126245,
"rewards/accuracy_reward": 0.7114285722374916,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 569.0960746765137,
"epoch": 2.416,
"grad_norm": 3.0883631706237793,
"kl": 2.18818359375,
"learning_rate": 1.2194824096326252e-06,
"loss": 0.1565,
"reward": 0.6735714331269265,
"reward_std": 0.30523640997707846,
"rewards/accuracy_reward": 0.6735714331269265,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 550.5975044250488,
"epoch": 2.469333333333333,
"grad_norm": 2.8128914833068848,
"kl": 2.0109375,
"learning_rate": 1.1505179487989203e-06,
"loss": 0.1719,
"reward": 0.6689285777509213,
"reward_std": 0.3086254850029945,
"rewards/accuracy_reward": 0.6689285777509213,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 569.1417877197266,
"epoch": 2.522666666666667,
"grad_norm": 6.377042770385742,
"kl": 2.2880859375,
"learning_rate": 1.0823263316226388e-06,
"loss": 0.1757,
"reward": 0.6125000007450581,
"reward_std": 0.3144677709788084,
"rewards/accuracy_reward": 0.6125000007450581,
"rewards/format_reward": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 573.9864295959472,
"epoch": 2.576,
"grad_norm": 3.2370288372039795,
"kl": 1.9962890625,
"learning_rate": 1.0150583568605221e-06,
"loss": 0.1604,
"reward": 0.6492857195436954,
"reward_std": 0.2943309750407934,
"rewards/accuracy_reward": 0.6492857195436954,
"rewards/format_reward": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 575.0525032043457,
"epoch": 2.6293333333333333,
"grad_norm": 1.6219196319580078,
"kl": 2.58681640625,
"learning_rate": 9.488627807289882e-07,
"loss": 0.2006,
"reward": 0.6407142907381058,
"reward_std": 0.3086009453982115,
"rewards/accuracy_reward": 0.6403571471571923,
"rewards/format_reward": 0.00035714285913854835,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 542.7453628540039,
"epoch": 2.6826666666666665,
"grad_norm": 5.958657741546631,
"kl": 1.9966796875,
"learning_rate": 8.838859879450389e-07,
"loss": 0.1441,
"reward": 0.6871428593993187,
"reward_std": 0.2942324198782444,
"rewards/accuracy_reward": 0.6871428593993187,
"rewards/format_reward": 0.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 575.7157119750976,
"epoch": 2.7359999999999998,
"grad_norm": 6.527657985687256,
"kl": 2.8220703125,
"learning_rate": 8.202716680115e-07,
"loss": 0.2219,
"reward": 0.6150000065565109,
"reward_std": 0.31769666373729705,
"rewards/accuracy_reward": 0.6150000065565109,
"rewards/format_reward": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 557.6946426391602,
"epoch": 2.7893333333333334,
"grad_norm": 1.3546433448791504,
"kl": 2.30390625,
"learning_rate": 7.581604974624371e-07,
"loss": 0.1946,
"reward": 0.6285714358091354,
"reward_std": 0.3216796424239874,
"rewards/accuracy_reward": 0.6285714358091354,
"rewards/format_reward": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 558.1150016784668,
"epoch": 2.8426666666666667,
"grad_norm": 5.478274822235107,
"kl": 2.551171875,
"learning_rate": 6.976898287714604e-07,
"loss": 0.1833,
"reward": 0.6185714341700077,
"reward_std": 0.28562614060938357,
"rewards/accuracy_reward": 0.6185714341700077,
"rewards/format_reward": 0.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 573.2803550720215,
"epoch": 2.896,
"grad_norm": 3.3262879848480225,
"kl": 2.02880859375,
"learning_rate": 6.389933866108296e-07,
"loss": 0.1675,
"reward": 0.6442857190966607,
"reward_std": 0.31180151328444483,
"rewards/accuracy_reward": 0.6442857190966607,
"rewards/format_reward": 0.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 569.5950019836425,
"epoch": 2.9493333333333336,
"grad_norm": 4.308130741119385,
"kl": 2.551171875,
"learning_rate": 5.822009721330832e-07,
"loss": 0.198,
"reward": 0.5975000038743019,
"reward_std": 0.30098386816680434,
"rewards/accuracy_reward": 0.5975000038743019,
"rewards/format_reward": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 563.0910797119141,
"epoch": 3.010666666666667,
"grad_norm": 3.7203142642974854,
"kl": 2.0693359375,
"learning_rate": 5.2743817592912e-07,
"loss": 0.1603,
"reward": 0.5892857205122709,
"reward_std": 0.31964287385344503,
"rewards/accuracy_reward": 0.5892857205122709,
"rewards/format_reward": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 557.4032203674317,
"epoch": 3.064,
"grad_norm": 16.83099365234375,
"kl": 2.18056640625,
"learning_rate": 4.7482610029750927e-07,
"loss": 0.1523,
"reward": 0.628214293718338,
"reward_std": 0.3107015870511532,
"rewards/accuracy_reward": 0.628214293718338,
"rewards/format_reward": 0.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 568.0471504211425,
"epoch": 3.1173333333333333,
"grad_norm": 3.361072540283203,
"kl": 2.8162109375,
"learning_rate": 4.244810914391956e-07,
"loss": 0.1969,
"reward": 0.6132142916321754,
"reward_std": 0.3081982746720314,
"rewards/accuracy_reward": 0.6132142916321754,
"rewards/format_reward": 0.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 568.8232192993164,
"epoch": 3.1706666666666665,
"grad_norm": 2.53417706489563,
"kl": 2.3166015625,
"learning_rate": 3.7651448216982855e-07,
"loss": 0.183,
"reward": 0.586428577452898,
"reward_std": 0.3080826660618186,
"rewards/accuracy_reward": 0.586428577452898,
"rewards/format_reward": 0.0,
"step": 295
},
{
"epoch": 3.224,
"grad_norm": 2.580390453338623,
"learning_rate": 3.3103234571867633e-07,
"loss": 0.1789,
"step": 300
},
{
"epoch": 3.224,
"eval_clip_ratio": 0.0,
"eval_completion_length": 557.1966226196289,
"eval_kl": 2.1642421875,
"eval_loss": 0.16605359315872192,
"eval_reward": 0.574000006556511,
"eval_reward_std": 0.2975393404364586,
"eval_rewards/accuracy_reward": 0.574000006556511,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6248.4374,
"eval_samples_per_second": 0.8,
"eval_steps_per_second": 0.012,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 566.4307189941406,
"epoch": 3.2773333333333334,
"grad_norm": 3.120903253555298,
"kl": 2.376708984375,
"learning_rate": 2.8813526115857293e-07,
"loss": 0.1996,
"reward": 0.6005357194691896,
"reward_std": 0.32622543424367906,
"rewards/accuracy_reward": 0.6005357194691896,
"rewards/format_reward": 0.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 572.9203590393066,
"epoch": 3.3306666666666667,
"grad_norm": 3.122821807861328,
"kl": 2.814453125,
"learning_rate": 2.479180909856347e-07,
"loss": 0.2152,
"reward": 0.5971428655087948,
"reward_std": 0.31468545868992803,
"rewards/accuracy_reward": 0.5971428655087948,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 581.5917861938476,
"epoch": 3.384,
"grad_norm": 1.9867832660675049,
"kl": 2.5201171875,
"learning_rate": 2.104697713405892e-07,
"loss": 0.2019,
"reward": 0.5767857223749161,
"reward_std": 0.3214090891182423,
"rewards/accuracy_reward": 0.5767857223749161,
"rewards/format_reward": 0.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 553.7017860412598,
"epoch": 3.437333333333333,
"grad_norm": 3.077442169189453,
"kl": 1.91484375,
"learning_rate": 1.7587311533563887e-07,
"loss": 0.1889,
"reward": 0.6357142955064774,
"reward_std": 0.31964287273585795,
"rewards/accuracy_reward": 0.6357142955064774,
"rewards/format_reward": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 575.9078590393067,
"epoch": 3.490666666666667,
"grad_norm": 4.862219333648682,
"kl": 2.2921875,
"learning_rate": 1.4420462992176975e-07,
"loss": 0.1884,
"reward": 0.592500003427267,
"reward_std": 0.3214297264814377,
"rewards/accuracy_reward": 0.592500003427267,
"rewards/format_reward": 0.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 583.7278625488282,
"epoch": 3.544,
"grad_norm": 2.4073171615600586,
"kl": 2.33125,
"learning_rate": 1.1553434670148605e-07,
"loss": 0.2008,
"reward": 0.6000000059604644,
"reward_std": 0.317187948897481,
"rewards/accuracy_reward": 0.6000000059604644,
"rewards/format_reward": 0.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 575.1992881774902,
"epoch": 3.5973333333333333,
"grad_norm": 2.900181770324707,
"kl": 2.4134765625,
"learning_rate": 8.992566706111727e-08,
"loss": 0.2086,
"reward": 0.6085714355111123,
"reward_std": 0.3346530221402645,
"rewards/accuracy_reward": 0.6085714355111123,
"rewards/format_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 563.7471504211426,
"epoch": 3.6506666666666665,
"grad_norm": 3.253260612487793,
"kl": 2.58330078125,
"learning_rate": 6.743522196516388e-08,
"loss": 0.2354,
"reward": 0.5932142935693264,
"reward_std": 0.33793471828103067,
"rewards/accuracy_reward": 0.5932142935693264,
"rewards/format_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 572.4153633117676,
"epoch": 3.7039999999999997,
"grad_norm": 2.292677402496338,
"kl": 2.8171875,
"learning_rate": 4.811274672273652e-08,
"loss": 0.231,
"reward": 0.6085714347660541,
"reward_std": 0.33270861953496933,
"rewards/accuracy_reward": 0.6085714347660541,
"rewards/format_reward": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 562.8014366149903,
"epoch": 3.7573333333333334,
"grad_norm": 2.846649646759033,
"kl": 2.5126953125,
"learning_rate": 3.200097100302812e-08,
"loss": 0.1962,
"reward": 0.5917857199907303,
"reward_std": 0.3207391370087862,
"rewards/accuracy_reward": 0.5917857199907303,
"rewards/format_reward": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 579.4639297485352,
"epoch": 3.8106666666666666,
"grad_norm": 3.3534533977508545,
"kl": 2.56982421875,
"learning_rate": 1.9135524343040946e-08,
"loss": 0.2111,
"reward": 0.5914285771548748,
"reward_std": 0.3343843434005976,
"rewards/accuracy_reward": 0.5914285771548748,
"rewards/format_reward": 0.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 570.3239334106445,
"epoch": 3.864,
"grad_norm": 3.263777256011963,
"kl": 2.66884765625,
"learning_rate": 9.54485735652455e-09,
"loss": 0.2102,
"reward": 0.5882142938673496,
"reward_std": 0.32250265777111053,
"rewards/accuracy_reward": 0.5882142938673496,
"rewards/format_reward": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 569.3417884826661,
"epoch": 3.9173333333333336,
"grad_norm": 3.1193442344665527,
"kl": 2.36357421875,
"learning_rate": 3.2501788183657564e-09,
"loss": 0.18,
"reward": 0.6117857240140439,
"reward_std": 0.32440952584147453,
"rewards/accuracy_reward": 0.6117857240140439,
"rewards/format_reward": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 574.9478591918945,
"epoch": 3.970666666666667,
"grad_norm": 1.893235206604004,
"kl": 2.492578125,
"learning_rate": 2.6540876356256906e-10,
"loss": 0.2069,
"reward": 0.6153571508824826,
"reward_std": 0.3225582234561443,
"rewards/accuracy_reward": 0.6153571508824826,
"rewards/format_reward": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 523.5830421447754,
"epoch": 3.992,
"kl": 2.380859375,
"reward": 0.5919642969965935,
"reward_std": 0.32083501014858484,
"rewards/accuracy_reward": 0.5919642969965935,
"rewards/format_reward": 0.0,
"step": 372,
"total_flos": 0.0,
"train_loss": 0.18514081492258977,
"train_runtime": 65688.6828,
"train_samples_per_second": 0.457,
"train_steps_per_second": 0.006
}
],
"logging_steps": 5,
"max_steps": 372,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 14,
"trial_name": null,
"trial_params": null
}