Qwen2.5-7B-GRPO-NM-COT-20K-2epoch / trainer_state.json
Haitao999's picture
Model save
05d7c11 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9965010496850945,
"eval_steps": 100,
"global_step": 89,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 776.6575088500977,
"epoch": 0.01119664100769769,
"grad_norm": 1.4259217977523804,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4336734637618065,
"reward_std": 0.13708234671503305,
"rewards/accuracy_reward": 0.4336734637618065,
"step": 1
},
{
"completion_length": 775.5478172302246,
"epoch": 0.02239328201539538,
"grad_norm": 1.6899404525756836,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3609693795442581,
"reward_std": 0.12760126357898116,
"rewards/accuracy_reward": 0.3609693795442581,
"step": 2
},
{
"completion_length": 770.4444999694824,
"epoch": 0.03358992302309307,
"grad_norm": 1.1099838018417358,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.41772958543151617,
"reward_std": 0.0995770595036447,
"rewards/accuracy_reward": 0.41772958543151617,
"step": 3
},
{
"completion_length": 763.253173828125,
"epoch": 0.04478656403079076,
"grad_norm": 1.734800100326538,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.42602039594203234,
"reward_std": 0.10463267145678401,
"rewards/accuracy_reward": 0.42602039594203234,
"step": 4
},
{
"completion_length": 728.0452613830566,
"epoch": 0.05598320503848846,
"grad_norm": 1.9766837358474731,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.412627543322742,
"reward_std": 0.1070892985444516,
"rewards/accuracy_reward": 0.412627543322742,
"step": 5
},
{
"completion_length": 771.2640113830566,
"epoch": 0.06717984604618614,
"grad_norm": 1.5028815269470215,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3877550968900323,
"reward_std": 0.11859837244264781,
"rewards/accuracy_reward": 0.3877550968900323,
"step": 6
},
{
"completion_length": 744.927921295166,
"epoch": 0.07837648705388384,
"grad_norm": 2.0006868839263916,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.426658152602613,
"reward_std": 0.11621210724115372,
"rewards/accuracy_reward": 0.426658152602613,
"step": 7
},
{
"completion_length": 776.862865447998,
"epoch": 0.08957312806158152,
"grad_norm": 2.4789130687713623,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3558673383668065,
"reward_std": 0.12130605196580291,
"rewards/accuracy_reward": 0.3558673383668065,
"step": 8
},
{
"completion_length": 778.5867118835449,
"epoch": 0.10076976906927922,
"grad_norm": 1.8755781650543213,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.41581631638109684,
"reward_std": 0.10032712062820792,
"rewards/accuracy_reward": 0.41581631638109684,
"step": 9
},
{
"completion_length": 749.0809860229492,
"epoch": 0.11196641007697691,
"grad_norm": 2.071565628051758,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4030612176284194,
"reward_std": 0.11739562568254769,
"rewards/accuracy_reward": 0.4030612176284194,
"step": 10
},
{
"completion_length": 814.1294441223145,
"epoch": 0.1231630510846746,
"grad_norm": 5.151350021362305,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.37436224054545164,
"reward_std": 0.14050176995806396,
"rewards/accuracy_reward": 0.37436224054545164,
"step": 11
},
{
"completion_length": 757.7876129150391,
"epoch": 0.13435969209237228,
"grad_norm": 1.5203074216842651,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.41709182877093554,
"reward_std": 0.1278091778513044,
"rewards/accuracy_reward": 0.41709182877093554,
"step": 12
},
{
"completion_length": 777.004451751709,
"epoch": 0.14555633310007,
"grad_norm": 2.49013614654541,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.42410713247954845,
"reward_std": 0.13054731115698814,
"rewards/accuracy_reward": 0.42410713247954845,
"step": 13
},
{
"completion_length": 757.3265151977539,
"epoch": 0.15675297410776767,
"grad_norm": 3.2910609245300293,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.38201529905200005,
"reward_std": 0.10587374167516828,
"rewards/accuracy_reward": 0.38201529905200005,
"step": 14
},
{
"completion_length": 806.7212867736816,
"epoch": 0.16794961511546536,
"grad_norm": 3.803057909011841,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3826530510559678,
"reward_std": 0.1126359230838716,
"rewards/accuracy_reward": 0.3826530510559678,
"step": 15
},
{
"completion_length": 783.6211624145508,
"epoch": 0.17914625612316304,
"grad_norm": 5.312828540802002,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3998724380508065,
"reward_std": 0.1370647200383246,
"rewards/accuracy_reward": 0.3998724380508065,
"step": 16
},
{
"completion_length": 751.4138832092285,
"epoch": 0.19034289713086075,
"grad_norm": 5.630074977874756,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3985969312489033,
"reward_std": 0.1044071288779378,
"rewards/accuracy_reward": 0.3985969312489033,
"step": 17
},
{
"completion_length": 740.3303375244141,
"epoch": 0.20153953813855843,
"grad_norm": 3.4527852535247803,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.45153060369193554,
"reward_std": 0.13771249912679195,
"rewards/accuracy_reward": 0.45153060369193554,
"step": 18
},
{
"completion_length": 762.7110862731934,
"epoch": 0.21273617914625612,
"grad_norm": 2.000288724899292,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.44005101174116135,
"reward_std": 0.11361152515746653,
"rewards/accuracy_reward": 0.44005101174116135,
"step": 19
},
{
"completion_length": 754.8182258605957,
"epoch": 0.22393282015395383,
"grad_norm": 2.939009189605713,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4623724361881614,
"reward_std": 0.1443291292525828,
"rewards/accuracy_reward": 0.4623724361881614,
"step": 20
},
{
"completion_length": 804.9897804260254,
"epoch": 0.2351294611616515,
"grad_norm": 3.3701393604278564,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.42219387367367744,
"reward_std": 0.12525973934680223,
"rewards/accuracy_reward": 0.42219387367367744,
"step": 21
},
{
"completion_length": 760.9719276428223,
"epoch": 0.2463261021693492,
"grad_norm": 4.5709004402160645,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4604591690003872,
"reward_std": 0.13864337070845068,
"rewards/accuracy_reward": 0.4604591690003872,
"step": 22
},
{
"completion_length": 747.7378578186035,
"epoch": 0.2575227431770469,
"grad_norm": 4.603999137878418,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4559948891401291,
"reward_std": 0.11444317712448537,
"rewards/accuracy_reward": 0.4559948891401291,
"step": 23
},
{
"completion_length": 776.5950050354004,
"epoch": 0.26871938418474456,
"grad_norm": 3.4199488162994385,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.38711733650416136,
"reward_std": 0.12604024447500706,
"rewards/accuracy_reward": 0.38711733650416136,
"step": 24
},
{
"completion_length": 805.4457778930664,
"epoch": 0.27991602519244224,
"grad_norm": 2.1585614681243896,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.34757652413100004,
"reward_std": 0.09647424682043493,
"rewards/accuracy_reward": 0.34757652413100004,
"step": 25
},
{
"completion_length": 756.6740798950195,
"epoch": 0.29111266620014,
"grad_norm": 4.483582973480225,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4100765222683549,
"reward_std": 0.11235282756388187,
"rewards/accuracy_reward": 0.4100765222683549,
"step": 26
},
{
"completion_length": 783.984676361084,
"epoch": 0.30230930720783766,
"grad_norm": 4.353560447692871,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4081632560119033,
"reward_std": 0.12413217849098146,
"rewards/accuracy_reward": 0.4081632560119033,
"step": 27
},
{
"completion_length": 785.7461547851562,
"epoch": 0.31350594821553535,
"grad_norm": 2.4794628620147705,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3628826476633549,
"reward_std": 0.0991243754979223,
"rewards/accuracy_reward": 0.3628826476633549,
"step": 28
},
{
"completion_length": 730.373706817627,
"epoch": 0.32470258922323303,
"grad_norm": 4.818853855133057,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.48596937395632267,
"reward_std": 0.14916561311110854,
"rewards/accuracy_reward": 0.48596937395632267,
"step": 29
},
{
"completion_length": 798.7933502197266,
"epoch": 0.3358992302309307,
"grad_norm": 4.875724792480469,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.35650509130209684,
"reward_std": 0.13356371596455574,
"rewards/accuracy_reward": 0.35650509130209684,
"step": 30
},
{
"completion_length": 796.2346839904785,
"epoch": 0.3470958712386284,
"grad_norm": 10.593175888061523,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3966836603358388,
"reward_std": 0.1197371541056782,
"rewards/accuracy_reward": 0.3966836603358388,
"step": 31
},
{
"completion_length": 770.2442512512207,
"epoch": 0.3582925122463261,
"grad_norm": 2.9329323768615723,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4113520346581936,
"reward_std": 0.10089330864138901,
"rewards/accuracy_reward": 0.4113520346581936,
"step": 32
},
{
"completion_length": 775.821418762207,
"epoch": 0.3694891532540238,
"grad_norm": 12.537503242492676,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4483418297022581,
"reward_std": 0.125624421518296,
"rewards/accuracy_reward": 0.4483418297022581,
"step": 33
},
{
"completion_length": 783.8558540344238,
"epoch": 0.3806857942617215,
"grad_norm": 4.941224575042725,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.40752549935132265,
"reward_std": 0.1238795283716172,
"rewards/accuracy_reward": 0.40752549935132265,
"step": 34
},
{
"completion_length": 835.3316078186035,
"epoch": 0.3918824352694192,
"grad_norm": 4.293496608734131,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.36670917458832264,
"reward_std": 0.13812832674011588,
"rewards/accuracy_reward": 0.36670917458832264,
"step": 35
},
{
"completion_length": 800.0905456542969,
"epoch": 0.40307907627711687,
"grad_norm": 19.928516387939453,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4234693804755807,
"reward_std": 0.11749003268778324,
"rewards/accuracy_reward": 0.4234693804755807,
"step": 36
},
{
"completion_length": 767.5561103820801,
"epoch": 0.41427571728481455,
"grad_norm": 6.884653568267822,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4445152971893549,
"reward_std": 0.09231905196793377,
"rewards/accuracy_reward": 0.4445152971893549,
"step": 37
},
{
"completion_length": 805.4872283935547,
"epoch": 0.42547235829251223,
"grad_norm": 24.017255783081055,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.40114795323461294,
"reward_std": 0.10263178893364966,
"rewards/accuracy_reward": 0.40114795323461294,
"step": 38
},
{
"completion_length": 801.1536827087402,
"epoch": 0.4366689993002099,
"grad_norm": 5.969362258911133,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.39604590833187103,
"reward_std": 0.10426798998378217,
"rewards/accuracy_reward": 0.39604590833187103,
"step": 39
},
{
"completion_length": 778.0669441223145,
"epoch": 0.44786564030790765,
"grad_norm": 7.667646884918213,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.44068876653909683,
"reward_std": 0.13503032876178622,
"rewards/accuracy_reward": 0.44068876653909683,
"step": 40
},
{
"completion_length": 777.1989631652832,
"epoch": 0.45906228131560534,
"grad_norm": 8.530767440795898,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.45854590740054846,
"reward_std": 0.1331926230341196,
"rewards/accuracy_reward": 0.45854590740054846,
"step": 41
},
{
"completion_length": 709.0452651977539,
"epoch": 0.470258922323303,
"grad_norm": 2.948634386062622,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4279336668550968,
"reward_std": 0.13054730370640755,
"rewards/accuracy_reward": 0.4279336668550968,
"step": 42
},
{
"completion_length": 812.6842880249023,
"epoch": 0.4814555633310007,
"grad_norm": 63.21303939819336,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4056122303009033,
"reward_std": 0.11122526740655303,
"rewards/accuracy_reward": 0.4056122303009033,
"step": 43
},
{
"completion_length": 794.6052093505859,
"epoch": 0.4926522043386984,
"grad_norm": 36.811729431152344,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4387754984200001,
"reward_std": 0.11845282535068691,
"rewards/accuracy_reward": 0.4387754984200001,
"step": 44
},
{
"completion_length": 774.257640838623,
"epoch": 0.5038488453463961,
"grad_norm": 15.306803703308105,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4419642761349678,
"reward_std": 0.12792909424751997,
"rewards/accuracy_reward": 0.4419642761349678,
"step": 45
},
{
"completion_length": 809.9763870239258,
"epoch": 0.5150454863540938,
"grad_norm": 18.062774658203125,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.39158162381500006,
"reward_std": 0.1342562234494835,
"rewards/accuracy_reward": 0.39158162381500006,
"step": 46
},
{
"completion_length": 801.1211700439453,
"epoch": 0.5262421273617914,
"grad_norm": 14.511787414550781,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.40752550307661295,
"reward_std": 0.13466564589180052,
"rewards/accuracy_reward": 0.40752550307661295,
"step": 47
},
{
"completion_length": 799.5650329589844,
"epoch": 0.5374387683694891,
"grad_norm": 11.552735328674316,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3839285643771291,
"reward_std": 0.12077671871520579,
"rewards/accuracy_reward": 0.3839285643771291,
"step": 48
},
{
"completion_length": 796.6683540344238,
"epoch": 0.5486354093771868,
"grad_norm": 13.472460746765137,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3966836668550968,
"reward_std": 0.12616657256148756,
"rewards/accuracy_reward": 0.3966836668550968,
"step": 49
},
{
"completion_length": 750.8067474365234,
"epoch": 0.5598320503848845,
"grad_norm": 13.930930137634277,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4438775470480323,
"reward_std": 0.1101488508284092,
"rewards/accuracy_reward": 0.4438775470480323,
"step": 50
},
{
"completion_length": 780.1970520019531,
"epoch": 0.5710286913925823,
"grad_norm": 5.438718795776367,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4017857061699033,
"reward_std": 0.09486208576709032,
"rewards/accuracy_reward": 0.4017857061699033,
"step": 51
},
{
"completion_length": 772.9202728271484,
"epoch": 0.58222533240028,
"grad_norm": 6.467463493347168,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.39795918203890324,
"reward_std": 0.11394576611928642,
"rewards/accuracy_reward": 0.39795918203890324,
"step": 52
},
{
"completion_length": 751.3756217956543,
"epoch": 0.5934219734079776,
"grad_norm": 7.1286139488220215,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.5063775442540646,
"reward_std": 0.152797756716609,
"rewards/accuracy_reward": 0.5063775442540646,
"step": 53
},
{
"completion_length": 773.2212867736816,
"epoch": 0.6046186144156753,
"grad_norm": 5.8168840408325195,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3852040721103549,
"reward_std": 0.11522368853911757,
"rewards/accuracy_reward": 0.3852040721103549,
"step": 54
},
{
"completion_length": 720.9132423400879,
"epoch": 0.615815255423373,
"grad_norm": 4.343114852905273,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4336734600365162,
"reward_std": 0.10637756483629346,
"rewards/accuracy_reward": 0.4336734600365162,
"step": 55
},
{
"completion_length": 769.654956817627,
"epoch": 0.6270118964310707,
"grad_norm": 4.029993534088135,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4202806008979678,
"reward_std": 0.13102709129452705,
"rewards/accuracy_reward": 0.4202806008979678,
"step": 56
},
{
"completion_length": 750.3571243286133,
"epoch": 0.6382085374387684,
"grad_norm": 2.320124864578247,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4547193767502904,
"reward_std": 0.12525333184748888,
"rewards/accuracy_reward": 0.4547193767502904,
"step": 57
},
{
"completion_length": 756.4540672302246,
"epoch": 0.6494051784464661,
"grad_norm": 5.077786922454834,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.42793366126716137,
"reward_std": 0.1169798003975302,
"rewards/accuracy_reward": 0.42793366126716137,
"step": 58
},
{
"completion_length": 741.6307258605957,
"epoch": 0.6606018194541637,
"grad_norm": 3.857607364654541,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.398596934042871,
"reward_std": 0.11093576485291123,
"rewards/accuracy_reward": 0.398596934042871,
"step": 59
},
{
"completion_length": 779.0318717956543,
"epoch": 0.6717984604618614,
"grad_norm": 2.3024513721466064,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.40624999161809683,
"reward_std": 0.11923973984085023,
"rewards/accuracy_reward": 0.40624999161809683,
"step": 60
},
{
"completion_length": 814.8584022521973,
"epoch": 0.6829951014695591,
"grad_norm": 4.7456374168396,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.36415816005319357,
"reward_std": 0.13503673416562378,
"rewards/accuracy_reward": 0.36415816005319357,
"step": 61
},
{
"completion_length": 744.0248603820801,
"epoch": 0.6941917424772568,
"grad_norm": 7.634659767150879,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4336734553799033,
"reward_std": 0.13652257318608463,
"rewards/accuracy_reward": 0.4336734553799033,
"step": 62
},
{
"completion_length": 748.6332778930664,
"epoch": 0.7053883834849545,
"grad_norm": 5.460710525512695,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.45854590460658073,
"reward_std": 0.10201446153223515,
"rewards/accuracy_reward": 0.45854590460658073,
"step": 63
},
{
"completion_length": 766.7691230773926,
"epoch": 0.7165850244926522,
"grad_norm": 10.574377059936523,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.43558672722429037,
"reward_std": 0.11734448280185461,
"rewards/accuracy_reward": 0.43558672722429037,
"step": 64
},
{
"completion_length": 774.1874847412109,
"epoch": 0.72778166550035,
"grad_norm": 13.465957641601562,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4088010136038065,
"reward_std": 0.1347600498702377,
"rewards/accuracy_reward": 0.4088010136038065,
"step": 65
},
{
"completion_length": 767.0720520019531,
"epoch": 0.7389783065080476,
"grad_norm": 18.965850830078125,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3392857080325484,
"reward_std": 0.10657906858250499,
"rewards/accuracy_reward": 0.3392857080325484,
"step": 66
},
{
"completion_length": 743.1543273925781,
"epoch": 0.7501749475157453,
"grad_norm": 4.266128063201904,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4547193758189678,
"reward_std": 0.14951107138767838,
"rewards/accuracy_reward": 0.4547193758189678,
"step": 67
},
{
"completion_length": 746.9189910888672,
"epoch": 0.761371588523443,
"grad_norm": 7.374969959259033,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.43048468325287104,
"reward_std": 0.13727903924882412,
"rewards/accuracy_reward": 0.43048468325287104,
"step": 68
},
{
"completion_length": 772.3635101318359,
"epoch": 0.7725682295311407,
"grad_norm": 2.557593822479248,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4036989789456129,
"reward_std": 0.12583233416080475,
"rewards/accuracy_reward": 0.4036989789456129,
"step": 69
},
{
"completion_length": 725.642204284668,
"epoch": 0.7837648705388384,
"grad_norm": 2.71532940864563,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.45854590460658073,
"reward_std": 0.13771890476346016,
"rewards/accuracy_reward": 0.45854590460658073,
"step": 70
},
{
"completion_length": 762.9929695129395,
"epoch": 0.794961511546536,
"grad_norm": 6.598220348358154,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.38392856158316135,
"reward_std": 0.1016802228987217,
"rewards/accuracy_reward": 0.38392856158316135,
"step": 71
},
{
"completion_length": 734.3813591003418,
"epoch": 0.8061581525542337,
"grad_norm": 4.22649621963501,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4272959092631936,
"reward_std": 0.09739230386912823,
"rewards/accuracy_reward": 0.4272959092631936,
"step": 72
},
{
"completion_length": 756.7136306762695,
"epoch": 0.8173547935619314,
"grad_norm": 5.3303704261779785,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4713010126724839,
"reward_std": 0.13973407400771976,
"rewards/accuracy_reward": 0.4713010126724839,
"step": 73
},
{
"completion_length": 764.8596725463867,
"epoch": 0.8285514345696291,
"grad_norm": 4.296087265014648,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.37436223961412907,
"reward_std": 0.1403882629238069,
"rewards/accuracy_reward": 0.37436223961412907,
"step": 74
},
{
"completion_length": 703.3316268920898,
"epoch": 0.8397480755773268,
"grad_norm": 6.208092212677002,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4279336668550968,
"reward_std": 0.09254459687508643,
"rewards/accuracy_reward": 0.4279336668550968,
"step": 75
},
{
"completion_length": 816.643482208252,
"epoch": 0.8509447165850245,
"grad_norm": 5.470515727996826,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4081632560119033,
"reward_std": 0.12125490978360176,
"rewards/accuracy_reward": 0.4081632560119033,
"step": 76
},
{
"completion_length": 741.5395240783691,
"epoch": 0.8621413575927221,
"grad_norm": 7.999130725860596,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4687499925494194,
"reward_std": 0.12569960486143827,
"rewards/accuracy_reward": 0.4687499925494194,
"step": 77
},
{
"completion_length": 758.1237106323242,
"epoch": 0.8733379986004198,
"grad_norm": 14.48597526550293,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3903061128221452,
"reward_std": 0.1084871394559741,
"rewards/accuracy_reward": 0.3903061128221452,
"step": 78
},
{
"completion_length": 761.9495964050293,
"epoch": 0.8845346396081175,
"grad_norm": 9.992022514343262,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.3998724417760968,
"reward_std": 0.10686216223984957,
"rewards/accuracy_reward": 0.3998724417760968,
"step": 79
},
{
"completion_length": 755.2149085998535,
"epoch": 0.8957312806158153,
"grad_norm": 1.8447043895721436,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.422193868085742,
"reward_std": 0.12513341289013624,
"rewards/accuracy_reward": 0.422193868085742,
"step": 80
},
{
"completion_length": 787.5254936218262,
"epoch": 0.906927921623513,
"grad_norm": 2.829308271408081,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.45408162102103233,
"reward_std": 0.11606655921787024,
"rewards/accuracy_reward": 0.45408162102103233,
"step": 81
},
{
"completion_length": 750.6084022521973,
"epoch": 0.9181245626312107,
"grad_norm": 9.926461219787598,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.41709183249622583,
"reward_std": 0.11887505534105003,
"rewards/accuracy_reward": 0.41709183249622583,
"step": 82
},
{
"completion_length": 791.3246002197266,
"epoch": 0.9293212036389084,
"grad_norm": 8.353261947631836,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.37882652413100004,
"reward_std": 0.0963351079262793,
"rewards/accuracy_reward": 0.37882652413100004,
"step": 83
},
{
"completion_length": 731.2653007507324,
"epoch": 0.940517844646606,
"grad_norm": 54.08208465576172,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.415816318243742,
"reward_std": 0.14536869549192488,
"rewards/accuracy_reward": 0.415816318243742,
"step": 84
},
{
"completion_length": 743.2423324584961,
"epoch": 0.9517144856543037,
"grad_norm": 3.901226043701172,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.4451530482620001,
"reward_std": 0.1309119921643287,
"rewards/accuracy_reward": 0.4451530482620001,
"step": 85
},
{
"completion_length": 751.2391338348389,
"epoch": 0.9629111266620014,
"grad_norm": 5.942202568054199,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.43367346189916134,
"reward_std": 0.10790172568522394,
"rewards/accuracy_reward": 0.43367346189916134,
"step": 86
},
{
"completion_length": 719.3335266113281,
"epoch": 0.9741077676696991,
"grad_norm": 5.666225433349609,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.44451529532670975,
"reward_std": 0.11002893466502428,
"rewards/accuracy_reward": 0.44451529532670975,
"step": 87
},
{
"completion_length": 752.9968070983887,
"epoch": 0.9853044086773968,
"grad_norm": 10.701108932495117,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.411352033726871,
"reward_std": 0.11254792334511876,
"rewards/accuracy_reward": 0.411352033726871,
"step": 88
},
{
"completion_length": 763.3092956542969,
"epoch": 0.9965010496850945,
"grad_norm": 7.832119941711426,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.40051019471138716,
"reward_std": 0.10468381433747709,
"rewards/accuracy_reward": 0.40051019471138716,
"step": 89
},
{
"epoch": 0.9965010496850945,
"step": 89,
"total_flos": 0.0,
"train_loss": 2.155859272374791e-08,
"train_runtime": 56373.7111,
"train_samples_per_second": 0.355,
"train_steps_per_second": 0.002
}
],
"logging_steps": 1,
"max_steps": 89,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}