{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9965010496850945, "eval_steps": 100, "global_step": 89, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 776.6575088500977, "epoch": 0.01119664100769769, "grad_norm": 1.4259217977523804, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4336734637618065, "reward_std": 0.13708234671503305, "rewards/accuracy_reward": 0.4336734637618065, "step": 1 }, { "completion_length": 775.5478172302246, "epoch": 0.02239328201539538, "grad_norm": 1.6899404525756836, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3609693795442581, "reward_std": 0.12760126357898116, "rewards/accuracy_reward": 0.3609693795442581, "step": 2 }, { "completion_length": 770.4444999694824, "epoch": 0.03358992302309307, "grad_norm": 1.1099838018417358, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.41772958543151617, "reward_std": 0.0995770595036447, "rewards/accuracy_reward": 0.41772958543151617, "step": 3 }, { "completion_length": 763.253173828125, "epoch": 0.04478656403079076, "grad_norm": 1.734800100326538, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.42602039594203234, "reward_std": 0.10463267145678401, "rewards/accuracy_reward": 0.42602039594203234, "step": 4 }, { "completion_length": 728.0452613830566, "epoch": 0.05598320503848846, "grad_norm": 1.9766837358474731, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.412627543322742, "reward_std": 0.1070892985444516, "rewards/accuracy_reward": 0.412627543322742, "step": 5 }, { "completion_length": 771.2640113830566, "epoch": 0.06717984604618614, "grad_norm": 1.5028815269470215, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3877550968900323, "reward_std": 0.11859837244264781, "rewards/accuracy_reward": 0.3877550968900323, "step": 6 }, { "completion_length": 744.927921295166, "epoch": 0.07837648705388384, "grad_norm": 2.0006868839263916, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.426658152602613, "reward_std": 0.11621210724115372, "rewards/accuracy_reward": 0.426658152602613, "step": 7 }, { "completion_length": 776.862865447998, "epoch": 0.08957312806158152, "grad_norm": 2.4789130687713623, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3558673383668065, "reward_std": 0.12130605196580291, "rewards/accuracy_reward": 0.3558673383668065, "step": 8 }, { "completion_length": 778.5867118835449, "epoch": 0.10076976906927922, "grad_norm": 1.8755781650543213, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.41581631638109684, "reward_std": 0.10032712062820792, "rewards/accuracy_reward": 0.41581631638109684, "step": 9 }, { "completion_length": 749.0809860229492, "epoch": 0.11196641007697691, "grad_norm": 2.071565628051758, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4030612176284194, "reward_std": 0.11739562568254769, "rewards/accuracy_reward": 0.4030612176284194, "step": 10 }, { "completion_length": 814.1294441223145, "epoch": 0.1231630510846746, "grad_norm": 5.151350021362305, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.37436224054545164, "reward_std": 0.14050176995806396, "rewards/accuracy_reward": 0.37436224054545164, "step": 11 }, { "completion_length": 757.7876129150391, "epoch": 0.13435969209237228, "grad_norm": 1.5203074216842651, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.41709182877093554, "reward_std": 0.1278091778513044, "rewards/accuracy_reward": 0.41709182877093554, "step": 12 }, { "completion_length": 777.004451751709, "epoch": 0.14555633310007, "grad_norm": 2.49013614654541, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.42410713247954845, "reward_std": 0.13054731115698814, "rewards/accuracy_reward": 0.42410713247954845, "step": 13 }, { "completion_length": 757.3265151977539, "epoch": 0.15675297410776767, "grad_norm": 3.2910609245300293, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.38201529905200005, "reward_std": 0.10587374167516828, "rewards/accuracy_reward": 0.38201529905200005, "step": 14 }, { "completion_length": 806.7212867736816, "epoch": 0.16794961511546536, "grad_norm": 3.803057909011841, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3826530510559678, "reward_std": 0.1126359230838716, "rewards/accuracy_reward": 0.3826530510559678, "step": 15 }, { "completion_length": 783.6211624145508, "epoch": 0.17914625612316304, "grad_norm": 5.312828540802002, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3998724380508065, "reward_std": 0.1370647200383246, "rewards/accuracy_reward": 0.3998724380508065, "step": 16 }, { "completion_length": 751.4138832092285, "epoch": 0.19034289713086075, "grad_norm": 5.630074977874756, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3985969312489033, "reward_std": 0.1044071288779378, "rewards/accuracy_reward": 0.3985969312489033, "step": 17 }, { "completion_length": 740.3303375244141, "epoch": 0.20153953813855843, "grad_norm": 3.4527852535247803, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.45153060369193554, "reward_std": 0.13771249912679195, "rewards/accuracy_reward": 0.45153060369193554, "step": 18 }, { "completion_length": 762.7110862731934, "epoch": 0.21273617914625612, "grad_norm": 2.000288724899292, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.44005101174116135, "reward_std": 0.11361152515746653, "rewards/accuracy_reward": 0.44005101174116135, "step": 19 }, { "completion_length": 754.8182258605957, "epoch": 0.22393282015395383, "grad_norm": 2.939009189605713, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4623724361881614, "reward_std": 0.1443291292525828, "rewards/accuracy_reward": 0.4623724361881614, "step": 20 }, { "completion_length": 804.9897804260254, "epoch": 0.2351294611616515, "grad_norm": 3.3701393604278564, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.42219387367367744, "reward_std": 0.12525973934680223, "rewards/accuracy_reward": 0.42219387367367744, "step": 21 }, { "completion_length": 760.9719276428223, "epoch": 0.2463261021693492, "grad_norm": 4.5709004402160645, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4604591690003872, "reward_std": 0.13864337070845068, "rewards/accuracy_reward": 0.4604591690003872, "step": 22 }, { "completion_length": 747.7378578186035, "epoch": 0.2575227431770469, "grad_norm": 4.603999137878418, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4559948891401291, "reward_std": 0.11444317712448537, "rewards/accuracy_reward": 0.4559948891401291, "step": 23 }, { "completion_length": 776.5950050354004, "epoch": 0.26871938418474456, "grad_norm": 3.4199488162994385, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.38711733650416136, "reward_std": 0.12604024447500706, "rewards/accuracy_reward": 0.38711733650416136, "step": 24 }, { "completion_length": 805.4457778930664, "epoch": 0.27991602519244224, "grad_norm": 2.1585614681243896, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.34757652413100004, "reward_std": 0.09647424682043493, "rewards/accuracy_reward": 0.34757652413100004, "step": 25 }, { "completion_length": 756.6740798950195, "epoch": 0.29111266620014, "grad_norm": 4.483582973480225, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4100765222683549, "reward_std": 0.11235282756388187, "rewards/accuracy_reward": 0.4100765222683549, "step": 26 }, { "completion_length": 783.984676361084, "epoch": 0.30230930720783766, "grad_norm": 4.353560447692871, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4081632560119033, "reward_std": 0.12413217849098146, "rewards/accuracy_reward": 0.4081632560119033, "step": 27 }, { "completion_length": 785.7461547851562, "epoch": 0.31350594821553535, "grad_norm": 2.4794628620147705, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3628826476633549, "reward_std": 0.0991243754979223, "rewards/accuracy_reward": 0.3628826476633549, "step": 28 }, { "completion_length": 730.373706817627, "epoch": 0.32470258922323303, "grad_norm": 4.818853855133057, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.48596937395632267, "reward_std": 0.14916561311110854, "rewards/accuracy_reward": 0.48596937395632267, "step": 29 }, { "completion_length": 798.7933502197266, "epoch": 0.3358992302309307, "grad_norm": 4.875724792480469, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.35650509130209684, "reward_std": 0.13356371596455574, "rewards/accuracy_reward": 0.35650509130209684, "step": 30 }, { "completion_length": 796.2346839904785, "epoch": 0.3470958712386284, "grad_norm": 10.593175888061523, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3966836603358388, "reward_std": 0.1197371541056782, "rewards/accuracy_reward": 0.3966836603358388, "step": 31 }, { "completion_length": 770.2442512512207, "epoch": 0.3582925122463261, "grad_norm": 2.9329323768615723, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4113520346581936, "reward_std": 0.10089330864138901, "rewards/accuracy_reward": 0.4113520346581936, "step": 32 }, { "completion_length": 775.821418762207, "epoch": 0.3694891532540238, "grad_norm": 12.537503242492676, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4483418297022581, "reward_std": 0.125624421518296, "rewards/accuracy_reward": 0.4483418297022581, "step": 33 }, { "completion_length": 783.8558540344238, "epoch": 0.3806857942617215, "grad_norm": 4.941224575042725, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.40752549935132265, "reward_std": 0.1238795283716172, "rewards/accuracy_reward": 0.40752549935132265, "step": 34 }, { "completion_length": 835.3316078186035, "epoch": 0.3918824352694192, "grad_norm": 4.293496608734131, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.36670917458832264, "reward_std": 0.13812832674011588, "rewards/accuracy_reward": 0.36670917458832264, "step": 35 }, { "completion_length": 800.0905456542969, "epoch": 0.40307907627711687, "grad_norm": 19.928516387939453, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4234693804755807, "reward_std": 0.11749003268778324, "rewards/accuracy_reward": 0.4234693804755807, "step": 36 }, { "completion_length": 767.5561103820801, "epoch": 0.41427571728481455, "grad_norm": 6.884653568267822, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4445152971893549, "reward_std": 0.09231905196793377, "rewards/accuracy_reward": 0.4445152971893549, "step": 37 }, { "completion_length": 805.4872283935547, "epoch": 0.42547235829251223, "grad_norm": 24.017255783081055, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.40114795323461294, "reward_std": 0.10263178893364966, "rewards/accuracy_reward": 0.40114795323461294, "step": 38 }, { "completion_length": 801.1536827087402, "epoch": 0.4366689993002099, "grad_norm": 5.969362258911133, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.39604590833187103, "reward_std": 0.10426798998378217, "rewards/accuracy_reward": 0.39604590833187103, "step": 39 }, { "completion_length": 778.0669441223145, "epoch": 0.44786564030790765, "grad_norm": 7.667646884918213, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.44068876653909683, "reward_std": 0.13503032876178622, "rewards/accuracy_reward": 0.44068876653909683, "step": 40 }, { "completion_length": 777.1989631652832, "epoch": 0.45906228131560534, "grad_norm": 8.530767440795898, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.45854590740054846, "reward_std": 0.1331926230341196, "rewards/accuracy_reward": 0.45854590740054846, "step": 41 }, { "completion_length": 709.0452651977539, "epoch": 0.470258922323303, "grad_norm": 2.948634386062622, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4279336668550968, "reward_std": 0.13054730370640755, "rewards/accuracy_reward": 0.4279336668550968, "step": 42 }, { "completion_length": 812.6842880249023, "epoch": 0.4814555633310007, "grad_norm": 63.21303939819336, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4056122303009033, "reward_std": 0.11122526740655303, "rewards/accuracy_reward": 0.4056122303009033, "step": 43 }, { "completion_length": 794.6052093505859, "epoch": 0.4926522043386984, "grad_norm": 36.811729431152344, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4387754984200001, "reward_std": 0.11845282535068691, "rewards/accuracy_reward": 0.4387754984200001, "step": 44 }, { "completion_length": 774.257640838623, "epoch": 0.5038488453463961, "grad_norm": 15.306803703308105, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4419642761349678, "reward_std": 0.12792909424751997, "rewards/accuracy_reward": 0.4419642761349678, "step": 45 }, { "completion_length": 809.9763870239258, "epoch": 0.5150454863540938, "grad_norm": 18.062774658203125, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.39158162381500006, "reward_std": 0.1342562234494835, "rewards/accuracy_reward": 0.39158162381500006, "step": 46 }, { "completion_length": 801.1211700439453, "epoch": 0.5262421273617914, "grad_norm": 14.511787414550781, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.40752550307661295, "reward_std": 0.13466564589180052, "rewards/accuracy_reward": 0.40752550307661295, "step": 47 }, { "completion_length": 799.5650329589844, "epoch": 0.5374387683694891, "grad_norm": 11.552735328674316, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3839285643771291, "reward_std": 0.12077671871520579, "rewards/accuracy_reward": 0.3839285643771291, "step": 48 }, { "completion_length": 796.6683540344238, "epoch": 0.5486354093771868, "grad_norm": 13.472460746765137, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3966836668550968, "reward_std": 0.12616657256148756, "rewards/accuracy_reward": 0.3966836668550968, "step": 49 }, { "completion_length": 750.8067474365234, "epoch": 0.5598320503848845, "grad_norm": 13.930930137634277, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4438775470480323, "reward_std": 0.1101488508284092, "rewards/accuracy_reward": 0.4438775470480323, "step": 50 }, { "completion_length": 780.1970520019531, "epoch": 0.5710286913925823, "grad_norm": 5.438718795776367, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4017857061699033, "reward_std": 0.09486208576709032, "rewards/accuracy_reward": 0.4017857061699033, "step": 51 }, { "completion_length": 772.9202728271484, "epoch": 0.58222533240028, "grad_norm": 6.467463493347168, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.39795918203890324, "reward_std": 0.11394576611928642, "rewards/accuracy_reward": 0.39795918203890324, "step": 52 }, { "completion_length": 751.3756217956543, "epoch": 0.5934219734079776, "grad_norm": 7.1286139488220215, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.5063775442540646, "reward_std": 0.152797756716609, "rewards/accuracy_reward": 0.5063775442540646, "step": 53 }, { "completion_length": 773.2212867736816, "epoch": 0.6046186144156753, "grad_norm": 5.8168840408325195, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3852040721103549, "reward_std": 0.11522368853911757, "rewards/accuracy_reward": 0.3852040721103549, "step": 54 }, { "completion_length": 720.9132423400879, "epoch": 0.615815255423373, "grad_norm": 4.343114852905273, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4336734600365162, "reward_std": 0.10637756483629346, "rewards/accuracy_reward": 0.4336734600365162, "step": 55 }, { "completion_length": 769.654956817627, "epoch": 0.6270118964310707, "grad_norm": 4.029993534088135, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4202806008979678, "reward_std": 0.13102709129452705, "rewards/accuracy_reward": 0.4202806008979678, "step": 56 }, { "completion_length": 750.3571243286133, "epoch": 0.6382085374387684, "grad_norm": 2.320124864578247, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4547193767502904, "reward_std": 0.12525333184748888, "rewards/accuracy_reward": 0.4547193767502904, "step": 57 }, { "completion_length": 756.4540672302246, "epoch": 0.6494051784464661, "grad_norm": 5.077786922454834, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.42793366126716137, "reward_std": 0.1169798003975302, "rewards/accuracy_reward": 0.42793366126716137, "step": 58 }, { "completion_length": 741.6307258605957, "epoch": 0.6606018194541637, "grad_norm": 3.857607364654541, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.398596934042871, "reward_std": 0.11093576485291123, "rewards/accuracy_reward": 0.398596934042871, "step": 59 }, { "completion_length": 779.0318717956543, "epoch": 0.6717984604618614, "grad_norm": 2.3024513721466064, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.40624999161809683, "reward_std": 0.11923973984085023, "rewards/accuracy_reward": 0.40624999161809683, "step": 60 }, { "completion_length": 814.8584022521973, "epoch": 0.6829951014695591, "grad_norm": 4.7456374168396, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.36415816005319357, "reward_std": 0.13503673416562378, "rewards/accuracy_reward": 0.36415816005319357, "step": 61 }, { "completion_length": 744.0248603820801, "epoch": 0.6941917424772568, "grad_norm": 7.634659767150879, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4336734553799033, "reward_std": 0.13652257318608463, "rewards/accuracy_reward": 0.4336734553799033, "step": 62 }, { "completion_length": 748.6332778930664, "epoch": 0.7053883834849545, "grad_norm": 5.460710525512695, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.45854590460658073, "reward_std": 0.10201446153223515, "rewards/accuracy_reward": 0.45854590460658073, "step": 63 }, { "completion_length": 766.7691230773926, "epoch": 0.7165850244926522, "grad_norm": 10.574377059936523, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.43558672722429037, "reward_std": 0.11734448280185461, "rewards/accuracy_reward": 0.43558672722429037, "step": 64 }, { "completion_length": 774.1874847412109, "epoch": 0.72778166550035, "grad_norm": 13.465957641601562, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4088010136038065, "reward_std": 0.1347600498702377, "rewards/accuracy_reward": 0.4088010136038065, "step": 65 }, { "completion_length": 767.0720520019531, "epoch": 0.7389783065080476, "grad_norm": 18.965850830078125, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3392857080325484, "reward_std": 0.10657906858250499, "rewards/accuracy_reward": 0.3392857080325484, "step": 66 }, { "completion_length": 743.1543273925781, "epoch": 0.7501749475157453, "grad_norm": 4.266128063201904, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4547193758189678, "reward_std": 0.14951107138767838, "rewards/accuracy_reward": 0.4547193758189678, "step": 67 }, { "completion_length": 746.9189910888672, "epoch": 0.761371588523443, "grad_norm": 7.374969959259033, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.43048468325287104, "reward_std": 0.13727903924882412, "rewards/accuracy_reward": 0.43048468325287104, "step": 68 }, { "completion_length": 772.3635101318359, "epoch": 0.7725682295311407, "grad_norm": 2.557593822479248, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4036989789456129, "reward_std": 0.12583233416080475, "rewards/accuracy_reward": 0.4036989789456129, "step": 69 }, { "completion_length": 725.642204284668, "epoch": 0.7837648705388384, "grad_norm": 2.71532940864563, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.45854590460658073, "reward_std": 0.13771890476346016, "rewards/accuracy_reward": 0.45854590460658073, "step": 70 }, { "completion_length": 762.9929695129395, "epoch": 0.794961511546536, "grad_norm": 6.598220348358154, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.38392856158316135, "reward_std": 0.1016802228987217, "rewards/accuracy_reward": 0.38392856158316135, "step": 71 }, { "completion_length": 734.3813591003418, "epoch": 0.8061581525542337, "grad_norm": 4.22649621963501, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4272959092631936, "reward_std": 0.09739230386912823, "rewards/accuracy_reward": 0.4272959092631936, "step": 72 }, { "completion_length": 756.7136306762695, "epoch": 0.8173547935619314, "grad_norm": 5.3303704261779785, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4713010126724839, "reward_std": 0.13973407400771976, "rewards/accuracy_reward": 0.4713010126724839, "step": 73 }, { "completion_length": 764.8596725463867, "epoch": 0.8285514345696291, "grad_norm": 4.296087265014648, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.37436223961412907, "reward_std": 0.1403882629238069, "rewards/accuracy_reward": 0.37436223961412907, "step": 74 }, { "completion_length": 703.3316268920898, "epoch": 0.8397480755773268, "grad_norm": 6.208092212677002, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4279336668550968, "reward_std": 0.09254459687508643, "rewards/accuracy_reward": 0.4279336668550968, "step": 75 }, { "completion_length": 816.643482208252, "epoch": 0.8509447165850245, "grad_norm": 5.470515727996826, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4081632560119033, "reward_std": 0.12125490978360176, "rewards/accuracy_reward": 0.4081632560119033, "step": 76 }, { "completion_length": 741.5395240783691, "epoch": 0.8621413575927221, "grad_norm": 7.999130725860596, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4687499925494194, "reward_std": 0.12569960486143827, "rewards/accuracy_reward": 0.4687499925494194, "step": 77 }, { "completion_length": 758.1237106323242, "epoch": 0.8733379986004198, "grad_norm": 14.48597526550293, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3903061128221452, "reward_std": 0.1084871394559741, "rewards/accuracy_reward": 0.3903061128221452, "step": 78 }, { "completion_length": 761.9495964050293, "epoch": 0.8845346396081175, "grad_norm": 9.992022514343262, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.3998724417760968, "reward_std": 0.10686216223984957, "rewards/accuracy_reward": 0.3998724417760968, "step": 79 }, { "completion_length": 755.2149085998535, "epoch": 0.8957312806158153, "grad_norm": 1.8447043895721436, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.422193868085742, "reward_std": 0.12513341289013624, "rewards/accuracy_reward": 0.422193868085742, "step": 80 }, { "completion_length": 787.5254936218262, "epoch": 0.906927921623513, "grad_norm": 2.829308271408081, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.45408162102103233, "reward_std": 0.11606655921787024, "rewards/accuracy_reward": 0.45408162102103233, "step": 81 }, { "completion_length": 750.6084022521973, "epoch": 0.9181245626312107, "grad_norm": 9.926461219787598, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.41709183249622583, "reward_std": 0.11887505534105003, "rewards/accuracy_reward": 0.41709183249622583, "step": 82 }, { "completion_length": 791.3246002197266, "epoch": 0.9293212036389084, "grad_norm": 8.353261947631836, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.37882652413100004, "reward_std": 0.0963351079262793, "rewards/accuracy_reward": 0.37882652413100004, "step": 83 }, { "completion_length": 731.2653007507324, "epoch": 0.940517844646606, "grad_norm": 54.08208465576172, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.415816318243742, "reward_std": 0.14536869549192488, "rewards/accuracy_reward": 0.415816318243742, "step": 84 }, { "completion_length": 743.2423324584961, "epoch": 0.9517144856543037, "grad_norm": 3.901226043701172, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.4451530482620001, "reward_std": 0.1309119921643287, "rewards/accuracy_reward": 0.4451530482620001, "step": 85 }, { "completion_length": 751.2391338348389, "epoch": 0.9629111266620014, "grad_norm": 5.942202568054199, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.43367346189916134, "reward_std": 0.10790172568522394, "rewards/accuracy_reward": 0.43367346189916134, "step": 86 }, { "completion_length": 719.3335266113281, "epoch": 0.9741077676696991, "grad_norm": 5.666225433349609, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.44451529532670975, "reward_std": 0.11002893466502428, "rewards/accuracy_reward": 0.44451529532670975, "step": 87 }, { "completion_length": 752.9968070983887, "epoch": 0.9853044086773968, "grad_norm": 10.701108932495117, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.411352033726871, "reward_std": 0.11254792334511876, "rewards/accuracy_reward": 0.411352033726871, "step": 88 }, { "completion_length": 763.3092956542969, "epoch": 0.9965010496850945, "grad_norm": 7.832119941711426, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.40051019471138716, "reward_std": 0.10468381433747709, "rewards/accuracy_reward": 0.40051019471138716, "step": 89 }, { "epoch": 0.9965010496850945, "step": 89, "total_flos": 0.0, "train_loss": 2.155859272374791e-08, "train_runtime": 56373.7111, "train_samples_per_second": 0.355, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 89, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }