{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2361, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021177467174925877, "grad_norm": 83.68173840087012, "learning_rate": 4.219409282700422e-07, "loss": 2.1661, "mean_token_accuracy": 0.5976044356822967, "step": 5 }, { "epoch": 0.004235493434985175, "grad_norm": 71.11062161153062, "learning_rate": 8.438818565400844e-07, "loss": 2.1622, "mean_token_accuracy": 0.5995522707700729, "step": 10 }, { "epoch": 0.0063532401524777635, "grad_norm": 34.572128830715705, "learning_rate": 1.2658227848101267e-06, "loss": 2.0632, "mean_token_accuracy": 0.5944636166095734, "step": 15 }, { "epoch": 0.00847098686997035, "grad_norm": 17.011229671312382, "learning_rate": 1.6877637130801689e-06, "loss": 1.7723, "mean_token_accuracy": 0.6232941240072251, "step": 20 }, { "epoch": 0.010588733587462939, "grad_norm": 9.74778063232665, "learning_rate": 2.1097046413502114e-06, "loss": 1.5731, "mean_token_accuracy": 0.63005710542202, "step": 25 }, { "epoch": 0.012706480304955527, "grad_norm": 7.914154804733739, "learning_rate": 2.5316455696202535e-06, "loss": 1.3488, "mean_token_accuracy": 0.6629315137863159, "step": 30 }, { "epoch": 0.014824227022448115, "grad_norm": 7.497303646153751, "learning_rate": 2.9535864978902956e-06, "loss": 1.1306, "mean_token_accuracy": 0.6912301659584046, "step": 35 }, { "epoch": 0.0169419737399407, "grad_norm": 3.0571735289067807, "learning_rate": 3.3755274261603377e-06, "loss": 0.9338, "mean_token_accuracy": 0.7339568644762039, "step": 40 }, { "epoch": 0.01905972045743329, "grad_norm": 3.7299236330227674, "learning_rate": 3.7974683544303802e-06, "loss": 0.8366, "mean_token_accuracy": 0.7472610950469971, "step": 45 }, { "epoch": 0.021177467174925878, "grad_norm": 1.94550213470537, "learning_rate": 4.219409282700423e-06, "loss": 0.813, "mean_token_accuracy": 0.7466759532690048, "step": 50 }, { "epoch": 0.023295213892418468, "grad_norm": 1.9631003842444021, "learning_rate": 4.641350210970465e-06, "loss": 0.7409, "mean_token_accuracy": 0.7591830432415009, "step": 55 }, { "epoch": 0.025412960609911054, "grad_norm": 2.126411421817186, "learning_rate": 5.063291139240507e-06, "loss": 0.6999, "mean_token_accuracy": 0.7653463244438171, "step": 60 }, { "epoch": 0.027530707327403644, "grad_norm": 2.431633067673993, "learning_rate": 5.485232067510548e-06, "loss": 0.6803, "mean_token_accuracy": 0.7705583959817887, "step": 65 }, { "epoch": 0.02964845404489623, "grad_norm": 2.1470463103072706, "learning_rate": 5.907172995780591e-06, "loss": 0.6716, "mean_token_accuracy": 0.7747641324996948, "step": 70 }, { "epoch": 0.03176620076238882, "grad_norm": 1.8134772555606546, "learning_rate": 6.329113924050634e-06, "loss": 0.6442, "mean_token_accuracy": 0.7782594561576843, "step": 75 }, { "epoch": 0.0338839474798814, "grad_norm": 1.828059395701887, "learning_rate": 6.751054852320675e-06, "loss": 0.654, "mean_token_accuracy": 0.7730626821517944, "step": 80 }, { "epoch": 0.03600169419737399, "grad_norm": 2.144653892020441, "learning_rate": 7.172995780590718e-06, "loss": 0.5866, "mean_token_accuracy": 0.7909977465867997, "step": 85 }, { "epoch": 0.03811944091486658, "grad_norm": 1.8471427217270429, "learning_rate": 7.5949367088607605e-06, "loss": 0.6343, "mean_token_accuracy": 0.7756380766630173, "step": 90 }, { "epoch": 0.04023718763235917, "grad_norm": 2.3290876362673916, "learning_rate": 8.016877637130802e-06, "loss": 0.6006, "mean_token_accuracy": 0.7818532437086105, "step": 95 }, { "epoch": 0.042354934349851756, "grad_norm": 2.038115409634727, "learning_rate": 8.438818565400846e-06, "loss": 0.6123, "mean_token_accuracy": 0.7774519264698029, "step": 100 }, { "epoch": 0.044472681067344345, "grad_norm": 1.8512017533544443, "learning_rate": 8.860759493670886e-06, "loss": 0.5782, "mean_token_accuracy": 0.787897452712059, "step": 105 }, { "epoch": 0.046590427784836935, "grad_norm": 1.7617335354592987, "learning_rate": 9.28270042194093e-06, "loss": 0.5646, "mean_token_accuracy": 0.7916492879390716, "step": 110 }, { "epoch": 0.04870817450232952, "grad_norm": 1.7594677346726018, "learning_rate": 9.704641350210972e-06, "loss": 0.5741, "mean_token_accuracy": 0.7923983573913574, "step": 115 }, { "epoch": 0.05082592121982211, "grad_norm": 2.074930275455498, "learning_rate": 1.0126582278481014e-05, "loss": 0.5872, "mean_token_accuracy": 0.7833609640598297, "step": 120 }, { "epoch": 0.0529436679373147, "grad_norm": 1.9672826514728756, "learning_rate": 1.0548523206751056e-05, "loss": 0.5525, "mean_token_accuracy": 0.7987953841686248, "step": 125 }, { "epoch": 0.05506141465480729, "grad_norm": 1.8715882057544604, "learning_rate": 1.0970464135021096e-05, "loss": 0.5825, "mean_token_accuracy": 0.788796243071556, "step": 130 }, { "epoch": 0.05717916137229987, "grad_norm": 2.055120934997949, "learning_rate": 1.139240506329114e-05, "loss": 0.5668, "mean_token_accuracy": 0.794330358505249, "step": 135 }, { "epoch": 0.05929690808979246, "grad_norm": 2.1751457050987213, "learning_rate": 1.1814345991561182e-05, "loss": 0.5504, "mean_token_accuracy": 0.7986414194107055, "step": 140 }, { "epoch": 0.06141465480728505, "grad_norm": 2.6530737679742735, "learning_rate": 1.2236286919831224e-05, "loss": 0.5825, "mean_token_accuracy": 0.787852120399475, "step": 145 }, { "epoch": 0.06353240152477764, "grad_norm": 1.7772714746257652, "learning_rate": 1.2658227848101268e-05, "loss": 0.5321, "mean_token_accuracy": 0.7960227221250534, "step": 150 }, { "epoch": 0.06565014824227022, "grad_norm": 2.007446829510951, "learning_rate": 1.3080168776371309e-05, "loss": 0.5652, "mean_token_accuracy": 0.7959463059902191, "step": 155 }, { "epoch": 0.0677678949597628, "grad_norm": 1.5240825493431345, "learning_rate": 1.350210970464135e-05, "loss": 0.5471, "mean_token_accuracy": 0.7939142495393753, "step": 160 }, { "epoch": 0.0698856416772554, "grad_norm": 2.1050082197685573, "learning_rate": 1.3924050632911395e-05, "loss": 0.5283, "mean_token_accuracy": 0.8012055605649948, "step": 165 }, { "epoch": 0.07200338839474799, "grad_norm": 1.8465069166680506, "learning_rate": 1.4345991561181437e-05, "loss": 0.5525, "mean_token_accuracy": 0.8044249504804611, "step": 170 }, { "epoch": 0.07412113511224058, "grad_norm": 1.6175686507972196, "learning_rate": 1.4767932489451477e-05, "loss": 0.5561, "mean_token_accuracy": 0.8003985285758972, "step": 175 }, { "epoch": 0.07623888182973317, "grad_norm": 1.987593615824268, "learning_rate": 1.5189873417721521e-05, "loss": 0.5637, "mean_token_accuracy": 0.7922172099351883, "step": 180 }, { "epoch": 0.07835662854722575, "grad_norm": 1.6436094243541917, "learning_rate": 1.5611814345991563e-05, "loss": 0.5662, "mean_token_accuracy": 0.7916066467761993, "step": 185 }, { "epoch": 0.08047437526471835, "grad_norm": 1.6236359809975813, "learning_rate": 1.6033755274261603e-05, "loss": 0.5432, "mean_token_accuracy": 0.8009026974439621, "step": 190 }, { "epoch": 0.08259212198221093, "grad_norm": 1.5274031981451834, "learning_rate": 1.6455696202531647e-05, "loss": 0.5484, "mean_token_accuracy": 0.7982567459344864, "step": 195 }, { "epoch": 0.08470986869970351, "grad_norm": 2.03018219248001, "learning_rate": 1.687763713080169e-05, "loss": 0.5407, "mean_token_accuracy": 0.8011936664581298, "step": 200 }, { "epoch": 0.08682761541719611, "grad_norm": 1.8014465384883518, "learning_rate": 1.729957805907173e-05, "loss": 0.5247, "mean_token_accuracy": 0.8084624141454697, "step": 205 }, { "epoch": 0.08894536213468869, "grad_norm": 1.7271306151793162, "learning_rate": 1.7721518987341772e-05, "loss": 0.5708, "mean_token_accuracy": 0.7947988003492356, "step": 210 }, { "epoch": 0.09106310885218127, "grad_norm": 1.9191596287336676, "learning_rate": 1.8143459915611816e-05, "loss": 0.5363, "mean_token_accuracy": 0.8047021597623825, "step": 215 }, { "epoch": 0.09318085556967387, "grad_norm": 1.7320574921982606, "learning_rate": 1.856540084388186e-05, "loss": 0.5658, "mean_token_accuracy": 0.7928981482982635, "step": 220 }, { "epoch": 0.09529860228716645, "grad_norm": 1.8013279459762328, "learning_rate": 1.89873417721519e-05, "loss": 0.5283, "mean_token_accuracy": 0.8064373075962067, "step": 225 }, { "epoch": 0.09741634900465904, "grad_norm": 1.6963994963949376, "learning_rate": 1.9409282700421944e-05, "loss": 0.541, "mean_token_accuracy": 0.798399469256401, "step": 230 }, { "epoch": 0.09953409572215163, "grad_norm": 1.8211709634602606, "learning_rate": 1.9831223628691984e-05, "loss": 0.5529, "mean_token_accuracy": 0.7973109990358352, "step": 235 }, { "epoch": 0.10165184243964422, "grad_norm": 1.800221940229098, "learning_rate": 1.9999901552991966e-05, "loss": 0.5297, "mean_token_accuracy": 0.8061769932508469, "step": 240 }, { "epoch": 0.10376958915713681, "grad_norm": 3.118430462846644, "learning_rate": 1.9999299939406875e-05, "loss": 0.567, "mean_token_accuracy": 0.7869770050048828, "step": 245 }, { "epoch": 0.1058873358746294, "grad_norm": 6.373265842936888, "learning_rate": 1.9998151437882874e-05, "loss": 0.5194, "mean_token_accuracy": 0.8079254150390625, "step": 250 }, { "epoch": 0.10800508259212198, "grad_norm": 1.7643542086224073, "learning_rate": 1.999645611123453e-05, "loss": 0.5476, "mean_token_accuracy": 0.8036489874124527, "step": 255 }, { "epoch": 0.11012282930961458, "grad_norm": 1.7570808876197173, "learning_rate": 1.999421405218369e-05, "loss": 0.5183, "mean_token_accuracy": 0.8039919883012772, "step": 260 }, { "epoch": 0.11224057602710716, "grad_norm": 1.4650600928842654, "learning_rate": 1.9991425383354462e-05, "loss": 0.5575, "mean_token_accuracy": 0.7989150047302246, "step": 265 }, { "epoch": 0.11435832274459974, "grad_norm": 1.5715508311626518, "learning_rate": 1.9988090257266442e-05, "loss": 0.5276, "mean_token_accuracy": 0.8024184852838516, "step": 270 }, { "epoch": 0.11647606946209234, "grad_norm": 1.5012575844730074, "learning_rate": 1.9984208856326433e-05, "loss": 0.511, "mean_token_accuracy": 0.810269170999527, "step": 275 }, { "epoch": 0.11859381617958492, "grad_norm": 2.1674114266205553, "learning_rate": 1.9979781392818424e-05, "loss": 0.5069, "mean_token_accuracy": 0.8049084335565567, "step": 280 }, { "epoch": 0.1207115628970775, "grad_norm": 1.597566985653751, "learning_rate": 1.9974808108892017e-05, "loss": 0.5097, "mean_token_accuracy": 0.810433080792427, "step": 285 }, { "epoch": 0.1228293096145701, "grad_norm": 2.721798223377223, "learning_rate": 1.9969289276549144e-05, "loss": 0.526, "mean_token_accuracy": 0.8058519691228867, "step": 290 }, { "epoch": 0.12494705633206268, "grad_norm": 1.526771766492988, "learning_rate": 1.9963225197629223e-05, "loss": 0.5172, "mean_token_accuracy": 0.8079220175743103, "step": 295 }, { "epoch": 0.12706480304955528, "grad_norm": 1.3424112355487237, "learning_rate": 1.9956616203792636e-05, "loss": 0.5135, "mean_token_accuracy": 0.806724363565445, "step": 300 }, { "epoch": 0.12918254976704785, "grad_norm": 1.5824773036593809, "learning_rate": 1.9949462656502588e-05, "loss": 0.5383, "mean_token_accuracy": 0.8001780599355698, "step": 305 }, { "epoch": 0.13130029648454045, "grad_norm": 1.5157834737082827, "learning_rate": 1.994176494700534e-05, "loss": 0.5466, "mean_token_accuracy": 0.7970251202583313, "step": 310 }, { "epoch": 0.13341804320203304, "grad_norm": 1.8369627378901519, "learning_rate": 1.993352349630882e-05, "loss": 0.5218, "mean_token_accuracy": 0.8072717070579529, "step": 315 }, { "epoch": 0.1355357899195256, "grad_norm": 1.5676620169867563, "learning_rate": 1.9924738755159573e-05, "loss": 0.5116, "mean_token_accuracy": 0.8025958120822907, "step": 320 }, { "epoch": 0.1376535366370182, "grad_norm": 1.5442271717658778, "learning_rate": 1.9915411204018137e-05, "loss": 0.495, "mean_token_accuracy": 0.8155842959880829, "step": 325 }, { "epoch": 0.1397712833545108, "grad_norm": 1.9104862823035134, "learning_rate": 1.9905541353032744e-05, "loss": 0.4707, "mean_token_accuracy": 0.8196403324604035, "step": 330 }, { "epoch": 0.14188903007200337, "grad_norm": 1.8843041038781683, "learning_rate": 1.9895129742011434e-05, "loss": 0.5359, "mean_token_accuracy": 0.8036209732294083, "step": 335 }, { "epoch": 0.14400677678949597, "grad_norm": 1.2996290243783448, "learning_rate": 1.9884176940392522e-05, "loss": 0.5355, "mean_token_accuracy": 0.7970023989677429, "step": 340 }, { "epoch": 0.14612452350698857, "grad_norm": 1.7409691547169837, "learning_rate": 1.9872683547213446e-05, "loss": 0.5222, "mean_token_accuracy": 0.8015773713588714, "step": 345 }, { "epoch": 0.14824227022448117, "grad_norm": 1.3236145792783143, "learning_rate": 1.9860650191078033e-05, "loss": 0.5165, "mean_token_accuracy": 0.8045854181051254, "step": 350 }, { "epoch": 0.15036001694197373, "grad_norm": 1.5674402609006048, "learning_rate": 1.9848077530122083e-05, "loss": 0.5141, "mean_token_accuracy": 0.8047444432973861, "step": 355 }, { "epoch": 0.15247776365946633, "grad_norm": 1.4948547674340282, "learning_rate": 1.98349662519774e-05, "loss": 0.493, "mean_token_accuracy": 0.8128765910863877, "step": 360 }, { "epoch": 0.15459551037695893, "grad_norm": 1.57285942684427, "learning_rate": 1.9821317073734173e-05, "loss": 0.5114, "mean_token_accuracy": 0.8024025142192841, "step": 365 }, { "epoch": 0.1567132570944515, "grad_norm": 1.3725667498479879, "learning_rate": 1.9807130741901756e-05, "loss": 0.5552, "mean_token_accuracy": 0.7975639194250107, "step": 370 }, { "epoch": 0.1588310038119441, "grad_norm": 1.6323326415858614, "learning_rate": 1.979240803236785e-05, "loss": 0.5101, "mean_token_accuracy": 0.8058428287506103, "step": 375 }, { "epoch": 0.1609487505294367, "grad_norm": 1.293657741608038, "learning_rate": 1.9777149750356044e-05, "loss": 0.4931, "mean_token_accuracy": 0.8156037211418152, "step": 380 }, { "epoch": 0.16306649724692926, "grad_norm": 1.584456213127757, "learning_rate": 1.9761356730381806e-05, "loss": 0.5066, "mean_token_accuracy": 0.8106210082769394, "step": 385 }, { "epoch": 0.16518424396442186, "grad_norm": 1.3531024564685128, "learning_rate": 1.9745029836206813e-05, "loss": 0.4862, "mean_token_accuracy": 0.8180296182632446, "step": 390 }, { "epoch": 0.16730199068191445, "grad_norm": 1.5992771952291873, "learning_rate": 1.9728169960791736e-05, "loss": 0.5158, "mean_token_accuracy": 0.8020082831382751, "step": 395 }, { "epoch": 0.16941973739940702, "grad_norm": 1.3875752393035827, "learning_rate": 1.9710778026247367e-05, "loss": 0.5268, "mean_token_accuracy": 0.8021057844161987, "step": 400 }, { "epoch": 0.17153748411689962, "grad_norm": 1.4892475787998831, "learning_rate": 1.9692854983784235e-05, "loss": 0.5031, "mean_token_accuracy": 0.8153967589139939, "step": 405 }, { "epoch": 0.17365523083439222, "grad_norm": 1.3435721015179996, "learning_rate": 1.9674401813660532e-05, "loss": 0.5151, "mean_token_accuracy": 0.8066144526004791, "step": 410 }, { "epoch": 0.17577297755188478, "grad_norm": 1.4757784795296558, "learning_rate": 1.9655419525128528e-05, "loss": 0.5197, "mean_token_accuracy": 0.8056324630975723, "step": 415 }, { "epoch": 0.17789072426937738, "grad_norm": 1.8586890907074842, "learning_rate": 1.9635909156379373e-05, "loss": 0.4817, "mean_token_accuracy": 0.8227346748113632, "step": 420 }, { "epoch": 0.18000847098686998, "grad_norm": 1.3338010634125226, "learning_rate": 1.9615871774486293e-05, "loss": 0.476, "mean_token_accuracy": 0.8171389639377594, "step": 425 }, { "epoch": 0.18212621770436255, "grad_norm": 1.467996639944381, "learning_rate": 1.959530847534627e-05, "loss": 0.4857, "mean_token_accuracy": 0.8151497721672059, "step": 430 }, { "epoch": 0.18424396442185514, "grad_norm": 1.482953746737999, "learning_rate": 1.9574220383620054e-05, "loss": 0.4922, "mean_token_accuracy": 0.8100210309028626, "step": 435 }, { "epoch": 0.18636171113934774, "grad_norm": 5.208401516653082, "learning_rate": 1.95526086526707e-05, "loss": 0.5263, "mean_token_accuracy": 0.8080328673124313, "step": 440 }, { "epoch": 0.1884794578568403, "grad_norm": 1.5834873689672437, "learning_rate": 1.9530474464500445e-05, "loss": 0.514, "mean_token_accuracy": 0.8094299465417862, "step": 445 }, { "epoch": 0.1905972045743329, "grad_norm": 1.3405671636751928, "learning_rate": 1.9507819029686094e-05, "loss": 0.5119, "mean_token_accuracy": 0.8087350040674209, "step": 450 }, { "epoch": 0.1927149512918255, "grad_norm": 1.3993020572279387, "learning_rate": 1.94846435873128e-05, "loss": 0.5153, "mean_token_accuracy": 0.8082747459411621, "step": 455 }, { "epoch": 0.19483269800931807, "grad_norm": 1.3011551512989479, "learning_rate": 1.9460949404906285e-05, "loss": 0.5028, "mean_token_accuracy": 0.8120961904525756, "step": 460 }, { "epoch": 0.19695044472681067, "grad_norm": 1.6479875272294309, "learning_rate": 1.9436737778363526e-05, "loss": 0.4787, "mean_token_accuracy": 0.8184203952550888, "step": 465 }, { "epoch": 0.19906819144430327, "grad_norm": 1.2952323822215526, "learning_rate": 1.9412010031881884e-05, "loss": 0.4811, "mean_token_accuracy": 0.8196297824382782, "step": 470 }, { "epoch": 0.20118593816179586, "grad_norm": 1.2434980503550659, "learning_rate": 1.9386767517886666e-05, "loss": 0.4992, "mean_token_accuracy": 0.8126247316598892, "step": 475 }, { "epoch": 0.20330368487928843, "grad_norm": 1.2749730489780189, "learning_rate": 1.9361011616957165e-05, "loss": 0.5013, "mean_token_accuracy": 0.8094296991825104, "step": 480 }, { "epoch": 0.20542143159678103, "grad_norm": 1.2801081991950354, "learning_rate": 1.933474373775115e-05, "loss": 0.4914, "mean_token_accuracy": 0.8103417336940766, "step": 485 }, { "epoch": 0.20753917831427363, "grad_norm": 1.3841139586738282, "learning_rate": 1.930796531692783e-05, "loss": 0.503, "mean_token_accuracy": 0.8150111019611359, "step": 490 }, { "epoch": 0.2096569250317662, "grad_norm": 1.2895819374549709, "learning_rate": 1.9280677819069273e-05, "loss": 0.4938, "mean_token_accuracy": 0.8058139503002166, "step": 495 }, { "epoch": 0.2117746717492588, "grad_norm": 1.2705506609214867, "learning_rate": 1.9252882736600302e-05, "loss": 0.5041, "mean_token_accuracy": 0.8078715801239014, "step": 500 }, { "epoch": 0.2138924184667514, "grad_norm": 1.3700128773821674, "learning_rate": 1.922458158970688e-05, "loss": 0.5122, "mean_token_accuracy": 0.805089196562767, "step": 505 }, { "epoch": 0.21601016518424396, "grad_norm": 1.4292612681859336, "learning_rate": 1.9195775926252952e-05, "loss": 0.4799, "mean_token_accuracy": 0.8134547978639602, "step": 510 }, { "epoch": 0.21812791190173655, "grad_norm": 2.589810653355124, "learning_rate": 1.91664673216958e-05, "loss": 0.4686, "mean_token_accuracy": 0.8232874065637589, "step": 515 }, { "epoch": 0.22024565861922915, "grad_norm": 1.4425686621750156, "learning_rate": 1.913665737899988e-05, "loss": 0.4885, "mean_token_accuracy": 0.815599313378334, "step": 520 }, { "epoch": 0.22236340533672172, "grad_norm": 1.4823410740282665, "learning_rate": 1.9106347728549134e-05, "loss": 0.4832, "mean_token_accuracy": 0.8109551817178726, "step": 525 }, { "epoch": 0.22448115205421432, "grad_norm": 1.1459009249468546, "learning_rate": 1.9075540028057844e-05, "loss": 0.5156, "mean_token_accuracy": 0.8015700995922088, "step": 530 }, { "epoch": 0.2265988987717069, "grad_norm": 1.273350806844229, "learning_rate": 1.9044235962479945e-05, "loss": 0.4901, "mean_token_accuracy": 0.8163118690252305, "step": 535 }, { "epoch": 0.22871664548919948, "grad_norm": 1.2736969034780394, "learning_rate": 1.9012437243916895e-05, "loss": 0.475, "mean_token_accuracy": 0.8155727684497833, "step": 540 }, { "epoch": 0.23083439220669208, "grad_norm": 1.1644155017049156, "learning_rate": 1.8980145611523996e-05, "loss": 0.5041, "mean_token_accuracy": 0.8130400031805038, "step": 545 }, { "epoch": 0.23295213892418468, "grad_norm": 1.3543018612133357, "learning_rate": 1.8947362831415327e-05, "loss": 0.4668, "mean_token_accuracy": 0.8260669410228729, "step": 550 }, { "epoch": 0.23506988564167725, "grad_norm": 1.2391111005758269, "learning_rate": 1.8914090696567104e-05, "loss": 0.4809, "mean_token_accuracy": 0.8127309769392014, "step": 555 }, { "epoch": 0.23718763235916984, "grad_norm": 2.2015980143710583, "learning_rate": 1.888033102671965e-05, "loss": 0.4922, "mean_token_accuracy": 0.8155588954687119, "step": 560 }, { "epoch": 0.23930537907666244, "grad_norm": 1.2198454979455773, "learning_rate": 1.884608566827785e-05, "loss": 0.5168, "mean_token_accuracy": 0.8062847316265106, "step": 565 }, { "epoch": 0.241423125794155, "grad_norm": 1.184969374617232, "learning_rate": 1.8811356494210166e-05, "loss": 0.4805, "mean_token_accuracy": 0.8132707148790359, "step": 570 }, { "epoch": 0.2435408725116476, "grad_norm": 1.187126766493632, "learning_rate": 1.8776145403946226e-05, "loss": 0.4955, "mean_token_accuracy": 0.8102918237447738, "step": 575 }, { "epoch": 0.2456586192291402, "grad_norm": 1.3821096957818944, "learning_rate": 1.874045432327289e-05, "loss": 0.4985, "mean_token_accuracy": 0.8098550081253052, "step": 580 }, { "epoch": 0.24777636594663277, "grad_norm": 1.214604218577671, "learning_rate": 1.8704285204228973e-05, "loss": 0.4627, "mean_token_accuracy": 0.8165160745382309, "step": 585 }, { "epoch": 0.24989411266412537, "grad_norm": 1.4526314855211653, "learning_rate": 1.866764002499846e-05, "loss": 0.4909, "mean_token_accuracy": 0.8122711658477784, "step": 590 }, { "epoch": 0.25201185938161796, "grad_norm": 1.1543877428891598, "learning_rate": 1.8630520789802308e-05, "loss": 0.4782, "mean_token_accuracy": 0.8182896554470063, "step": 595 }, { "epoch": 0.25412960609911056, "grad_norm": 1.3086338857944744, "learning_rate": 1.8592929528788844e-05, "loss": 0.4753, "mean_token_accuracy": 0.8180733859539032, "step": 600 }, { "epoch": 0.25624735281660316, "grad_norm": 1.3557276365311686, "learning_rate": 1.8554868297922728e-05, "loss": 0.4708, "mean_token_accuracy": 0.8193376958370209, "step": 605 }, { "epoch": 0.2583650995340957, "grad_norm": 1.2996719117152657, "learning_rate": 1.8516339178872492e-05, "loss": 0.4518, "mean_token_accuracy": 0.8204487860202789, "step": 610 }, { "epoch": 0.2604828462515883, "grad_norm": 1.3696724777806233, "learning_rate": 1.8477344278896708e-05, "loss": 0.5072, "mean_token_accuracy": 0.8076569020748139, "step": 615 }, { "epoch": 0.2626005929690809, "grad_norm": 1.2308629288247015, "learning_rate": 1.8437885730728738e-05, "loss": 0.5113, "mean_token_accuracy": 0.8088377475738525, "step": 620 }, { "epoch": 0.2647183396865735, "grad_norm": 1.2397238918015017, "learning_rate": 1.839796569246006e-05, "loss": 0.494, "mean_token_accuracy": 0.8118572622537613, "step": 625 }, { "epoch": 0.2668360864040661, "grad_norm": 1.3479748389387212, "learning_rate": 1.8357586347422266e-05, "loss": 0.5081, "mean_token_accuracy": 0.8135558038949966, "step": 630 }, { "epoch": 0.2689538331215587, "grad_norm": 1.1063564395200467, "learning_rate": 1.8316749904067637e-05, "loss": 0.4653, "mean_token_accuracy": 0.8218313783407212, "step": 635 }, { "epoch": 0.2710715798390512, "grad_norm": 1.1492824512346658, "learning_rate": 1.8275458595848376e-05, "loss": 0.4817, "mean_token_accuracy": 0.8135390222072602, "step": 640 }, { "epoch": 0.2731893265565438, "grad_norm": 1.4159749106872088, "learning_rate": 1.8233714681094405e-05, "loss": 0.4616, "mean_token_accuracy": 0.8250806093215942, "step": 645 }, { "epoch": 0.2753070732740364, "grad_norm": 1.1611107224498594, "learning_rate": 1.819152044288992e-05, "loss": 0.488, "mean_token_accuracy": 0.8166846603155136, "step": 650 }, { "epoch": 0.277424819991529, "grad_norm": 1.3205339840836507, "learning_rate": 1.814887818894846e-05, "loss": 0.5036, "mean_token_accuracy": 0.810426139831543, "step": 655 }, { "epoch": 0.2795425667090216, "grad_norm": 1.2642547117014469, "learning_rate": 1.810579025148674e-05, "loss": 0.5063, "mean_token_accuracy": 0.8112012058496475, "step": 660 }, { "epoch": 0.2816603134265142, "grad_norm": 5.33401159048522, "learning_rate": 1.8062258987097062e-05, "loss": 0.4478, "mean_token_accuracy": 0.8289118260145187, "step": 665 }, { "epoch": 0.28377806014400675, "grad_norm": 1.3752087188227111, "learning_rate": 1.8018286776618446e-05, "loss": 0.4963, "mean_token_accuracy": 0.8137694984674454, "step": 670 }, { "epoch": 0.28589580686149935, "grad_norm": 1.176266427707403, "learning_rate": 1.7973876025006407e-05, "loss": 0.4976, "mean_token_accuracy": 0.8188654541969299, "step": 675 }, { "epoch": 0.28801355357899194, "grad_norm": 1.331341038204072, "learning_rate": 1.792902916120143e-05, "loss": 0.4939, "mean_token_accuracy": 0.8163222283124923, "step": 680 }, { "epoch": 0.29013130029648454, "grad_norm": 1.1914829607255677, "learning_rate": 1.7883748637996113e-05, "loss": 0.4881, "mean_token_accuracy": 0.8130565702915191, "step": 685 }, { "epoch": 0.29224904701397714, "grad_norm": 1.2277506964948814, "learning_rate": 1.7838036931901033e-05, "loss": 0.4559, "mean_token_accuracy": 0.824514701962471, "step": 690 }, { "epoch": 0.29436679373146973, "grad_norm": 1.0800320597549389, "learning_rate": 1.7791896543009282e-05, "loss": 0.4891, "mean_token_accuracy": 0.8174144089221954, "step": 695 }, { "epoch": 0.29648454044896233, "grad_norm": 1.5694294317697621, "learning_rate": 1.7745329994859746e-05, "loss": 0.4914, "mean_token_accuracy": 0.8185641199350357, "step": 700 }, { "epoch": 0.29860228716645487, "grad_norm": 1.1923041867729132, "learning_rate": 1.7698339834299064e-05, "loss": 0.5008, "mean_token_accuracy": 0.8142161637544632, "step": 705 }, { "epoch": 0.30072003388394747, "grad_norm": 1.3729946102267174, "learning_rate": 1.7650928631342364e-05, "loss": 0.4845, "mean_token_accuracy": 0.8133604645729064, "step": 710 }, { "epoch": 0.30283778060144007, "grad_norm": 1.174456646604131, "learning_rate": 1.7603098979032683e-05, "loss": 0.4777, "mean_token_accuracy": 0.813685166835785, "step": 715 }, { "epoch": 0.30495552731893266, "grad_norm": 1.158532302748484, "learning_rate": 1.7554853493299142e-05, "loss": 0.504, "mean_token_accuracy": 0.8088937163352966, "step": 720 }, { "epoch": 0.30707327403642526, "grad_norm": 1.2620596837516858, "learning_rate": 1.7506194812813896e-05, "loss": 0.4817, "mean_token_accuracy": 0.8206409096717835, "step": 725 }, { "epoch": 0.30919102075391786, "grad_norm": 1.148012521360775, "learning_rate": 1.74571255988478e-05, "loss": 0.4819, "mean_token_accuracy": 0.812398812174797, "step": 730 }, { "epoch": 0.3113087674714104, "grad_norm": 1.2373133691587057, "learning_rate": 1.740764853512485e-05, "loss": 0.49, "mean_token_accuracy": 0.8143349289894104, "step": 735 }, { "epoch": 0.313426514188903, "grad_norm": 2.100740115519466, "learning_rate": 1.7357766327675433e-05, "loss": 0.4651, "mean_token_accuracy": 0.8216336488723754, "step": 740 }, { "epoch": 0.3155442609063956, "grad_norm": 1.4189894877798284, "learning_rate": 1.73074817046883e-05, "loss": 0.4801, "mean_token_accuracy": 0.8188165038824081, "step": 745 }, { "epoch": 0.3176620076238882, "grad_norm": 1.2994480429040771, "learning_rate": 1.725679741636136e-05, "loss": 0.4614, "mean_token_accuracy": 0.8237657248973846, "step": 750 }, { "epoch": 0.3197797543413808, "grad_norm": 1.2308603791930401, "learning_rate": 1.720571623475128e-05, "loss": 0.492, "mean_token_accuracy": 0.8165101200342179, "step": 755 }, { "epoch": 0.3218975010588734, "grad_norm": 1.3843077010151197, "learning_rate": 1.7154240953621844e-05, "loss": 0.4564, "mean_token_accuracy": 0.825025874376297, "step": 760 }, { "epoch": 0.3240152477763659, "grad_norm": 1.1848129565884666, "learning_rate": 1.7102374388291182e-05, "loss": 0.4575, "mean_token_accuracy": 0.8252220988273621, "step": 765 }, { "epoch": 0.3261329944938585, "grad_norm": 1.3217187216198285, "learning_rate": 1.705011937547779e-05, "loss": 0.4629, "mean_token_accuracy": 0.8198304086923599, "step": 770 }, { "epoch": 0.3282507412113511, "grad_norm": 1.3851637896221318, "learning_rate": 1.6997478773145363e-05, "loss": 0.4337, "mean_token_accuracy": 0.8338131695985794, "step": 775 }, { "epoch": 0.3303684879288437, "grad_norm": 1.423775789920787, "learning_rate": 1.6944455460346503e-05, "loss": 0.4807, "mean_token_accuracy": 0.8188902169466019, "step": 780 }, { "epoch": 0.3324862346463363, "grad_norm": 1.3680154210297841, "learning_rate": 1.6891052337065256e-05, "loss": 0.4841, "mean_token_accuracy": 0.8188378721475601, "step": 785 }, { "epoch": 0.3346039813638289, "grad_norm": 1.1670007538420892, "learning_rate": 1.6837272324058487e-05, "loss": 0.4209, "mean_token_accuracy": 0.8359328061342239, "step": 790 }, { "epoch": 0.33672172808132145, "grad_norm": 1.2238185684348435, "learning_rate": 1.6783118362696162e-05, "loss": 0.4687, "mean_token_accuracy": 0.8194981902837754, "step": 795 }, { "epoch": 0.33883947479881404, "grad_norm": 1.3104844364549155, "learning_rate": 1.672859341480046e-05, "loss": 0.4605, "mean_token_accuracy": 0.8169092148542404, "step": 800 }, { "epoch": 0.34095722151630664, "grad_norm": 1.1074420443801423, "learning_rate": 1.6673700462483776e-05, "loss": 0.4424, "mean_token_accuracy": 0.8315922617912292, "step": 805 }, { "epoch": 0.34307496823379924, "grad_norm": 1.2002465546594834, "learning_rate": 1.661844250798565e-05, "loss": 0.4773, "mean_token_accuracy": 0.8234172344207764, "step": 810 }, { "epoch": 0.34519271495129183, "grad_norm": 1.3643314568341807, "learning_rate": 1.6562822573508533e-05, "loss": 0.4803, "mean_token_accuracy": 0.8155502796173095, "step": 815 }, { "epoch": 0.34731046166878443, "grad_norm": 1.1653511889703811, "learning_rate": 1.650684370105252e-05, "loss": 0.4907, "mean_token_accuracy": 0.8095988690853119, "step": 820 }, { "epoch": 0.34942820838627703, "grad_norm": 1.2052540958169133, "learning_rate": 1.6450508952248957e-05, "loss": 0.4664, "mean_token_accuracy": 0.8265933513641357, "step": 825 }, { "epoch": 0.35154595510376957, "grad_norm": 1.5477552328113091, "learning_rate": 1.6393821408193007e-05, "loss": 0.4783, "mean_token_accuracy": 0.8169477820396424, "step": 830 }, { "epoch": 0.35366370182126217, "grad_norm": 1.8070494772139423, "learning_rate": 1.6336784169275132e-05, "loss": 0.454, "mean_token_accuracy": 0.8248355984687805, "step": 835 }, { "epoch": 0.35578144853875476, "grad_norm": 1.2257376390653825, "learning_rate": 1.627940035501152e-05, "loss": 0.4506, "mean_token_accuracy": 0.8257219165563583, "step": 840 }, { "epoch": 0.35789919525624736, "grad_norm": 1.3198794046839721, "learning_rate": 1.6221673103873474e-05, "loss": 0.4427, "mean_token_accuracy": 0.8296634495258332, "step": 845 }, { "epoch": 0.36001694197373996, "grad_norm": 2.109231295857473, "learning_rate": 1.616360557311575e-05, "loss": 0.489, "mean_token_accuracy": 0.8102859228849411, "step": 850 }, { "epoch": 0.36213468869123255, "grad_norm": 1.1872292152679083, "learning_rate": 1.6105200938603917e-05, "loss": 0.4681, "mean_token_accuracy": 0.8261395335197449, "step": 855 }, { "epoch": 0.3642524354087251, "grad_norm": 1.214005452933459, "learning_rate": 1.60464623946406e-05, "loss": 0.4852, "mean_token_accuracy": 0.8179385870695114, "step": 860 }, { "epoch": 0.3663701821262177, "grad_norm": 1.0907256335398452, "learning_rate": 1.5987393153790832e-05, "loss": 0.4623, "mean_token_accuracy": 0.8248693764209747, "step": 865 }, { "epoch": 0.3684879288437103, "grad_norm": 1.061691508146564, "learning_rate": 1.5927996446706308e-05, "loss": 0.4803, "mean_token_accuracy": 0.8169174045324326, "step": 870 }, { "epoch": 0.3706056755612029, "grad_norm": 1.1759352091149649, "learning_rate": 1.5868275521948726e-05, "loss": 0.4563, "mean_token_accuracy": 0.8279780805110931, "step": 875 }, { "epoch": 0.3727234222786955, "grad_norm": 1.2135030886876705, "learning_rate": 1.5808233645812087e-05, "loss": 0.4418, "mean_token_accuracy": 0.8301020473241806, "step": 880 }, { "epoch": 0.3748411689961881, "grad_norm": 1.1266881444254488, "learning_rate": 1.5747874102144073e-05, "loss": 0.4626, "mean_token_accuracy": 0.8214969336986542, "step": 885 }, { "epoch": 0.3769589157136806, "grad_norm": 1.0911244736489776, "learning_rate": 1.5687200192166424e-05, "loss": 0.4635, "mean_token_accuracy": 0.8221491903066636, "step": 890 }, { "epoch": 0.3790766624311732, "grad_norm": 1.0852849507203284, "learning_rate": 1.5626215234294416e-05, "loss": 0.451, "mean_token_accuracy": 0.8251518607139587, "step": 895 }, { "epoch": 0.3811944091486658, "grad_norm": 1.1215853338868707, "learning_rate": 1.5564922563955337e-05, "loss": 0.4608, "mean_token_accuracy": 0.8237892210483551, "step": 900 }, { "epoch": 0.3833121558661584, "grad_norm": 0.9235255903522734, "learning_rate": 1.5503325533406076e-05, "loss": 0.4676, "mean_token_accuracy": 0.8222286731004715, "step": 905 }, { "epoch": 0.385429902583651, "grad_norm": 1.0494173037764836, "learning_rate": 1.5441427511549795e-05, "loss": 0.4652, "mean_token_accuracy": 0.8235789179801941, "step": 910 }, { "epoch": 0.3875476493011436, "grad_norm": 1.2934333868332708, "learning_rate": 1.537923188375164e-05, "loss": 0.459, "mean_token_accuracy": 0.8253506690263748, "step": 915 }, { "epoch": 0.38966539601863615, "grad_norm": 1.045643086378396, "learning_rate": 1.5316742051653624e-05, "loss": 0.4487, "mean_token_accuracy": 0.8300421804189682, "step": 920 }, { "epoch": 0.39178314273612874, "grad_norm": 1.0549731687620314, "learning_rate": 1.5253961432988548e-05, "loss": 0.4756, "mean_token_accuracy": 0.8141780078411103, "step": 925 }, { "epoch": 0.39390088945362134, "grad_norm": 1.1263426428393677, "learning_rate": 1.5190893461393108e-05, "loss": 0.4698, "mean_token_accuracy": 0.8173887878656387, "step": 930 }, { "epoch": 0.39601863617111394, "grad_norm": 1.1982411204873675, "learning_rate": 1.5127541586220077e-05, "loss": 0.4595, "mean_token_accuracy": 0.8246693462133408, "step": 935 }, { "epoch": 0.39813638288860653, "grad_norm": 1.331125977750805, "learning_rate": 1.5063909272349664e-05, "loss": 0.466, "mean_token_accuracy": 0.8266402333974838, "step": 940 }, { "epoch": 0.40025412960609913, "grad_norm": 1.165754254305497, "learning_rate": 1.5000000000000002e-05, "loss": 0.435, "mean_token_accuracy": 0.8271202713251113, "step": 945 }, { "epoch": 0.4023718763235917, "grad_norm": 1.1585938088360928, "learning_rate": 1.4935817264536809e-05, "loss": 0.4386, "mean_token_accuracy": 0.8255492657423019, "step": 950 }, { "epoch": 0.40448962304108427, "grad_norm": 1.1542702135186313, "learning_rate": 1.4871364576282223e-05, "loss": 0.4769, "mean_token_accuracy": 0.8163278847932816, "step": 955 }, { "epoch": 0.40660736975857686, "grad_norm": 1.1855267108232739, "learning_rate": 1.4806645460322804e-05, "loss": 0.4938, "mean_token_accuracy": 0.8140994518995285, "step": 960 }, { "epoch": 0.40872511647606946, "grad_norm": 1.0583179034757253, "learning_rate": 1.4741663456316742e-05, "loss": 0.4694, "mean_token_accuracy": 0.8194496780633926, "step": 965 }, { "epoch": 0.41084286319356206, "grad_norm": 1.2166297794886325, "learning_rate": 1.4676422118300266e-05, "loss": 0.4583, "mean_token_accuracy": 0.8240072697401046, "step": 970 }, { "epoch": 0.41296060991105465, "grad_norm": 1.2077033819076497, "learning_rate": 1.461092501449326e-05, "loss": 0.4683, "mean_token_accuracy": 0.8127462983131408, "step": 975 }, { "epoch": 0.41507835662854725, "grad_norm": 1.2024839451726628, "learning_rate": 1.4545175727104113e-05, "loss": 0.4746, "mean_token_accuracy": 0.817327806353569, "step": 980 }, { "epoch": 0.4171961033460398, "grad_norm": 43.895890122529586, "learning_rate": 1.4479177852133787e-05, "loss": 0.4339, "mean_token_accuracy": 0.83043053150177, "step": 985 }, { "epoch": 0.4193138500635324, "grad_norm": 1.3761452239892333, "learning_rate": 1.4412934999179169e-05, "loss": 0.4682, "mean_token_accuracy": 0.82216075360775, "step": 990 }, { "epoch": 0.421431596781025, "grad_norm": 9.553572882081992, "learning_rate": 1.4346450791235611e-05, "loss": 0.425, "mean_token_accuracy": 0.8346862554550171, "step": 995 }, { "epoch": 0.4235493434985176, "grad_norm": 1.19535922636142, "learning_rate": 1.427972886449882e-05, "loss": 0.4916, "mean_token_accuracy": 0.8201052099466324, "step": 1000 }, { "epoch": 0.4256670902160102, "grad_norm": 1.487407000401354, "learning_rate": 1.4212772868165957e-05, "loss": 0.4759, "mean_token_accuracy": 0.8201690822839737, "step": 1005 }, { "epoch": 0.4277848369335028, "grad_norm": 1.2209557581398112, "learning_rate": 1.4145586464236074e-05, "loss": 0.4776, "mean_token_accuracy": 0.8144995361566544, "step": 1010 }, { "epoch": 0.4299025836509953, "grad_norm": 1.4175123984588354, "learning_rate": 1.4078173327309807e-05, "loss": 0.4697, "mean_token_accuracy": 0.820775744318962, "step": 1015 }, { "epoch": 0.4320203303684879, "grad_norm": 1.2129818965934513, "learning_rate": 1.4010537144388416e-05, "loss": 0.463, "mean_token_accuracy": 0.8259893089532853, "step": 1020 }, { "epoch": 0.4341380770859805, "grad_norm": 1.12010970838833, "learning_rate": 1.3942681614672144e-05, "loss": 0.4629, "mean_token_accuracy": 0.8218669801950454, "step": 1025 }, { "epoch": 0.4362558238034731, "grad_norm": 1.1464961804103622, "learning_rate": 1.3874610449357873e-05, "loss": 0.4238, "mean_token_accuracy": 0.8335713416337966, "step": 1030 }, { "epoch": 0.4383735705209657, "grad_norm": 1.1351310993680606, "learning_rate": 1.3806327371436159e-05, "loss": 0.4394, "mean_token_accuracy": 0.8307629436254501, "step": 1035 }, { "epoch": 0.4404913172384583, "grad_norm": 1.1188266853744508, "learning_rate": 1.3737836115487624e-05, "loss": 0.4663, "mean_token_accuracy": 0.8193978488445282, "step": 1040 }, { "epoch": 0.44260906395595084, "grad_norm": 1.1620199858915772, "learning_rate": 1.3669140427478693e-05, "loss": 0.4705, "mean_token_accuracy": 0.8229668527841568, "step": 1045 }, { "epoch": 0.44472681067344344, "grad_norm": 1.110101616240863, "learning_rate": 1.3600244064556702e-05, "loss": 0.4747, "mean_token_accuracy": 0.8179006308317185, "step": 1050 }, { "epoch": 0.44684455739093604, "grad_norm": 1.2783615446297392, "learning_rate": 1.353115079484444e-05, "loss": 0.4458, "mean_token_accuracy": 0.8308207571506501, "step": 1055 }, { "epoch": 0.44896230410842863, "grad_norm": 1.1007302332610067, "learning_rate": 1.3461864397234041e-05, "loss": 0.4598, "mean_token_accuracy": 0.8242943733930588, "step": 1060 }, { "epoch": 0.45108005082592123, "grad_norm": 1.2199483732995027, "learning_rate": 1.3392388661180303e-05, "loss": 0.445, "mean_token_accuracy": 0.824502220749855, "step": 1065 }, { "epoch": 0.4531977975434138, "grad_norm": 1.0010509955815885, "learning_rate": 1.332272738649345e-05, "loss": 0.4583, "mean_token_accuracy": 0.8303744524717331, "step": 1070 }, { "epoch": 0.45531554426090637, "grad_norm": 1.918284418636839, "learning_rate": 1.325288438313129e-05, "loss": 0.4269, "mean_token_accuracy": 0.8296439230442048, "step": 1075 }, { "epoch": 0.45743329097839897, "grad_norm": 1.1887902164021535, "learning_rate": 1.318286347099086e-05, "loss": 0.4625, "mean_token_accuracy": 0.8217881232500076, "step": 1080 }, { "epoch": 0.45955103769589156, "grad_norm": 1.1360766453253965, "learning_rate": 1.3112668479699486e-05, "loss": 0.4589, "mean_token_accuracy": 0.8269425123929978, "step": 1085 }, { "epoch": 0.46166878441338416, "grad_norm": 1.2399254503178083, "learning_rate": 1.3042303248405346e-05, "loss": 0.4555, "mean_token_accuracy": 0.8309968203306198, "step": 1090 }, { "epoch": 0.46378653113087676, "grad_norm": 1.0508779611719044, "learning_rate": 1.297177162556748e-05, "loss": 0.4545, "mean_token_accuracy": 0.824161484837532, "step": 1095 }, { "epoch": 0.46590427784836935, "grad_norm": 1.0822262810815348, "learning_rate": 1.2901077468745329e-05, "loss": 0.4571, "mean_token_accuracy": 0.8281063556671142, "step": 1100 }, { "epoch": 0.46802202456586195, "grad_norm": 1.0744745429140576, "learning_rate": 1.2830224644387742e-05, "loss": 0.471, "mean_token_accuracy": 0.8183866649866104, "step": 1105 }, { "epoch": 0.4701397712833545, "grad_norm": 1.2108459211634035, "learning_rate": 1.2759217027621507e-05, "loss": 0.4445, "mean_token_accuracy": 0.8313823521137238, "step": 1110 }, { "epoch": 0.4722575180008471, "grad_norm": 1.1385271166035913, "learning_rate": 1.2688058502039416e-05, "loss": 0.4724, "mean_token_accuracy": 0.8208224922418594, "step": 1115 }, { "epoch": 0.4743752647183397, "grad_norm": 1.1608922255857643, "learning_rate": 1.261675295948786e-05, "loss": 0.4402, "mean_token_accuracy": 0.8260656505823135, "step": 1120 }, { "epoch": 0.4764930114358323, "grad_norm": 1.2001870807148136, "learning_rate": 1.2545304299853977e-05, "loss": 0.4676, "mean_token_accuracy": 0.8217555999755859, "step": 1125 }, { "epoch": 0.4786107581533249, "grad_norm": 1.099496727008847, "learning_rate": 1.2473716430852353e-05, "loss": 0.436, "mean_token_accuracy": 0.8312188684940338, "step": 1130 }, { "epoch": 0.4807285048708175, "grad_norm": 2.032998570634967, "learning_rate": 1.2401993267811293e-05, "loss": 0.4317, "mean_token_accuracy": 0.8295620054006576, "step": 1135 }, { "epoch": 0.48284625158831, "grad_norm": 1.1812725212971202, "learning_rate": 1.2330138733458693e-05, "loss": 0.4156, "mean_token_accuracy": 0.8353513538837433, "step": 1140 }, { "epoch": 0.4849639983058026, "grad_norm": 1.138821301405385, "learning_rate": 1.2258156757707496e-05, "loss": 0.4506, "mean_token_accuracy": 0.8284595161676407, "step": 1145 }, { "epoch": 0.4870817450232952, "grad_norm": 1.039456646381961, "learning_rate": 1.2186051277440739e-05, "loss": 0.4281, "mean_token_accuracy": 0.8340547412633896, "step": 1150 }, { "epoch": 0.4891994917407878, "grad_norm": 1.0935441587184827, "learning_rate": 1.2113826236296245e-05, "loss": 0.4368, "mean_token_accuracy": 0.8294982463121414, "step": 1155 }, { "epoch": 0.4913172384582804, "grad_norm": 1.0601849025025707, "learning_rate": 1.2041485584450945e-05, "loss": 0.4496, "mean_token_accuracy": 0.8288684636354446, "step": 1160 }, { "epoch": 0.493434985175773, "grad_norm": 1.1432826242197904, "learning_rate": 1.1969033278404816e-05, "loss": 0.472, "mean_token_accuracy": 0.8184500396251678, "step": 1165 }, { "epoch": 0.49555273189326554, "grad_norm": 1.178255399480397, "learning_rate": 1.1896473280764498e-05, "loss": 0.453, "mean_token_accuracy": 0.82464899122715, "step": 1170 }, { "epoch": 0.49767047861075814, "grad_norm": 1.2123556499205794, "learning_rate": 1.1823809560026558e-05, "loss": 0.442, "mean_token_accuracy": 0.8262520909309388, "step": 1175 }, { "epoch": 0.49978822532825073, "grad_norm": 1.490671459887953, "learning_rate": 1.175104609036047e-05, "loss": 0.4493, "mean_token_accuracy": 0.8295370072126389, "step": 1180 }, { "epoch": 0.5019059720457433, "grad_norm": 3.5058816478434993, "learning_rate": 1.1678186851391218e-05, "loss": 0.4593, "mean_token_accuracy": 0.8269213020801545, "step": 1185 }, { "epoch": 0.5040237187632359, "grad_norm": 1.1384716073513477, "learning_rate": 1.1605235827981673e-05, "loss": 0.4463, "mean_token_accuracy": 0.8314786165952682, "step": 1190 }, { "epoch": 0.5061414654807285, "grad_norm": 1.1752572701433124, "learning_rate": 1.1532197010014636e-05, "loss": 0.4453, "mean_token_accuracy": 0.8288865953683853, "step": 1195 }, { "epoch": 0.5082592121982211, "grad_norm": 1.0006379736398943, "learning_rate": 1.1459074392174619e-05, "loss": 0.4293, "mean_token_accuracy": 0.8350226402282714, "step": 1200 }, { "epoch": 0.5103769589157137, "grad_norm": 1.1784455736187447, "learning_rate": 1.138587197372937e-05, "loss": 0.4612, "mean_token_accuracy": 0.8215854614973068, "step": 1205 }, { "epoch": 0.5124947056332063, "grad_norm": 1.1048766566547503, "learning_rate": 1.1312593758311143e-05, "loss": 0.4279, "mean_token_accuracy": 0.8407860666513443, "step": 1210 }, { "epoch": 0.5146124523506989, "grad_norm": 1.0718700385713946, "learning_rate": 1.1239243753697728e-05, "loss": 0.4288, "mean_token_accuracy": 0.8378984898328781, "step": 1215 }, { "epoch": 0.5167301990681914, "grad_norm": 1.558568433227081, "learning_rate": 1.1165825971593251e-05, "loss": 0.4678, "mean_token_accuracy": 0.825000548362732, "step": 1220 }, { "epoch": 0.518847945785684, "grad_norm": 1.082392246698731, "learning_rate": 1.1092344427408767e-05, "loss": 0.4276, "mean_token_accuracy": 0.8359992414712906, "step": 1225 }, { "epoch": 0.5209656925031766, "grad_norm": 1.256334909576375, "learning_rate": 1.1018803140042651e-05, "loss": 0.4633, "mean_token_accuracy": 0.8229638338088989, "step": 1230 }, { "epoch": 0.5230834392206692, "grad_norm": 1.303814596864245, "learning_rate": 1.0945206131660787e-05, "loss": 0.469, "mean_token_accuracy": 0.8193328499794006, "step": 1235 }, { "epoch": 0.5252011859381618, "grad_norm": 1.0507039996160834, "learning_rate": 1.0871557427476585e-05, "loss": 0.4414, "mean_token_accuracy": 0.8317544460296631, "step": 1240 }, { "epoch": 0.5273189326556544, "grad_norm": 1.015866344156703, "learning_rate": 1.0797861055530832e-05, "loss": 0.428, "mean_token_accuracy": 0.8305379122495651, "step": 1245 }, { "epoch": 0.529436679373147, "grad_norm": 1.1624992956977676, "learning_rate": 1.07241210464714e-05, "loss": 0.467, "mean_token_accuracy": 0.820591053366661, "step": 1250 }, { "epoch": 0.5315544260906395, "grad_norm": 1.2782647412686758, "learning_rate": 1.0650341433332778e-05, "loss": 0.4689, "mean_token_accuracy": 0.8219984292984008, "step": 1255 }, { "epoch": 0.5336721728081322, "grad_norm": 1.1784870838731618, "learning_rate": 1.0576526251315515e-05, "loss": 0.4596, "mean_token_accuracy": 0.8260756641626358, "step": 1260 }, { "epoch": 0.5357899195256247, "grad_norm": 1.1204805080469906, "learning_rate": 1.0502679537565507e-05, "loss": 0.442, "mean_token_accuracy": 0.8296466141939163, "step": 1265 }, { "epoch": 0.5379076662431174, "grad_norm": 1.0718296420595828, "learning_rate": 1.0428805330953209e-05, "loss": 0.4215, "mean_token_accuracy": 0.8308669030666351, "step": 1270 }, { "epoch": 0.5400254129606099, "grad_norm": 1.1125024136410944, "learning_rate": 1.0354907671852733e-05, "loss": 0.4363, "mean_token_accuracy": 0.8332655102014541, "step": 1275 }, { "epoch": 0.5421431596781024, "grad_norm": 1.090167844275342, "learning_rate": 1.0280990601920863e-05, "loss": 0.4435, "mean_token_accuracy": 0.8282716870307922, "step": 1280 }, { "epoch": 0.5442609063955951, "grad_norm": 1.0290238619990948, "learning_rate": 1.0207058163876021e-05, "loss": 0.4413, "mean_token_accuracy": 0.8311887979507446, "step": 1285 }, { "epoch": 0.5463786531130876, "grad_norm": 1.0778232888370207, "learning_rate": 1.013311440127714e-05, "loss": 0.4386, "mean_token_accuracy": 0.8266764581203461, "step": 1290 }, { "epoch": 0.5484963998305803, "grad_norm": 1.1219731141973122, "learning_rate": 1.0059163358302537e-05, "loss": 0.4103, "mean_token_accuracy": 0.8391000181436539, "step": 1295 }, { "epoch": 0.5506141465480728, "grad_norm": 1.1468466517999107, "learning_rate": 9.9852090795287e-06, "loss": 0.4391, "mean_token_accuracy": 0.8361193478107453, "step": 1300 }, { "epoch": 0.5527318932655655, "grad_norm": 1.0284132663014267, "learning_rate": 9.911255609709089e-06, "loss": 0.4409, "mean_token_accuracy": 0.8269284754991532, "step": 1305 }, { "epoch": 0.554849639983058, "grad_norm": 1.0310999165822667, "learning_rate": 9.83730699355294e-06, "loss": 0.4071, "mean_token_accuracy": 0.835135304927826, "step": 1310 }, { "epoch": 0.5569673867005506, "grad_norm": 1.2728900066425748, "learning_rate": 9.76336727550401e-06, "loss": 0.4601, "mean_token_accuracy": 0.8267913639545441, "step": 1315 }, { "epoch": 0.5590851334180432, "grad_norm": 1.2269899407592741, "learning_rate": 9.689440499519395e-06, "loss": 0.4322, "mean_token_accuracy": 0.8314703017473221, "step": 1320 }, { "epoch": 0.5612028801355358, "grad_norm": 1.1418757049837882, "learning_rate": 9.615530708848373e-06, "loss": 0.4231, "mean_token_accuracy": 0.8340400338172913, "step": 1325 }, { "epoch": 0.5633206268530284, "grad_norm": 1.1108149486798655, "learning_rate": 9.541641945811233e-06, "loss": 0.4492, "mean_token_accuracy": 0.8232677519321442, "step": 1330 }, { "epoch": 0.565438373570521, "grad_norm": 1.1088127297572268, "learning_rate": 9.467778251578217e-06, "loss": 0.4549, "mean_token_accuracy": 0.8236530691385269, "step": 1335 }, { "epoch": 0.5675561202880135, "grad_norm": 0.9179664771961787, "learning_rate": 9.393943665948478e-06, "loss": 0.4763, "mean_token_accuracy": 0.8244054973125458, "step": 1340 }, { "epoch": 0.5696738670055062, "grad_norm": 1.1777867866273308, "learning_rate": 9.320142227129158e-06, "loss": 0.4348, "mean_token_accuracy": 0.8331925332546234, "step": 1345 }, { "epoch": 0.5717916137229987, "grad_norm": 1.0020743360016087, "learning_rate": 9.246377971514504e-06, "loss": 0.4161, "mean_token_accuracy": 0.8360674440860748, "step": 1350 }, { "epoch": 0.5739093604404913, "grad_norm": 1.346066080223308, "learning_rate": 9.172654933465114e-06, "loss": 0.448, "mean_token_accuracy": 0.8250635206699372, "step": 1355 }, { "epoch": 0.5760271071579839, "grad_norm": 1.3221207747875352, "learning_rate": 9.0989771450873e-06, "loss": 0.4228, "mean_token_accuracy": 0.8357968628406525, "step": 1360 }, { "epoch": 0.5781448538754765, "grad_norm": 1.1501989319658534, "learning_rate": 9.025348636012537e-06, "loss": 0.4411, "mean_token_accuracy": 0.8290417343378067, "step": 1365 }, { "epoch": 0.5802626005929691, "grad_norm": 1.1694331116554113, "learning_rate": 8.951773433177095e-06, "loss": 0.4343, "mean_token_accuracy": 0.8303040146827698, "step": 1370 }, { "epoch": 0.5823803473104616, "grad_norm": 1.2089472872967426, "learning_rate": 8.878255560601781e-06, "loss": 0.4285, "mean_token_accuracy": 0.8339911371469497, "step": 1375 }, { "epoch": 0.5844980940279543, "grad_norm": 1.1555334960481487, "learning_rate": 8.804799039171863e-06, "loss": 0.4225, "mean_token_accuracy": 0.8346673488616944, "step": 1380 }, { "epoch": 0.5866158407454468, "grad_norm": 0.9976941601020334, "learning_rate": 8.731407886417155e-06, "loss": 0.4538, "mean_token_accuracy": 0.8272438108921051, "step": 1385 }, { "epoch": 0.5887335874629395, "grad_norm": 1.0977726966561636, "learning_rate": 8.658086116292283e-06, "loss": 0.4297, "mean_token_accuracy": 0.8334219962358475, "step": 1390 }, { "epoch": 0.590851334180432, "grad_norm": 2.0194878160007987, "learning_rate": 8.584837738957155e-06, "loss": 0.4413, "mean_token_accuracy": 0.8283408343791961, "step": 1395 }, { "epoch": 0.5929690808979247, "grad_norm": 1.2186719145281468, "learning_rate": 8.511666760557638e-06, "loss": 0.4693, "mean_token_accuracy": 0.8232256740331649, "step": 1400 }, { "epoch": 0.5950868276154172, "grad_norm": 1.1198588684752515, "learning_rate": 8.438577183006448e-06, "loss": 0.4221, "mean_token_accuracy": 0.8324928849935531, "step": 1405 }, { "epoch": 0.5972045743329097, "grad_norm": 1.1215071963961742, "learning_rate": 8.36557300376427e-06, "loss": 0.4392, "mean_token_accuracy": 0.8286356210708619, "step": 1410 }, { "epoch": 0.5993223210504024, "grad_norm": 1.107475266800191, "learning_rate": 8.292658215621139e-06, "loss": 0.4344, "mean_token_accuracy": 0.8313880443572998, "step": 1415 }, { "epoch": 0.6014400677678949, "grad_norm": 1.1686631557802003, "learning_rate": 8.219836806478049e-06, "loss": 0.4336, "mean_token_accuracy": 0.8312123149633408, "step": 1420 }, { "epoch": 0.6035578144853876, "grad_norm": 1.230978585871069, "learning_rate": 8.147112759128859e-06, "loss": 0.4647, "mean_token_accuracy": 0.8231993585824966, "step": 1425 }, { "epoch": 0.6056755612028801, "grad_norm": 1.0717890273842352, "learning_rate": 8.074490051042447e-06, "loss": 0.4353, "mean_token_accuracy": 0.8321529895067215, "step": 1430 }, { "epoch": 0.6077933079203727, "grad_norm": 1.085108371368418, "learning_rate": 8.001972654145194e-06, "loss": 0.4415, "mean_token_accuracy": 0.8277548223733902, "step": 1435 }, { "epoch": 0.6099110546378653, "grad_norm": 1.2119593900205077, "learning_rate": 7.929564534603722e-06, "loss": 0.4571, "mean_token_accuracy": 0.8255878984928131, "step": 1440 }, { "epoch": 0.6120288013553579, "grad_norm": 1.1055437345283827, "learning_rate": 7.857269652607995e-06, "loss": 0.4406, "mean_token_accuracy": 0.8275179982185363, "step": 1445 }, { "epoch": 0.6141465480728505, "grad_norm": 1.1275451956189597, "learning_rate": 7.78509196215472e-06, "loss": 0.4308, "mean_token_accuracy": 0.8301453530788422, "step": 1450 }, { "epoch": 0.6162642947903431, "grad_norm": 1.2886494426253579, "learning_rate": 7.713035410831086e-06, "loss": 0.4573, "mean_token_accuracy": 0.8251194447278977, "step": 1455 }, { "epoch": 0.6183820415078357, "grad_norm": 1.1109768793864798, "learning_rate": 7.64110393959887e-06, "loss": 0.4279, "mean_token_accuracy": 0.8380070447921752, "step": 1460 }, { "epoch": 0.6204997882253283, "grad_norm": 1.0182035864318235, "learning_rate": 7.569301482578885e-06, "loss": 0.4281, "mean_token_accuracy": 0.8316156834363937, "step": 1465 }, { "epoch": 0.6226175349428208, "grad_norm": 1.2074345207100396, "learning_rate": 7.497631966835828e-06, "loss": 0.4527, "mean_token_accuracy": 0.8231601238250732, "step": 1470 }, { "epoch": 0.6247352816603134, "grad_norm": 0.991329003303421, "learning_rate": 7.42609931216348e-06, "loss": 0.442, "mean_token_accuracy": 0.8327670186758042, "step": 1475 }, { "epoch": 0.626853028377806, "grad_norm": 1.38024365126256, "learning_rate": 7.354707430870332e-06, "loss": 0.4335, "mean_token_accuracy": 0.8324557185173035, "step": 1480 }, { "epoch": 0.6289707750952986, "grad_norm": 1.2263457500699402, "learning_rate": 7.283460227565614e-06, "loss": 0.4289, "mean_token_accuracy": 0.8289420217275619, "step": 1485 }, { "epoch": 0.6310885218127912, "grad_norm": 1.1601375730316865, "learning_rate": 7.2123615989457364e-06, "loss": 0.4465, "mean_token_accuracy": 0.832300814986229, "step": 1490 }, { "epoch": 0.6332062685302838, "grad_norm": 1.3029839142463893, "learning_rate": 7.141415433581169e-06, "loss": 0.4167, "mean_token_accuracy": 0.8393772184848786, "step": 1495 }, { "epoch": 0.6353240152477764, "grad_norm": 1.0421344337402514, "learning_rate": 7.070625611703762e-06, "loss": 0.4537, "mean_token_accuracy": 0.8257811456918717, "step": 1500 }, { "epoch": 0.6374417619652689, "grad_norm": 1.1352186472493642, "learning_rate": 6.9999960049945406e-06, "loss": 0.4227, "mean_token_accuracy": 0.8368300348520279, "step": 1505 }, { "epoch": 0.6395595086827616, "grad_norm": 0.9884985072070904, "learning_rate": 6.929530476371935e-06, "loss": 0.4189, "mean_token_accuracy": 0.8349219173192978, "step": 1510 }, { "epoch": 0.6416772554002541, "grad_norm": 1.7766008455284357, "learning_rate": 6.859232879780515e-06, "loss": 0.4288, "mean_token_accuracy": 0.8374936401844024, "step": 1515 }, { "epoch": 0.6437950021177468, "grad_norm": 1.012934970024209, "learning_rate": 6.7891070599802045e-06, "loss": 0.4549, "mean_token_accuracy": 0.8239244252443314, "step": 1520 }, { "epoch": 0.6459127488352393, "grad_norm": 0.9859441855867837, "learning_rate": 6.719156852336015e-06, "loss": 0.4293, "mean_token_accuracy": 0.8353272944688797, "step": 1525 }, { "epoch": 0.6480304955527318, "grad_norm": 1.261329902420831, "learning_rate": 6.649386082608256e-06, "loss": 0.428, "mean_token_accuracy": 0.8329044044017792, "step": 1530 }, { "epoch": 0.6501482422702245, "grad_norm": 1.2457535519058567, "learning_rate": 6.579798566743314e-06, "loss": 0.4324, "mean_token_accuracy": 0.8307075470685958, "step": 1535 }, { "epoch": 0.652265988987717, "grad_norm": 1.213114456712863, "learning_rate": 6.510398110664939e-06, "loss": 0.4223, "mean_token_accuracy": 0.8351607590913772, "step": 1540 }, { "epoch": 0.6543837357052097, "grad_norm": 1.155264435257233, "learning_rate": 6.441188510066092e-06, "loss": 0.4207, "mean_token_accuracy": 0.8374445289373398, "step": 1545 }, { "epoch": 0.6565014824227022, "grad_norm": 1.1756119576548756, "learning_rate": 6.372173550201346e-06, "loss": 0.4119, "mean_token_accuracy": 0.8390755444765091, "step": 1550 }, { "epoch": 0.6586192291401949, "grad_norm": 1.0243897900651528, "learning_rate": 6.303357005679858e-06, "loss": 0.4478, "mean_token_accuracy": 0.8277173846960068, "step": 1555 }, { "epoch": 0.6607369758576874, "grad_norm": 1.0868676429874986, "learning_rate": 6.234742640258938e-06, "loss": 0.4552, "mean_token_accuracy": 0.827509269118309, "step": 1560 }, { "epoch": 0.66285472257518, "grad_norm": 1.1792649536698685, "learning_rate": 6.166334206638186e-06, "loss": 0.4396, "mean_token_accuracy": 0.8288001954555512, "step": 1565 }, { "epoch": 0.6649724692926726, "grad_norm": 1.171894663481444, "learning_rate": 6.0981354462542456e-06, "loss": 0.4365, "mean_token_accuracy": 0.8315492898225785, "step": 1570 }, { "epoch": 0.6670902160101652, "grad_norm": 1.1333037764256397, "learning_rate": 6.030150089076199e-06, "loss": 0.4319, "mean_token_accuracy": 0.8316318243741989, "step": 1575 }, { "epoch": 0.6692079627276578, "grad_norm": 1.1892286300854609, "learning_rate": 5.9623818534015275e-06, "loss": 0.4275, "mean_token_accuracy": 0.8352140128612519, "step": 1580 }, { "epoch": 0.6713257094451504, "grad_norm": 4.250523219515856, "learning_rate": 5.894834445652777e-06, "loss": 0.411, "mean_token_accuracy": 0.8329778879880905, "step": 1585 }, { "epoch": 0.6734434561626429, "grad_norm": 1.157008090047474, "learning_rate": 5.827511560174835e-06, "loss": 0.4242, "mean_token_accuracy": 0.832972839474678, "step": 1590 }, { "epoch": 0.6755612028801355, "grad_norm": 1.1834078816860993, "learning_rate": 5.7604168790328774e-06, "loss": 0.3931, "mean_token_accuracy": 0.8443128287792205, "step": 1595 }, { "epoch": 0.6776789495976281, "grad_norm": 1.0766345733639675, "learning_rate": 5.693554071810987e-06, "loss": 0.4478, "mean_token_accuracy": 0.8282081812620163, "step": 1600 }, { "epoch": 0.6797966963151207, "grad_norm": 1.0314594529031804, "learning_rate": 5.626926795411447e-06, "loss": 0.4246, "mean_token_accuracy": 0.8321157455444336, "step": 1605 }, { "epoch": 0.6819144430326133, "grad_norm": 1.055274137880832, "learning_rate": 5.560538693854751e-06, "loss": 0.4193, "mean_token_accuracy": 0.8316533505916596, "step": 1610 }, { "epoch": 0.6840321897501059, "grad_norm": 1.1972782090907812, "learning_rate": 5.494393398080292e-06, "loss": 0.4313, "mean_token_accuracy": 0.834712353348732, "step": 1615 }, { "epoch": 0.6861499364675985, "grad_norm": 1.0962501568970522, "learning_rate": 5.428494525747769e-06, "loss": 0.4597, "mean_token_accuracy": 0.8248083680868149, "step": 1620 }, { "epoch": 0.688267683185091, "grad_norm": 1.0751444988160856, "learning_rate": 5.362845681039348e-06, "loss": 0.4321, "mean_token_accuracy": 0.8374727904796601, "step": 1625 }, { "epoch": 0.6903854299025837, "grad_norm": 1.1471090324016462, "learning_rate": 5.297450454462526e-06, "loss": 0.4328, "mean_token_accuracy": 0.8296476870775222, "step": 1630 }, { "epoch": 0.6925031766200762, "grad_norm": 0.962534660265453, "learning_rate": 5.23231242265375e-06, "loss": 0.4181, "mean_token_accuracy": 0.83418510556221, "step": 1635 }, { "epoch": 0.6946209233375689, "grad_norm": 1.1168651450432128, "learning_rate": 5.167435148182824e-06, "loss": 0.4176, "mean_token_accuracy": 0.8372534781694412, "step": 1640 }, { "epoch": 0.6967386700550614, "grad_norm": 1.2186341287706137, "learning_rate": 5.102822179358037e-06, "loss": 0.4075, "mean_token_accuracy": 0.8409687280654907, "step": 1645 }, { "epoch": 0.6988564167725541, "grad_norm": 0.9820636174800459, "learning_rate": 5.0384770500321175e-06, "loss": 0.4128, "mean_token_accuracy": 0.8384972155094147, "step": 1650 }, { "epoch": 0.7009741634900466, "grad_norm": 0.943830506781205, "learning_rate": 4.97440327940895e-06, "loss": 0.4027, "mean_token_accuracy": 0.8365049093961716, "step": 1655 }, { "epoch": 0.7030919102075391, "grad_norm": 1.0574783345670844, "learning_rate": 4.910604371851091e-06, "loss": 0.4308, "mean_token_accuracy": 0.8333552926778793, "step": 1660 }, { "epoch": 0.7052096569250318, "grad_norm": 1.103380699456734, "learning_rate": 4.847083816688123e-06, "loss": 0.412, "mean_token_accuracy": 0.8425119102001191, "step": 1665 }, { "epoch": 0.7073274036425243, "grad_norm": 1.117253769501395, "learning_rate": 4.783845088025807e-06, "loss": 0.4346, "mean_token_accuracy": 0.8330845534801483, "step": 1670 }, { "epoch": 0.709445150360017, "grad_norm": 1.4108563780024128, "learning_rate": 4.7208916445560625e-06, "loss": 0.414, "mean_token_accuracy": 0.8379091322422028, "step": 1675 }, { "epoch": 0.7115628970775095, "grad_norm": 1.031565575748758, "learning_rate": 4.658226929367826e-06, "loss": 0.4598, "mean_token_accuracy": 0.8240681082010269, "step": 1680 }, { "epoch": 0.7136806437950021, "grad_norm": 1.2248996065912452, "learning_rate": 4.595854369758727e-06, "loss": 0.4299, "mean_token_accuracy": 0.8363937050104141, "step": 1685 }, { "epoch": 0.7157983905124947, "grad_norm": 1.1049025661918381, "learning_rate": 4.5337773770476245e-06, "loss": 0.4273, "mean_token_accuracy": 0.8340339243412018, "step": 1690 }, { "epoch": 0.7179161372299873, "grad_norm": 1.1244170950870136, "learning_rate": 4.4719993463880695e-06, "loss": 0.4571, "mean_token_accuracy": 0.8225684702396393, "step": 1695 }, { "epoch": 0.7200338839474799, "grad_norm": 1.1969285633316296, "learning_rate": 4.410523656582576e-06, "loss": 0.4025, "mean_token_accuracy": 0.8440569192171097, "step": 1700 }, { "epoch": 0.7221516306649725, "grad_norm": 1.122866308313561, "learning_rate": 4.349353669897856e-06, "loss": 0.4208, "mean_token_accuracy": 0.837623131275177, "step": 1705 }, { "epoch": 0.7242693773824651, "grad_norm": 1.0173115464088704, "learning_rate": 4.288492731880917e-06, "loss": 0.4148, "mean_token_accuracy": 0.8388867497444152, "step": 1710 }, { "epoch": 0.7263871240999576, "grad_norm": 1.1018457774189827, "learning_rate": 4.227944171176072e-06, "loss": 0.4003, "mean_token_accuracy": 0.8392677456140518, "step": 1715 }, { "epoch": 0.7285048708174502, "grad_norm": 1.2471156860459571, "learning_rate": 4.167711299342909e-06, "loss": 0.4459, "mean_token_accuracy": 0.8256678134202957, "step": 1720 }, { "epoch": 0.7306226175349428, "grad_norm": 1.1273568017592417, "learning_rate": 4.107797410675166e-06, "loss": 0.4068, "mean_token_accuracy": 0.8386416286230087, "step": 1725 }, { "epoch": 0.7327403642524354, "grad_norm": 1.20918067568615, "learning_rate": 4.048205782020544e-06, "loss": 0.4539, "mean_token_accuracy": 0.8220532357692718, "step": 1730 }, { "epoch": 0.734858110969928, "grad_norm": 1.1573583276355073, "learning_rate": 3.988939672601509e-06, "loss": 0.395, "mean_token_accuracy": 0.844212406873703, "step": 1735 }, { "epoch": 0.7369758576874206, "grad_norm": 1.1516374922245958, "learning_rate": 3.930002323837026e-06, "loss": 0.4251, "mean_token_accuracy": 0.8371291518211365, "step": 1740 }, { "epoch": 0.7390936044049131, "grad_norm": 1.274643963255776, "learning_rate": 3.871396959165267e-06, "loss": 0.429, "mean_token_accuracy": 0.8348165363073349, "step": 1745 }, { "epoch": 0.7412113511224058, "grad_norm": 1.025583507042276, "learning_rate": 3.8131267838673336e-06, "loss": 0.4262, "mean_token_accuracy": 0.8343986541032791, "step": 1750 }, { "epoch": 0.7433290978398983, "grad_norm": 1.1299748085754966, "learning_rate": 3.755194984891943e-06, "loss": 0.4081, "mean_token_accuracy": 0.8430469453334808, "step": 1755 }, { "epoch": 0.745446844557391, "grad_norm": 1.0603027089656643, "learning_rate": 3.6976047306811115e-06, "loss": 0.4256, "mean_token_accuracy": 0.8382641762495041, "step": 1760 }, { "epoch": 0.7475645912748835, "grad_norm": 1.1281590494510496, "learning_rate": 3.6403591709968924e-06, "loss": 0.4357, "mean_token_accuracy": 0.8320927768945694, "step": 1765 }, { "epoch": 0.7496823379923762, "grad_norm": 1.0367839611389602, "learning_rate": 3.5834614367490706e-06, "loss": 0.4221, "mean_token_accuracy": 0.835366889834404, "step": 1770 }, { "epoch": 0.7518000847098687, "grad_norm": 1.0958827736818129, "learning_rate": 3.526914639823973e-06, "loss": 0.4381, "mean_token_accuracy": 0.8301591634750366, "step": 1775 }, { "epoch": 0.7539178314273612, "grad_norm": 1.0559223618431266, "learning_rate": 3.4707218729142224e-06, "loss": 0.4291, "mean_token_accuracy": 0.8316712707281113, "step": 1780 }, { "epoch": 0.7560355781448539, "grad_norm": 1.0792688197107765, "learning_rate": 3.414886209349615e-06, "loss": 0.4269, "mean_token_accuracy": 0.835688841342926, "step": 1785 }, { "epoch": 0.7581533248623464, "grad_norm": 1.1979681287726258, "learning_rate": 3.3594107029290347e-06, "loss": 0.4269, "mean_token_accuracy": 0.8371979027986527, "step": 1790 }, { "epoch": 0.7602710715798391, "grad_norm": 1.1468783022113433, "learning_rate": 3.304298387753426e-06, "loss": 0.4311, "mean_token_accuracy": 0.8341523915529251, "step": 1795 }, { "epoch": 0.7623888182973316, "grad_norm": 1.142335742385377, "learning_rate": 3.2495522780598442e-06, "loss": 0.4174, "mean_token_accuracy": 0.8298469454050064, "step": 1800 }, { "epoch": 0.7645065650148243, "grad_norm": 1.1968773332651736, "learning_rate": 3.1951753680566143e-06, "loss": 0.4383, "mean_token_accuracy": 0.8313175171613694, "step": 1805 }, { "epoch": 0.7666243117323168, "grad_norm": 1.0804618708583653, "learning_rate": 3.141170631759558e-06, "loss": 0.4086, "mean_token_accuracy": 0.8373444229364395, "step": 1810 }, { "epoch": 0.7687420584498094, "grad_norm": 1.0872538790077677, "learning_rate": 3.087541022829347e-06, "loss": 0.4221, "mean_token_accuracy": 0.8371105402708053, "step": 1815 }, { "epoch": 0.770859805167302, "grad_norm": 0.9905135006363225, "learning_rate": 3.034289474409943e-06, "loss": 0.4133, "mean_token_accuracy": 0.8365035742521286, "step": 1820 }, { "epoch": 0.7729775518847946, "grad_norm": 1.0890914888672922, "learning_rate": 2.981418898968186e-06, "loss": 0.4189, "mean_token_accuracy": 0.838862606883049, "step": 1825 }, { "epoch": 0.7750952986022872, "grad_norm": 1.1417209565486737, "learning_rate": 2.9289321881345257e-06, "loss": 0.4169, "mean_token_accuracy": 0.833648070693016, "step": 1830 }, { "epoch": 0.7772130453197797, "grad_norm": 1.1684616910176908, "learning_rate": 2.8768322125448265e-06, "loss": 0.4469, "mean_token_accuracy": 0.83056038916111, "step": 1835 }, { "epoch": 0.7793307920372723, "grad_norm": 1.1845681597767028, "learning_rate": 2.825121821683391e-06, "loss": 0.4223, "mean_token_accuracy": 0.8353413581848145, "step": 1840 }, { "epoch": 0.7814485387547649, "grad_norm": 1.1732126933903428, "learning_rate": 2.7738038437271288e-06, "loss": 0.4121, "mean_token_accuracy": 0.842677703499794, "step": 1845 }, { "epoch": 0.7835662854722575, "grad_norm": 1.0292583187860371, "learning_rate": 2.7228810853908406e-06, "loss": 0.3921, "mean_token_accuracy": 0.8447476714849472, "step": 1850 }, { "epoch": 0.7856840321897501, "grad_norm": 0.9892030702997285, "learning_rate": 2.67235633177373e-06, "loss": 0.4387, "mean_token_accuracy": 0.8288900941610337, "step": 1855 }, { "epoch": 0.7878017789072427, "grad_norm": 1.0050687986582967, "learning_rate": 2.6222323462070897e-06, "loss": 0.4356, "mean_token_accuracy": 0.828187745809555, "step": 1860 }, { "epoch": 0.7899195256247353, "grad_norm": 1.1304197153376732, "learning_rate": 2.572511870103149e-06, "loss": 0.4125, "mean_token_accuracy": 0.8425087302923202, "step": 1865 }, { "epoch": 0.7920372723422279, "grad_norm": 1.0444576639344187, "learning_rate": 2.5231976228051526e-06, "loss": 0.4318, "mean_token_accuracy": 0.8337043792009353, "step": 1870 }, { "epoch": 0.7941550190597204, "grad_norm": 1.0875080317220023, "learning_rate": 2.4742923014386154e-06, "loss": 0.4287, "mean_token_accuracy": 0.8368022799491882, "step": 1875 }, { "epoch": 0.7962727657772131, "grad_norm": 1.1517129093084153, "learning_rate": 2.4257985807638294e-06, "loss": 0.4284, "mean_token_accuracy": 0.8356128752231597, "step": 1880 }, { "epoch": 0.7983905124947056, "grad_norm": 1.2213468844119533, "learning_rate": 2.3777191130295673e-06, "loss": 0.411, "mean_token_accuracy": 0.8373890697956086, "step": 1885 }, { "epoch": 0.8005082592121983, "grad_norm": 1.1105462272187794, "learning_rate": 2.330056527828013e-06, "loss": 0.4549, "mean_token_accuracy": 0.8282926619052887, "step": 1890 }, { "epoch": 0.8026260059296908, "grad_norm": 1.1626653178571262, "learning_rate": 2.282813431950952e-06, "loss": 0.4295, "mean_token_accuracy": 0.8333282887935638, "step": 1895 }, { "epoch": 0.8047437526471835, "grad_norm": 1.1195581942328177, "learning_rate": 2.235992409247214e-06, "loss": 0.4338, "mean_token_accuracy": 0.8319763302803039, "step": 1900 }, { "epoch": 0.806861499364676, "grad_norm": 1.026868168904022, "learning_rate": 2.1895960204813194e-06, "loss": 0.4118, "mean_token_accuracy": 0.8370046824216842, "step": 1905 }, { "epoch": 0.8089792460821685, "grad_norm": 1.0639569641896143, "learning_rate": 2.1436268031934602e-06, "loss": 0.4411, "mean_token_accuracy": 0.8297486454248428, "step": 1910 }, { "epoch": 0.8110969927996612, "grad_norm": 1.0385740186847223, "learning_rate": 2.098087271560687e-06, "loss": 0.4152, "mean_token_accuracy": 0.8370089381933212, "step": 1915 }, { "epoch": 0.8132147395171537, "grad_norm": 1.1169845772777505, "learning_rate": 2.0529799162594242e-06, "loss": 0.4094, "mean_token_accuracy": 0.839673039317131, "step": 1920 }, { "epoch": 0.8153324862346464, "grad_norm": 1.0745546751170598, "learning_rate": 2.0083072043292406e-06, "loss": 0.417, "mean_token_accuracy": 0.8379459470510483, "step": 1925 }, { "epoch": 0.8174502329521389, "grad_norm": 1.206429363415916, "learning_rate": 1.9640715790379084e-06, "loss": 0.4133, "mean_token_accuracy": 0.8345289677381516, "step": 1930 }, { "epoch": 0.8195679796696315, "grad_norm": 1.0452292636568519, "learning_rate": 1.920275459747796e-06, "loss": 0.4123, "mean_token_accuracy": 0.8368586808443069, "step": 1935 }, { "epoch": 0.8216857263871241, "grad_norm": 1.0706189800647066, "learning_rate": 1.8769212417835314e-06, "loss": 0.3773, "mean_token_accuracy": 0.8513321369886399, "step": 1940 }, { "epoch": 0.8238034731046167, "grad_norm": 1.0974535637452612, "learning_rate": 1.8340112963009993e-06, "loss": 0.4353, "mean_token_accuracy": 0.8337271898984909, "step": 1945 }, { "epoch": 0.8259212198221093, "grad_norm": 1.0867209847341632, "learning_rate": 1.7915479701576577e-06, "loss": 0.4489, "mean_token_accuracy": 0.8291646331548691, "step": 1950 }, { "epoch": 0.8280389665396019, "grad_norm": 1.1993569416921062, "learning_rate": 1.7495335857841855e-06, "loss": 0.4138, "mean_token_accuracy": 0.8385995358228684, "step": 1955 }, { "epoch": 0.8301567132570945, "grad_norm": 1.1414883228473476, "learning_rate": 1.7079704410574505e-06, "loss": 0.3859, "mean_token_accuracy": 0.8459228605031968, "step": 1960 }, { "epoch": 0.832274459974587, "grad_norm": 1.048311577922366, "learning_rate": 1.6668608091748495e-06, "loss": 0.426, "mean_token_accuracy": 0.8357879251241684, "step": 1965 }, { "epoch": 0.8343922066920796, "grad_norm": 1.0617438922962255, "learning_rate": 1.6262069385299694e-06, "loss": 0.4334, "mean_token_accuracy": 0.8343731433153152, "step": 1970 }, { "epoch": 0.8365099534095722, "grad_norm": 1.1279209328755353, "learning_rate": 1.5860110525896143e-06, "loss": 0.4197, "mean_token_accuracy": 0.835442116856575, "step": 1975 }, { "epoch": 0.8386277001270648, "grad_norm": 0.9640338154076892, "learning_rate": 1.5462753497722139e-06, "loss": 0.4228, "mean_token_accuracy": 0.8363285154104233, "step": 1980 }, { "epoch": 0.8407454468445574, "grad_norm": 1.065476222817932, "learning_rate": 1.5070020033275655e-06, "loss": 0.3954, "mean_token_accuracy": 0.8427035689353943, "step": 1985 }, { "epoch": 0.84286319356205, "grad_norm": 1.055480105683973, "learning_rate": 1.4681931612179901e-06, "loss": 0.4289, "mean_token_accuracy": 0.8340502351522445, "step": 1990 }, { "epoch": 0.8449809402795425, "grad_norm": 1.0690985831761302, "learning_rate": 1.4298509460008491e-06, "loss": 0.4072, "mean_token_accuracy": 0.8402904689311981, "step": 1995 }, { "epoch": 0.8470986869970352, "grad_norm": 1.0063164183000968, "learning_rate": 1.39197745471245e-06, "loss": 0.4231, "mean_token_accuracy": 0.8361636906862259, "step": 2000 }, { "epoch": 0.8492164337145277, "grad_norm": 1.0247616494577987, "learning_rate": 1.354574758753363e-06, "loss": 0.4189, "mean_token_accuracy": 0.8310322672128677, "step": 2005 }, { "epoch": 0.8513341804320204, "grad_norm": 1.044860588344852, "learning_rate": 1.3176449037751294e-06, "loss": 0.4404, "mean_token_accuracy": 0.8303707420825959, "step": 2010 }, { "epoch": 0.8534519271495129, "grad_norm": 2.4537559889629694, "learning_rate": 1.28118990956837e-06, "loss": 0.4104, "mean_token_accuracy": 0.835821408033371, "step": 2015 }, { "epoch": 0.8555696738670056, "grad_norm": 1.0972489520800874, "learning_rate": 1.2452117699523303e-06, "loss": 0.4027, "mean_token_accuracy": 0.8460766285657882, "step": 2020 }, { "epoch": 0.8576874205844981, "grad_norm": 1.2309045137234433, "learning_rate": 1.2097124526658277e-06, "loss": 0.419, "mean_token_accuracy": 0.8366678208112717, "step": 2025 }, { "epoch": 0.8598051673019906, "grad_norm": 1.0849048269411365, "learning_rate": 1.1746938992596257e-06, "loss": 0.4174, "mean_token_accuracy": 0.8296289384365082, "step": 2030 }, { "epoch": 0.8619229140194833, "grad_norm": 0.989974221522167, "learning_rate": 1.1401580249902566e-06, "loss": 0.4153, "mean_token_accuracy": 0.8379861056804657, "step": 2035 }, { "epoch": 0.8640406607369758, "grad_norm": 1.0066596115748891, "learning_rate": 1.1061067187152584e-06, "loss": 0.4041, "mean_token_accuracy": 0.8417060792446136, "step": 2040 }, { "epoch": 0.8661584074544685, "grad_norm": 1.0526259070425423, "learning_rate": 1.0725418427898792e-06, "loss": 0.4099, "mean_token_accuracy": 0.8398545056581497, "step": 2045 }, { "epoch": 0.868276154171961, "grad_norm": 1.134470581551777, "learning_rate": 1.0394652329652165e-06, "loss": 0.4146, "mean_token_accuracy": 0.8354752600193024, "step": 2050 }, { "epoch": 0.8703939008894537, "grad_norm": 1.130864865166622, "learning_rate": 1.0068786982878087e-06, "loss": 0.418, "mean_token_accuracy": 0.8398678600788116, "step": 2055 }, { "epoch": 0.8725116476069462, "grad_norm": 1.1500087879964977, "learning_rate": 9.747840210007021e-07, "loss": 0.4157, "mean_token_accuracy": 0.8322781622409821, "step": 2060 }, { "epoch": 0.8746293943244388, "grad_norm": 0.9770307768209092, "learning_rate": 9.43182956445976e-07, "loss": 0.3977, "mean_token_accuracy": 0.8416966944932938, "step": 2065 }, { "epoch": 0.8767471410419314, "grad_norm": 1.2583818143393242, "learning_rate": 9.120772329687278e-07, "loss": 0.4251, "mean_token_accuracy": 0.8354076951742172, "step": 2070 }, { "epoch": 0.878864887759424, "grad_norm": 1.0618576291439479, "learning_rate": 8.814685518225552e-07, "loss": 0.4291, "mean_token_accuracy": 0.8308704495429993, "step": 2075 }, { "epoch": 0.8809826344769166, "grad_norm": 1.1180367611245425, "learning_rate": 8.513585870765118e-07, "loss": 0.3907, "mean_token_accuracy": 0.8452890306711197, "step": 2080 }, { "epoch": 0.8831003811944091, "grad_norm": 1.230123212504081, "learning_rate": 8.217489855235338e-07, "loss": 0.4144, "mean_token_accuracy": 0.8392110764980316, "step": 2085 }, { "epoch": 0.8852181279119017, "grad_norm": 1.1108948475484288, "learning_rate": 7.926413665903931e-07, "loss": 0.4151, "mean_token_accuracy": 0.8380868971347809, "step": 2090 }, { "epoch": 0.8873358746293943, "grad_norm": 1.098761542271965, "learning_rate": 7.640373222491038e-07, "loss": 0.4196, "mean_token_accuracy": 0.8407029449939728, "step": 2095 }, { "epoch": 0.8894536213468869, "grad_norm": 1.0940803341605705, "learning_rate": 7.359384169298744e-07, "loss": 0.4097, "mean_token_accuracy": 0.8401619613170623, "step": 2100 }, { "epoch": 0.8915713680643795, "grad_norm": 0.9066347453646844, "learning_rate": 7.083461874355335e-07, "loss": 0.4257, "mean_token_accuracy": 0.8362819194793701, "step": 2105 }, { "epoch": 0.8936891147818721, "grad_norm": 1.0448023766882066, "learning_rate": 6.81262142857475e-07, "loss": 0.3898, "mean_token_accuracy": 0.8459620922803879, "step": 2110 }, { "epoch": 0.8958068614993647, "grad_norm": 1.0611643496346475, "learning_rate": 6.546877644931315e-07, "loss": 0.4208, "mean_token_accuracy": 0.8312031596899032, "step": 2115 }, { "epoch": 0.8979246082168573, "grad_norm": 1.1224663985096108, "learning_rate": 6.286245057649542e-07, "loss": 0.3994, "mean_token_accuracy": 0.8465497404336929, "step": 2120 }, { "epoch": 0.9000423549343498, "grad_norm": 1.0832056476567533, "learning_rate": 6.030737921409169e-07, "loss": 0.3867, "mean_token_accuracy": 0.8440623044967651, "step": 2125 }, { "epoch": 0.9021601016518425, "grad_norm": 1.0523110523954844, "learning_rate": 5.7803702105656e-07, "loss": 0.4127, "mean_token_accuracy": 0.8366563141345977, "step": 2130 }, { "epoch": 0.904277848369335, "grad_norm": 1.0105232913792406, "learning_rate": 5.535155618385612e-07, "loss": 0.4195, "mean_token_accuracy": 0.8335390537977219, "step": 2135 }, { "epoch": 0.9063955950868277, "grad_norm": 1.1129917485868344, "learning_rate": 5.295107556298329e-07, "loss": 0.3928, "mean_token_accuracy": 0.8431670844554902, "step": 2140 }, { "epoch": 0.9085133418043202, "grad_norm": 1.145719574659648, "learning_rate": 5.060239153161872e-07, "loss": 0.4019, "mean_token_accuracy": 0.8419764310121536, "step": 2145 }, { "epoch": 0.9106310885218127, "grad_norm": 1.443628282269306, "learning_rate": 4.830563254545207e-07, "loss": 0.4233, "mean_token_accuracy": 0.8361739784479141, "step": 2150 }, { "epoch": 0.9127488352393054, "grad_norm": 1.1691030241329559, "learning_rate": 4.6060924220255654e-07, "loss": 0.4257, "mean_token_accuracy": 0.8305665761232376, "step": 2155 }, { "epoch": 0.9148665819567979, "grad_norm": 1.2424085660240223, "learning_rate": 4.386838932501547e-07, "loss": 0.4303, "mean_token_accuracy": 0.8358988225460052, "step": 2160 }, { "epoch": 0.9169843286742906, "grad_norm": 1.0258262640063769, "learning_rate": 4.172814777521483e-07, "loss": 0.4298, "mean_token_accuracy": 0.8366893321275711, "step": 2165 }, { "epoch": 0.9191020753917831, "grad_norm": 1.0932401792673323, "learning_rate": 3.9640316626277654e-07, "loss": 0.4172, "mean_token_accuracy": 0.836585283279419, "step": 2170 }, { "epoch": 0.9212198221092758, "grad_norm": 1.0881178279493329, "learning_rate": 3.7605010067165216e-07, "loss": 0.42, "mean_token_accuracy": 0.8352493315935134, "step": 2175 }, { "epoch": 0.9233375688267683, "grad_norm": 1.057750886441079, "learning_rate": 3.562233941413096e-07, "loss": 0.3975, "mean_token_accuracy": 0.8412194460630417, "step": 2180 }, { "epoch": 0.9254553155442609, "grad_norm": 1.1056774030421723, "learning_rate": 3.3692413104633226e-07, "loss": 0.3976, "mean_token_accuracy": 0.840697067975998, "step": 2185 }, { "epoch": 0.9275730622617535, "grad_norm": 1.163101779598673, "learning_rate": 3.1815336691403464e-07, "loss": 0.3751, "mean_token_accuracy": 0.8496327966451644, "step": 2190 }, { "epoch": 0.929690808979246, "grad_norm": 0.9755793569303719, "learning_rate": 2.999121283667339e-07, "loss": 0.4079, "mean_token_accuracy": 0.8418219208717346, "step": 2195 }, { "epoch": 0.9318085556967387, "grad_norm": 1.021358583461123, "learning_rate": 2.8220141306561034e-07, "loss": 0.4186, "mean_token_accuracy": 0.8352805793285369, "step": 2200 }, { "epoch": 0.9339263024142312, "grad_norm": 1.0396837778560488, "learning_rate": 2.6502218965613335e-07, "loss": 0.4225, "mean_token_accuracy": 0.8338442891836166, "step": 2205 }, { "epoch": 0.9360440491317239, "grad_norm": 1.1742052357618658, "learning_rate": 2.483753977150882e-07, "loss": 0.4067, "mean_token_accuracy": 0.8387827515602112, "step": 2210 }, { "epoch": 0.9381617958492164, "grad_norm": 1.0739901995137444, "learning_rate": 2.3226194769918497e-07, "loss": 0.4041, "mean_token_accuracy": 0.837730023264885, "step": 2215 }, { "epoch": 0.940279542566709, "grad_norm": 1.0246012489566791, "learning_rate": 2.1668272089526377e-07, "loss": 0.4161, "mean_token_accuracy": 0.8399739652872086, "step": 2220 }, { "epoch": 0.9423972892842016, "grad_norm": 1.0463273467785923, "learning_rate": 2.0163856937210236e-07, "loss": 0.4245, "mean_token_accuracy": 0.8379955619573594, "step": 2225 }, { "epoch": 0.9445150360016942, "grad_norm": 1.13493837929642, "learning_rate": 1.8713031593380116e-07, "loss": 0.405, "mean_token_accuracy": 0.8368137925863266, "step": 2230 }, { "epoch": 0.9466327827191868, "grad_norm": 1.1326422720007092, "learning_rate": 1.731587540747903e-07, "loss": 0.4164, "mean_token_accuracy": 0.839913833141327, "step": 2235 }, { "epoch": 0.9487505294366794, "grad_norm": 1.1288581153860058, "learning_rate": 1.597246479364345e-07, "loss": 0.4345, "mean_token_accuracy": 0.8263521671295166, "step": 2240 }, { "epoch": 0.9508682761541719, "grad_norm": 1.0618048867408285, "learning_rate": 1.4682873226523064e-07, "loss": 0.4116, "mean_token_accuracy": 0.8380947977304458, "step": 2245 }, { "epoch": 0.9529860228716646, "grad_norm": 1.0089748524009554, "learning_rate": 1.3447171237262912e-07, "loss": 0.4281, "mean_token_accuracy": 0.8311914891004563, "step": 2250 }, { "epoch": 0.9551037695891571, "grad_norm": 1.1718653607363838, "learning_rate": 1.2265426409645676e-07, "loss": 0.4205, "mean_token_accuracy": 0.8367854833602906, "step": 2255 }, { "epoch": 0.9572215163066498, "grad_norm": 1.009709808393184, "learning_rate": 1.1137703376395304e-07, "loss": 0.4307, "mean_token_accuracy": 0.8332184463739395, "step": 2260 }, { "epoch": 0.9593392630241423, "grad_norm": 1.0456672123180084, "learning_rate": 1.0064063815642178e-07, "loss": 0.4143, "mean_token_accuracy": 0.8407183200120926, "step": 2265 }, { "epoch": 0.961457009741635, "grad_norm": 1.4564232381895734, "learning_rate": 9.044566447549697e-08, "loss": 0.3935, "mean_token_accuracy": 0.843877837061882, "step": 2270 }, { "epoch": 0.9635747564591275, "grad_norm": 1.006395133879737, "learning_rate": 8.079267031102844e-08, "loss": 0.4379, "mean_token_accuracy": 0.8322035163640976, "step": 2275 }, { "epoch": 0.96569250317662, "grad_norm": 1.0451381392295622, "learning_rate": 7.16821836105841e-08, "loss": 0.3998, "mean_token_accuracy": 0.8473025262355804, "step": 2280 }, { "epoch": 0.9678102498941127, "grad_norm": 1.0472971428422386, "learning_rate": 6.311470265057518e-08, "loss": 0.423, "mean_token_accuracy": 0.8354467749595642, "step": 2285 }, { "epoch": 0.9699279966116052, "grad_norm": 1.1605199240395647, "learning_rate": 5.5090696009004744e-08, "loss": 0.4257, "mean_token_accuracy": 0.8360013753175736, "step": 2290 }, { "epoch": 0.9720457433290979, "grad_norm": 0.9898182837486158, "learning_rate": 4.761060253984151e-08, "loss": 0.4204, "mean_token_accuracy": 0.8367842882871628, "step": 2295 }, { "epoch": 0.9741634900465904, "grad_norm": 1.088987157568079, "learning_rate": 4.067483134901573e-08, "loss": 0.4134, "mean_token_accuracy": 0.83856400847435, "step": 2300 }, { "epoch": 0.976281236764083, "grad_norm": 1.0295706774122013, "learning_rate": 3.4283761772042623e-08, "loss": 0.4224, "mean_token_accuracy": 0.8354990780353546, "step": 2305 }, { "epoch": 0.9783989834815756, "grad_norm": 1.0694735478921555, "learning_rate": 2.84377433532812e-08, "loss": 0.4305, "mean_token_accuracy": 0.8316824287176132, "step": 2310 }, { "epoch": 0.9805167301990682, "grad_norm": 1.0827744380795652, "learning_rate": 2.3137095826809564e-08, "loss": 0.402, "mean_token_accuracy": 0.8404913783073426, "step": 2315 }, { "epoch": 0.9826344769165608, "grad_norm": 1.0960538876783272, "learning_rate": 1.8382109098944444e-08, "loss": 0.4352, "mean_token_accuracy": 0.8338410943746567, "step": 2320 }, { "epoch": 0.9847522236340533, "grad_norm": 1.1206569874331853, "learning_rate": 1.4173043232380557e-08, "loss": 0.4076, "mean_token_accuracy": 0.8435803085565567, "step": 2325 }, { "epoch": 0.986869970351546, "grad_norm": 1.0708342349134583, "learning_rate": 1.0510128431968635e-08, "loss": 0.4041, "mean_token_accuracy": 0.8435177773237228, "step": 2330 }, { "epoch": 0.9889877170690385, "grad_norm": 1.0195754299190762, "learning_rate": 7.3935650321255156e-09, "loss": 0.4017, "mean_token_accuracy": 0.8434190511703491, "step": 2335 }, { "epoch": 0.9911054637865311, "grad_norm": 1.043562362927536, "learning_rate": 4.823523485879556e-09, "loss": 0.4441, "mean_token_accuracy": 0.8331767469644547, "step": 2340 }, { "epoch": 0.9932232105040237, "grad_norm": 0.9692528305370003, "learning_rate": 2.800144355540324e-09, "loss": 0.4112, "mean_token_accuracy": 0.836205193400383, "step": 2345 }, { "epoch": 0.9953409572215163, "grad_norm": 0.9767634882260735, "learning_rate": 1.32353830502141e-09, "loss": 0.4233, "mean_token_accuracy": 0.8327444672584534, "step": 2350 }, { "epoch": 0.9974587039390089, "grad_norm": 1.1074445733229596, "learning_rate": 3.9378609377971335e-10, "loss": 0.3959, "mean_token_accuracy": 0.8446923106908798, "step": 2355 }, { "epoch": 0.9995764506565015, "grad_norm": 1.0245959330198788, "learning_rate": 1.0938572402308111e-11, "loss": 0.4106, "mean_token_accuracy": 0.8339618355035782, "step": 2360 }, { "epoch": 1.0, "mean_token_accuracy": 0.890313521027565, "step": 2361, "total_flos": 451385831948288.0, "train_loss": 0.48239256052181206, "train_runtime": 37146.7848, "train_samples_per_second": 1.017, "train_steps_per_second": 0.064 } ], "logging_steps": 5, "max_steps": 2361, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 451385831948288.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }