diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3819 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2361, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021177467174925877, + "grad_norm": 83.68173840087012, + "learning_rate": 4.219409282700422e-07, + "loss": 2.1661, + "mean_token_accuracy": 0.5976044356822967, + "step": 5 + }, + { + "epoch": 0.004235493434985175, + "grad_norm": 71.11062161153062, + "learning_rate": 8.438818565400844e-07, + "loss": 2.1622, + "mean_token_accuracy": 0.5995522707700729, + "step": 10 + }, + { + "epoch": 0.0063532401524777635, + "grad_norm": 34.572128830715705, + "learning_rate": 1.2658227848101267e-06, + "loss": 2.0632, + "mean_token_accuracy": 0.5944636166095734, + "step": 15 + }, + { + "epoch": 0.00847098686997035, + "grad_norm": 17.011229671312382, + "learning_rate": 1.6877637130801689e-06, + "loss": 1.7723, + "mean_token_accuracy": 0.6232941240072251, + "step": 20 + }, + { + "epoch": 0.010588733587462939, + "grad_norm": 9.74778063232665, + "learning_rate": 2.1097046413502114e-06, + "loss": 1.5731, + "mean_token_accuracy": 0.63005710542202, + "step": 25 + }, + { + "epoch": 0.012706480304955527, + "grad_norm": 7.914154804733739, + "learning_rate": 2.5316455696202535e-06, + "loss": 1.3488, + "mean_token_accuracy": 0.6629315137863159, + "step": 30 + }, + { + "epoch": 0.014824227022448115, + "grad_norm": 7.497303646153751, + "learning_rate": 2.9535864978902956e-06, + "loss": 1.1306, + "mean_token_accuracy": 0.6912301659584046, + "step": 35 + }, + { + "epoch": 0.0169419737399407, + "grad_norm": 3.0571735289067807, + "learning_rate": 3.3755274261603377e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7339568644762039, + "step": 40 + }, + { + "epoch": 0.01905972045743329, + "grad_norm": 3.7299236330227674, + "learning_rate": 3.7974683544303802e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7472610950469971, + "step": 45 + }, + { + "epoch": 0.021177467174925878, + "grad_norm": 1.94550213470537, + "learning_rate": 4.219409282700423e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7466759532690048, + "step": 50 + }, + { + "epoch": 0.023295213892418468, + "grad_norm": 1.9631003842444021, + "learning_rate": 4.641350210970465e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7591830432415009, + "step": 55 + }, + { + "epoch": 0.025412960609911054, + "grad_norm": 2.126411421817186, + "learning_rate": 5.063291139240507e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7653463244438171, + "step": 60 + }, + { + "epoch": 0.027530707327403644, + "grad_norm": 2.431633067673993, + "learning_rate": 5.485232067510548e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7705583959817887, + "step": 65 + }, + { + "epoch": 0.02964845404489623, + "grad_norm": 2.1470463103072706, + "learning_rate": 5.907172995780591e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7747641324996948, + "step": 70 + }, + { + "epoch": 0.03176620076238882, + "grad_norm": 1.8134772555606546, + "learning_rate": 6.329113924050634e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.7782594561576843, + "step": 75 + }, + { + "epoch": 0.0338839474798814, + "grad_norm": 1.828059395701887, + "learning_rate": 6.751054852320675e-06, + "loss": 0.654, + "mean_token_accuracy": 0.7730626821517944, + "step": 80 + }, + { + "epoch": 0.03600169419737399, + "grad_norm": 2.144653892020441, + "learning_rate": 7.172995780590718e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.7909977465867997, + "step": 85 + }, + { + "epoch": 0.03811944091486658, + "grad_norm": 1.8471427217270429, + "learning_rate": 7.5949367088607605e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.7756380766630173, + "step": 90 + }, + { + "epoch": 0.04023718763235917, + "grad_norm": 2.3290876362673916, + "learning_rate": 8.016877637130802e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.7818532437086105, + "step": 95 + }, + { + "epoch": 0.042354934349851756, + "grad_norm": 2.038115409634727, + "learning_rate": 8.438818565400846e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.7774519264698029, + "step": 100 + }, + { + "epoch": 0.044472681067344345, + "grad_norm": 1.8512017533544443, + "learning_rate": 8.860759493670886e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.787897452712059, + "step": 105 + }, + { + "epoch": 0.046590427784836935, + "grad_norm": 1.7617335354592987, + "learning_rate": 9.28270042194093e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.7916492879390716, + "step": 110 + }, + { + "epoch": 0.04870817450232952, + "grad_norm": 1.7594677346726018, + "learning_rate": 9.704641350210972e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.7923983573913574, + "step": 115 + }, + { + "epoch": 0.05082592121982211, + "grad_norm": 2.074930275455498, + "learning_rate": 1.0126582278481014e-05, + "loss": 0.5872, + "mean_token_accuracy": 0.7833609640598297, + "step": 120 + }, + { + "epoch": 0.0529436679373147, + "grad_norm": 1.9672826514728756, + "learning_rate": 1.0548523206751056e-05, + "loss": 0.5525, + "mean_token_accuracy": 0.7987953841686248, + "step": 125 + }, + { + "epoch": 0.05506141465480729, + "grad_norm": 1.8715882057544604, + "learning_rate": 1.0970464135021096e-05, + "loss": 0.5825, + "mean_token_accuracy": 0.788796243071556, + "step": 130 + }, + { + "epoch": 0.05717916137229987, + "grad_norm": 2.055120934997949, + "learning_rate": 1.139240506329114e-05, + "loss": 0.5668, + "mean_token_accuracy": 0.794330358505249, + "step": 135 + }, + { + "epoch": 0.05929690808979246, + "grad_norm": 2.1751457050987213, + "learning_rate": 1.1814345991561182e-05, + "loss": 0.5504, + "mean_token_accuracy": 0.7986414194107055, + "step": 140 + }, + { + "epoch": 0.06141465480728505, + "grad_norm": 2.6530737679742735, + "learning_rate": 1.2236286919831224e-05, + "loss": 0.5825, + "mean_token_accuracy": 0.787852120399475, + "step": 145 + }, + { + "epoch": 0.06353240152477764, + "grad_norm": 1.7772714746257652, + "learning_rate": 1.2658227848101268e-05, + "loss": 0.5321, + "mean_token_accuracy": 0.7960227221250534, + "step": 150 + }, + { + "epoch": 0.06565014824227022, + "grad_norm": 2.007446829510951, + "learning_rate": 1.3080168776371309e-05, + "loss": 0.5652, + "mean_token_accuracy": 0.7959463059902191, + "step": 155 + }, + { + "epoch": 0.0677678949597628, + "grad_norm": 1.5240825493431345, + "learning_rate": 1.350210970464135e-05, + "loss": 0.5471, + "mean_token_accuracy": 0.7939142495393753, + "step": 160 + }, + { + "epoch": 0.0698856416772554, + "grad_norm": 2.1050082197685573, + "learning_rate": 1.3924050632911395e-05, + "loss": 0.5283, + "mean_token_accuracy": 0.8012055605649948, + "step": 165 + }, + { + "epoch": 0.07200338839474799, + "grad_norm": 1.8465069166680506, + "learning_rate": 1.4345991561181437e-05, + "loss": 0.5525, + "mean_token_accuracy": 0.8044249504804611, + "step": 170 + }, + { + "epoch": 0.07412113511224058, + "grad_norm": 1.6175686507972196, + "learning_rate": 1.4767932489451477e-05, + "loss": 0.5561, + "mean_token_accuracy": 0.8003985285758972, + "step": 175 + }, + { + "epoch": 0.07623888182973317, + "grad_norm": 1.987593615824268, + "learning_rate": 1.5189873417721521e-05, + "loss": 0.5637, + "mean_token_accuracy": 0.7922172099351883, + "step": 180 + }, + { + "epoch": 0.07835662854722575, + "grad_norm": 1.6436094243541917, + "learning_rate": 1.5611814345991563e-05, + "loss": 0.5662, + "mean_token_accuracy": 0.7916066467761993, + "step": 185 + }, + { + "epoch": 0.08047437526471835, + "grad_norm": 1.6236359809975813, + "learning_rate": 1.6033755274261603e-05, + "loss": 0.5432, + "mean_token_accuracy": 0.8009026974439621, + "step": 190 + }, + { + "epoch": 0.08259212198221093, + "grad_norm": 1.5274031981451834, + "learning_rate": 1.6455696202531647e-05, + "loss": 0.5484, + "mean_token_accuracy": 0.7982567459344864, + "step": 195 + }, + { + "epoch": 0.08470986869970351, + "grad_norm": 2.03018219248001, + "learning_rate": 1.687763713080169e-05, + "loss": 0.5407, + "mean_token_accuracy": 0.8011936664581298, + "step": 200 + }, + { + "epoch": 0.08682761541719611, + "grad_norm": 1.8014465384883518, + "learning_rate": 1.729957805907173e-05, + "loss": 0.5247, + "mean_token_accuracy": 0.8084624141454697, + "step": 205 + }, + { + "epoch": 0.08894536213468869, + "grad_norm": 1.7271306151793162, + "learning_rate": 1.7721518987341772e-05, + "loss": 0.5708, + "mean_token_accuracy": 0.7947988003492356, + "step": 210 + }, + { + "epoch": 0.09106310885218127, + "grad_norm": 1.9191596287336676, + "learning_rate": 1.8143459915611816e-05, + "loss": 0.5363, + "mean_token_accuracy": 0.8047021597623825, + "step": 215 + }, + { + "epoch": 0.09318085556967387, + "grad_norm": 1.7320574921982606, + "learning_rate": 1.856540084388186e-05, + "loss": 0.5658, + "mean_token_accuracy": 0.7928981482982635, + "step": 220 + }, + { + "epoch": 0.09529860228716645, + "grad_norm": 1.8013279459762328, + "learning_rate": 1.89873417721519e-05, + "loss": 0.5283, + "mean_token_accuracy": 0.8064373075962067, + "step": 225 + }, + { + "epoch": 0.09741634900465904, + "grad_norm": 1.6963994963949376, + "learning_rate": 1.9409282700421944e-05, + "loss": 0.541, + "mean_token_accuracy": 0.798399469256401, + "step": 230 + }, + { + "epoch": 0.09953409572215163, + "grad_norm": 1.8211709634602606, + "learning_rate": 1.9831223628691984e-05, + "loss": 0.5529, + "mean_token_accuracy": 0.7973109990358352, + "step": 235 + }, + { + "epoch": 0.10165184243964422, + "grad_norm": 1.800221940229098, + "learning_rate": 1.9999901552991966e-05, + "loss": 0.5297, + "mean_token_accuracy": 0.8061769932508469, + "step": 240 + }, + { + "epoch": 0.10376958915713681, + "grad_norm": 3.118430462846644, + "learning_rate": 1.9999299939406875e-05, + "loss": 0.567, + "mean_token_accuracy": 0.7869770050048828, + "step": 245 + }, + { + "epoch": 0.1058873358746294, + "grad_norm": 6.373265842936888, + "learning_rate": 1.9998151437882874e-05, + "loss": 0.5194, + "mean_token_accuracy": 0.8079254150390625, + "step": 250 + }, + { + "epoch": 0.10800508259212198, + "grad_norm": 1.7643542086224073, + "learning_rate": 1.999645611123453e-05, + "loss": 0.5476, + "mean_token_accuracy": 0.8036489874124527, + "step": 255 + }, + { + "epoch": 0.11012282930961458, + "grad_norm": 1.7570808876197173, + "learning_rate": 1.999421405218369e-05, + "loss": 0.5183, + "mean_token_accuracy": 0.8039919883012772, + "step": 260 + }, + { + "epoch": 0.11224057602710716, + "grad_norm": 1.4650600928842654, + "learning_rate": 1.9991425383354462e-05, + "loss": 0.5575, + "mean_token_accuracy": 0.7989150047302246, + "step": 265 + }, + { + "epoch": 0.11435832274459974, + "grad_norm": 1.5715508311626518, + "learning_rate": 1.9988090257266442e-05, + "loss": 0.5276, + "mean_token_accuracy": 0.8024184852838516, + "step": 270 + }, + { + "epoch": 0.11647606946209234, + "grad_norm": 1.5012575844730074, + "learning_rate": 1.9984208856326433e-05, + "loss": 0.511, + "mean_token_accuracy": 0.810269170999527, + "step": 275 + }, + { + "epoch": 0.11859381617958492, + "grad_norm": 2.1674114266205553, + "learning_rate": 1.9979781392818424e-05, + "loss": 0.5069, + "mean_token_accuracy": 0.8049084335565567, + "step": 280 + }, + { + "epoch": 0.1207115628970775, + "grad_norm": 1.597566985653751, + "learning_rate": 1.9974808108892017e-05, + "loss": 0.5097, + "mean_token_accuracy": 0.810433080792427, + "step": 285 + }, + { + "epoch": 0.1228293096145701, + "grad_norm": 2.721798223377223, + "learning_rate": 1.9969289276549144e-05, + "loss": 0.526, + "mean_token_accuracy": 0.8058519691228867, + "step": 290 + }, + { + "epoch": 0.12494705633206268, + "grad_norm": 1.526771766492988, + "learning_rate": 1.9963225197629223e-05, + "loss": 0.5172, + "mean_token_accuracy": 0.8079220175743103, + "step": 295 + }, + { + "epoch": 0.12706480304955528, + "grad_norm": 1.3424112355487237, + "learning_rate": 1.9956616203792636e-05, + "loss": 0.5135, + "mean_token_accuracy": 0.806724363565445, + "step": 300 + }, + { + "epoch": 0.12918254976704785, + "grad_norm": 1.5824773036593809, + "learning_rate": 1.9949462656502588e-05, + "loss": 0.5383, + "mean_token_accuracy": 0.8001780599355698, + "step": 305 + }, + { + "epoch": 0.13130029648454045, + "grad_norm": 1.5157834737082827, + "learning_rate": 1.994176494700534e-05, + "loss": 0.5466, + "mean_token_accuracy": 0.7970251202583313, + "step": 310 + }, + { + "epoch": 0.13341804320203304, + "grad_norm": 1.8369627378901519, + "learning_rate": 1.993352349630882e-05, + "loss": 0.5218, + "mean_token_accuracy": 0.8072717070579529, + "step": 315 + }, + { + "epoch": 0.1355357899195256, + "grad_norm": 1.5676620169867563, + "learning_rate": 1.9924738755159573e-05, + "loss": 0.5116, + "mean_token_accuracy": 0.8025958120822907, + "step": 320 + }, + { + "epoch": 0.1376535366370182, + "grad_norm": 1.5442271717658778, + "learning_rate": 1.9915411204018137e-05, + "loss": 0.495, + "mean_token_accuracy": 0.8155842959880829, + "step": 325 + }, + { + "epoch": 0.1397712833545108, + "grad_norm": 1.9104862823035134, + "learning_rate": 1.9905541353032744e-05, + "loss": 0.4707, + "mean_token_accuracy": 0.8196403324604035, + "step": 330 + }, + { + "epoch": 0.14188903007200337, + "grad_norm": 1.8843041038781683, + "learning_rate": 1.9895129742011434e-05, + "loss": 0.5359, + "mean_token_accuracy": 0.8036209732294083, + "step": 335 + }, + { + "epoch": 0.14400677678949597, + "grad_norm": 1.2996290243783448, + "learning_rate": 1.9884176940392522e-05, + "loss": 0.5355, + "mean_token_accuracy": 0.7970023989677429, + "step": 340 + }, + { + "epoch": 0.14612452350698857, + "grad_norm": 1.7409691547169837, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.5222, + "mean_token_accuracy": 0.8015773713588714, + "step": 345 + }, + { + "epoch": 0.14824227022448117, + "grad_norm": 1.3236145792783143, + "learning_rate": 1.9860650191078033e-05, + "loss": 0.5165, + "mean_token_accuracy": 0.8045854181051254, + "step": 350 + }, + { + "epoch": 0.15036001694197373, + "grad_norm": 1.5674402609006048, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.5141, + "mean_token_accuracy": 0.8047444432973861, + "step": 355 + }, + { + "epoch": 0.15247776365946633, + "grad_norm": 1.4948547674340282, + "learning_rate": 1.98349662519774e-05, + "loss": 0.493, + "mean_token_accuracy": 0.8128765910863877, + "step": 360 + }, + { + "epoch": 0.15459551037695893, + "grad_norm": 1.57285942684427, + "learning_rate": 1.9821317073734173e-05, + "loss": 0.5114, + "mean_token_accuracy": 0.8024025142192841, + "step": 365 + }, + { + "epoch": 0.1567132570944515, + "grad_norm": 1.3725667498479879, + "learning_rate": 1.9807130741901756e-05, + "loss": 0.5552, + "mean_token_accuracy": 0.7975639194250107, + "step": 370 + }, + { + "epoch": 0.1588310038119441, + "grad_norm": 1.6323326415858614, + "learning_rate": 1.979240803236785e-05, + "loss": 0.5101, + "mean_token_accuracy": 0.8058428287506103, + "step": 375 + }, + { + "epoch": 0.1609487505294367, + "grad_norm": 1.293657741608038, + "learning_rate": 1.9777149750356044e-05, + "loss": 0.4931, + "mean_token_accuracy": 0.8156037211418152, + "step": 380 + }, + { + "epoch": 0.16306649724692926, + "grad_norm": 1.584456213127757, + "learning_rate": 1.9761356730381806e-05, + "loss": 0.5066, + "mean_token_accuracy": 0.8106210082769394, + "step": 385 + }, + { + "epoch": 0.16518424396442186, + "grad_norm": 1.3531024564685128, + "learning_rate": 1.9745029836206813e-05, + "loss": 0.4862, + "mean_token_accuracy": 0.8180296182632446, + "step": 390 + }, + { + "epoch": 0.16730199068191445, + "grad_norm": 1.5992771952291873, + "learning_rate": 1.9728169960791736e-05, + "loss": 0.5158, + "mean_token_accuracy": 0.8020082831382751, + "step": 395 + }, + { + "epoch": 0.16941973739940702, + "grad_norm": 1.3875752393035827, + "learning_rate": 1.9710778026247367e-05, + "loss": 0.5268, + "mean_token_accuracy": 0.8021057844161987, + "step": 400 + }, + { + "epoch": 0.17153748411689962, + "grad_norm": 1.4892475787998831, + "learning_rate": 1.9692854983784235e-05, + "loss": 0.5031, + "mean_token_accuracy": 0.8153967589139939, + "step": 405 + }, + { + "epoch": 0.17365523083439222, + "grad_norm": 1.3435721015179996, + "learning_rate": 1.9674401813660532e-05, + "loss": 0.5151, + "mean_token_accuracy": 0.8066144526004791, + "step": 410 + }, + { + "epoch": 0.17577297755188478, + "grad_norm": 1.4757784795296558, + "learning_rate": 1.9655419525128528e-05, + "loss": 0.5197, + "mean_token_accuracy": 0.8056324630975723, + "step": 415 + }, + { + "epoch": 0.17789072426937738, + "grad_norm": 1.8586890907074842, + "learning_rate": 1.9635909156379373e-05, + "loss": 0.4817, + "mean_token_accuracy": 0.8227346748113632, + "step": 420 + }, + { + "epoch": 0.18000847098686998, + "grad_norm": 1.3338010634125226, + "learning_rate": 1.9615871774486293e-05, + "loss": 0.476, + "mean_token_accuracy": 0.8171389639377594, + "step": 425 + }, + { + "epoch": 0.18212621770436255, + "grad_norm": 1.467996639944381, + "learning_rate": 1.959530847534627e-05, + "loss": 0.4857, + "mean_token_accuracy": 0.8151497721672059, + "step": 430 + }, + { + "epoch": 0.18424396442185514, + "grad_norm": 1.482953746737999, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.4922, + "mean_token_accuracy": 0.8100210309028626, + "step": 435 + }, + { + "epoch": 0.18636171113934774, + "grad_norm": 5.208401516653082, + "learning_rate": 1.95526086526707e-05, + "loss": 0.5263, + "mean_token_accuracy": 0.8080328673124313, + "step": 440 + }, + { + "epoch": 0.1884794578568403, + "grad_norm": 1.5834873689672437, + "learning_rate": 1.9530474464500445e-05, + "loss": 0.514, + "mean_token_accuracy": 0.8094299465417862, + "step": 445 + }, + { + "epoch": 0.1905972045743329, + "grad_norm": 1.3405671636751928, + "learning_rate": 1.9507819029686094e-05, + "loss": 0.5119, + "mean_token_accuracy": 0.8087350040674209, + "step": 450 + }, + { + "epoch": 0.1927149512918255, + "grad_norm": 1.3993020572279387, + "learning_rate": 1.94846435873128e-05, + "loss": 0.5153, + "mean_token_accuracy": 0.8082747459411621, + "step": 455 + }, + { + "epoch": 0.19483269800931807, + "grad_norm": 1.3011551512989479, + "learning_rate": 1.9460949404906285e-05, + "loss": 0.5028, + "mean_token_accuracy": 0.8120961904525756, + "step": 460 + }, + { + "epoch": 0.19695044472681067, + "grad_norm": 1.6479875272294309, + "learning_rate": 1.9436737778363526e-05, + "loss": 0.4787, + "mean_token_accuracy": 0.8184203952550888, + "step": 465 + }, + { + "epoch": 0.19906819144430327, + "grad_norm": 1.2952323822215526, + "learning_rate": 1.9412010031881884e-05, + "loss": 0.4811, + "mean_token_accuracy": 0.8196297824382782, + "step": 470 + }, + { + "epoch": 0.20118593816179586, + "grad_norm": 1.2434980503550659, + "learning_rate": 1.9386767517886666e-05, + "loss": 0.4992, + "mean_token_accuracy": 0.8126247316598892, + "step": 475 + }, + { + "epoch": 0.20330368487928843, + "grad_norm": 1.2749730489780189, + "learning_rate": 1.9361011616957165e-05, + "loss": 0.5013, + "mean_token_accuracy": 0.8094296991825104, + "step": 480 + }, + { + "epoch": 0.20542143159678103, + "grad_norm": 1.2801081991950354, + "learning_rate": 1.933474373775115e-05, + "loss": 0.4914, + "mean_token_accuracy": 0.8103417336940766, + "step": 485 + }, + { + "epoch": 0.20753917831427363, + "grad_norm": 1.3841139586738282, + "learning_rate": 1.930796531692783e-05, + "loss": 0.503, + "mean_token_accuracy": 0.8150111019611359, + "step": 490 + }, + { + "epoch": 0.2096569250317662, + "grad_norm": 1.2895819374549709, + "learning_rate": 1.9280677819069273e-05, + "loss": 0.4938, + "mean_token_accuracy": 0.8058139503002166, + "step": 495 + }, + { + "epoch": 0.2117746717492588, + "grad_norm": 1.2705506609214867, + "learning_rate": 1.9252882736600302e-05, + "loss": 0.5041, + "mean_token_accuracy": 0.8078715801239014, + "step": 500 + }, + { + "epoch": 0.2138924184667514, + "grad_norm": 1.3700128773821674, + "learning_rate": 1.922458158970688e-05, + "loss": 0.5122, + "mean_token_accuracy": 0.805089196562767, + "step": 505 + }, + { + "epoch": 0.21601016518424396, + "grad_norm": 1.4292612681859336, + "learning_rate": 1.9195775926252952e-05, + "loss": 0.4799, + "mean_token_accuracy": 0.8134547978639602, + "step": 510 + }, + { + "epoch": 0.21812791190173655, + "grad_norm": 2.589810653355124, + "learning_rate": 1.91664673216958e-05, + "loss": 0.4686, + "mean_token_accuracy": 0.8232874065637589, + "step": 515 + }, + { + "epoch": 0.22024565861922915, + "grad_norm": 1.4425686621750156, + "learning_rate": 1.913665737899988e-05, + "loss": 0.4885, + "mean_token_accuracy": 0.815599313378334, + "step": 520 + }, + { + "epoch": 0.22236340533672172, + "grad_norm": 1.4823410740282665, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.4832, + "mean_token_accuracy": 0.8109551817178726, + "step": 525 + }, + { + "epoch": 0.22448115205421432, + "grad_norm": 1.1459009249468546, + "learning_rate": 1.9075540028057844e-05, + "loss": 0.5156, + "mean_token_accuracy": 0.8015700995922088, + "step": 530 + }, + { + "epoch": 0.2265988987717069, + "grad_norm": 1.273350806844229, + "learning_rate": 1.9044235962479945e-05, + "loss": 0.4901, + "mean_token_accuracy": 0.8163118690252305, + "step": 535 + }, + { + "epoch": 0.22871664548919948, + "grad_norm": 1.2736969034780394, + "learning_rate": 1.9012437243916895e-05, + "loss": 0.475, + "mean_token_accuracy": 0.8155727684497833, + "step": 540 + }, + { + "epoch": 0.23083439220669208, + "grad_norm": 1.1644155017049156, + "learning_rate": 1.8980145611523996e-05, + "loss": 0.5041, + "mean_token_accuracy": 0.8130400031805038, + "step": 545 + }, + { + "epoch": 0.23295213892418468, + "grad_norm": 1.3543018612133357, + "learning_rate": 1.8947362831415327e-05, + "loss": 0.4668, + "mean_token_accuracy": 0.8260669410228729, + "step": 550 + }, + { + "epoch": 0.23506988564167725, + "grad_norm": 1.2391111005758269, + "learning_rate": 1.8914090696567104e-05, + "loss": 0.4809, + "mean_token_accuracy": 0.8127309769392014, + "step": 555 + }, + { + "epoch": 0.23718763235916984, + "grad_norm": 2.2015980143710583, + "learning_rate": 1.888033102671965e-05, + "loss": 0.4922, + "mean_token_accuracy": 0.8155588954687119, + "step": 560 + }, + { + "epoch": 0.23930537907666244, + "grad_norm": 1.2198454979455773, + "learning_rate": 1.884608566827785e-05, + "loss": 0.5168, + "mean_token_accuracy": 0.8062847316265106, + "step": 565 + }, + { + "epoch": 0.241423125794155, + "grad_norm": 1.184969374617232, + "learning_rate": 1.8811356494210166e-05, + "loss": 0.4805, + "mean_token_accuracy": 0.8132707148790359, + "step": 570 + }, + { + "epoch": 0.2435408725116476, + "grad_norm": 1.187126766493632, + "learning_rate": 1.8776145403946226e-05, + "loss": 0.4955, + "mean_token_accuracy": 0.8102918237447738, + "step": 575 + }, + { + "epoch": 0.2456586192291402, + "grad_norm": 1.3821096957818944, + "learning_rate": 1.874045432327289e-05, + "loss": 0.4985, + "mean_token_accuracy": 0.8098550081253052, + "step": 580 + }, + { + "epoch": 0.24777636594663277, + "grad_norm": 1.214604218577671, + "learning_rate": 1.8704285204228973e-05, + "loss": 0.4627, + "mean_token_accuracy": 0.8165160745382309, + "step": 585 + }, + { + "epoch": 0.24989411266412537, + "grad_norm": 1.4526314855211653, + "learning_rate": 1.866764002499846e-05, + "loss": 0.4909, + "mean_token_accuracy": 0.8122711658477784, + "step": 590 + }, + { + "epoch": 0.25201185938161796, + "grad_norm": 1.1543877428891598, + "learning_rate": 1.8630520789802308e-05, + "loss": 0.4782, + "mean_token_accuracy": 0.8182896554470063, + "step": 595 + }, + { + "epoch": 0.25412960609911056, + "grad_norm": 1.3086338857944744, + "learning_rate": 1.8592929528788844e-05, + "loss": 0.4753, + "mean_token_accuracy": 0.8180733859539032, + "step": 600 + }, + { + "epoch": 0.25624735281660316, + "grad_norm": 1.3557276365311686, + "learning_rate": 1.8554868297922728e-05, + "loss": 0.4708, + "mean_token_accuracy": 0.8193376958370209, + "step": 605 + }, + { + "epoch": 0.2583650995340957, + "grad_norm": 1.2996719117152657, + "learning_rate": 1.8516339178872492e-05, + "loss": 0.4518, + "mean_token_accuracy": 0.8204487860202789, + "step": 610 + }, + { + "epoch": 0.2604828462515883, + "grad_norm": 1.3696724777806233, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.5072, + "mean_token_accuracy": 0.8076569020748139, + "step": 615 + }, + { + "epoch": 0.2626005929690809, + "grad_norm": 1.2308629288247015, + "learning_rate": 1.8437885730728738e-05, + "loss": 0.5113, + "mean_token_accuracy": 0.8088377475738525, + "step": 620 + }, + { + "epoch": 0.2647183396865735, + "grad_norm": 1.2397238918015017, + "learning_rate": 1.839796569246006e-05, + "loss": 0.494, + "mean_token_accuracy": 0.8118572622537613, + "step": 625 + }, + { + "epoch": 0.2668360864040661, + "grad_norm": 1.3479748389387212, + "learning_rate": 1.8357586347422266e-05, + "loss": 0.5081, + "mean_token_accuracy": 0.8135558038949966, + "step": 630 + }, + { + "epoch": 0.2689538331215587, + "grad_norm": 1.1063564395200467, + "learning_rate": 1.8316749904067637e-05, + "loss": 0.4653, + "mean_token_accuracy": 0.8218313783407212, + "step": 635 + }, + { + "epoch": 0.2710715798390512, + "grad_norm": 1.1492824512346658, + "learning_rate": 1.8275458595848376e-05, + "loss": 0.4817, + "mean_token_accuracy": 0.8135390222072602, + "step": 640 + }, + { + "epoch": 0.2731893265565438, + "grad_norm": 1.4159749106872088, + "learning_rate": 1.8233714681094405e-05, + "loss": 0.4616, + "mean_token_accuracy": 0.8250806093215942, + "step": 645 + }, + { + "epoch": 0.2753070732740364, + "grad_norm": 1.1611107224498594, + "learning_rate": 1.819152044288992e-05, + "loss": 0.488, + "mean_token_accuracy": 0.8166846603155136, + "step": 650 + }, + { + "epoch": 0.277424819991529, + "grad_norm": 1.3205339840836507, + "learning_rate": 1.814887818894846e-05, + "loss": 0.5036, + "mean_token_accuracy": 0.810426139831543, + "step": 655 + }, + { + "epoch": 0.2795425667090216, + "grad_norm": 1.2642547117014469, + "learning_rate": 1.810579025148674e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.8112012058496475, + "step": 660 + }, + { + "epoch": 0.2816603134265142, + "grad_norm": 5.33401159048522, + "learning_rate": 1.8062258987097062e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8289118260145187, + "step": 665 + }, + { + "epoch": 0.28377806014400675, + "grad_norm": 1.3752087188227111, + "learning_rate": 1.8018286776618446e-05, + "loss": 0.4963, + "mean_token_accuracy": 0.8137694984674454, + "step": 670 + }, + { + "epoch": 0.28589580686149935, + "grad_norm": 1.176266427707403, + "learning_rate": 1.7973876025006407e-05, + "loss": 0.4976, + "mean_token_accuracy": 0.8188654541969299, + "step": 675 + }, + { + "epoch": 0.28801355357899194, + "grad_norm": 1.331341038204072, + "learning_rate": 1.792902916120143e-05, + "loss": 0.4939, + "mean_token_accuracy": 0.8163222283124923, + "step": 680 + }, + { + "epoch": 0.29013130029648454, + "grad_norm": 1.1914829607255677, + "learning_rate": 1.7883748637996113e-05, + "loss": 0.4881, + "mean_token_accuracy": 0.8130565702915191, + "step": 685 + }, + { + "epoch": 0.29224904701397714, + "grad_norm": 1.2277506964948814, + "learning_rate": 1.7838036931901033e-05, + "loss": 0.4559, + "mean_token_accuracy": 0.824514701962471, + "step": 690 + }, + { + "epoch": 0.29436679373146973, + "grad_norm": 1.0800320597549389, + "learning_rate": 1.7791896543009282e-05, + "loss": 0.4891, + "mean_token_accuracy": 0.8174144089221954, + "step": 695 + }, + { + "epoch": 0.29648454044896233, + "grad_norm": 1.5694294317697621, + "learning_rate": 1.7745329994859746e-05, + "loss": 0.4914, + "mean_token_accuracy": 0.8185641199350357, + "step": 700 + }, + { + "epoch": 0.29860228716645487, + "grad_norm": 1.1923041867729132, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.5008, + "mean_token_accuracy": 0.8142161637544632, + "step": 705 + }, + { + "epoch": 0.30072003388394747, + "grad_norm": 1.3729946102267174, + "learning_rate": 1.7650928631342364e-05, + "loss": 0.4845, + "mean_token_accuracy": 0.8133604645729064, + "step": 710 + }, + { + "epoch": 0.30283778060144007, + "grad_norm": 1.174456646604131, + "learning_rate": 1.7603098979032683e-05, + "loss": 0.4777, + "mean_token_accuracy": 0.813685166835785, + "step": 715 + }, + { + "epoch": 0.30495552731893266, + "grad_norm": 1.158532302748484, + "learning_rate": 1.7554853493299142e-05, + "loss": 0.504, + "mean_token_accuracy": 0.8088937163352966, + "step": 720 + }, + { + "epoch": 0.30707327403642526, + "grad_norm": 1.2620596837516858, + "learning_rate": 1.7506194812813896e-05, + "loss": 0.4817, + "mean_token_accuracy": 0.8206409096717835, + "step": 725 + }, + { + "epoch": 0.30919102075391786, + "grad_norm": 1.148012521360775, + "learning_rate": 1.74571255988478e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.812398812174797, + "step": 730 + }, + { + "epoch": 0.3113087674714104, + "grad_norm": 1.2373133691587057, + "learning_rate": 1.740764853512485e-05, + "loss": 0.49, + "mean_token_accuracy": 0.8143349289894104, + "step": 735 + }, + { + "epoch": 0.313426514188903, + "grad_norm": 2.100740115519466, + "learning_rate": 1.7357766327675433e-05, + "loss": 0.4651, + "mean_token_accuracy": 0.8216336488723754, + "step": 740 + }, + { + "epoch": 0.3155442609063956, + "grad_norm": 1.4189894877798284, + "learning_rate": 1.73074817046883e-05, + "loss": 0.4801, + "mean_token_accuracy": 0.8188165038824081, + "step": 745 + }, + { + "epoch": 0.3176620076238882, + "grad_norm": 1.2994480429040771, + "learning_rate": 1.725679741636136e-05, + "loss": 0.4614, + "mean_token_accuracy": 0.8237657248973846, + "step": 750 + }, + { + "epoch": 0.3197797543413808, + "grad_norm": 1.2308603791930401, + "learning_rate": 1.720571623475128e-05, + "loss": 0.492, + "mean_token_accuracy": 0.8165101200342179, + "step": 755 + }, + { + "epoch": 0.3218975010588734, + "grad_norm": 1.3843077010151197, + "learning_rate": 1.7154240953621844e-05, + "loss": 0.4564, + "mean_token_accuracy": 0.825025874376297, + "step": 760 + }, + { + "epoch": 0.3240152477763659, + "grad_norm": 1.1848129565884666, + "learning_rate": 1.7102374388291182e-05, + "loss": 0.4575, + "mean_token_accuracy": 0.8252220988273621, + "step": 765 + }, + { + "epoch": 0.3261329944938585, + "grad_norm": 1.3217187216198285, + "learning_rate": 1.705011937547779e-05, + "loss": 0.4629, + "mean_token_accuracy": 0.8198304086923599, + "step": 770 + }, + { + "epoch": 0.3282507412113511, + "grad_norm": 1.3851637896221318, + "learning_rate": 1.6997478773145363e-05, + "loss": 0.4337, + "mean_token_accuracy": 0.8338131695985794, + "step": 775 + }, + { + "epoch": 0.3303684879288437, + "grad_norm": 1.423775789920787, + "learning_rate": 1.6944455460346503e-05, + "loss": 0.4807, + "mean_token_accuracy": 0.8188902169466019, + "step": 780 + }, + { + "epoch": 0.3324862346463363, + "grad_norm": 1.3680154210297841, + "learning_rate": 1.6891052337065256e-05, + "loss": 0.4841, + "mean_token_accuracy": 0.8188378721475601, + "step": 785 + }, + { + "epoch": 0.3346039813638289, + "grad_norm": 1.1670007538420892, + "learning_rate": 1.6837272324058487e-05, + "loss": 0.4209, + "mean_token_accuracy": 0.8359328061342239, + "step": 790 + }, + { + "epoch": 0.33672172808132145, + "grad_norm": 1.2238185684348435, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.4687, + "mean_token_accuracy": 0.8194981902837754, + "step": 795 + }, + { + "epoch": 0.33883947479881404, + "grad_norm": 1.3104844364549155, + "learning_rate": 1.672859341480046e-05, + "loss": 0.4605, + "mean_token_accuracy": 0.8169092148542404, + "step": 800 + }, + { + "epoch": 0.34095722151630664, + "grad_norm": 1.1074420443801423, + "learning_rate": 1.6673700462483776e-05, + "loss": 0.4424, + "mean_token_accuracy": 0.8315922617912292, + "step": 805 + }, + { + "epoch": 0.34307496823379924, + "grad_norm": 1.2002465546594834, + "learning_rate": 1.661844250798565e-05, + "loss": 0.4773, + "mean_token_accuracy": 0.8234172344207764, + "step": 810 + }, + { + "epoch": 0.34519271495129183, + "grad_norm": 1.3643314568341807, + "learning_rate": 1.6562822573508533e-05, + "loss": 0.4803, + "mean_token_accuracy": 0.8155502796173095, + "step": 815 + }, + { + "epoch": 0.34731046166878443, + "grad_norm": 1.1653511889703811, + "learning_rate": 1.650684370105252e-05, + "loss": 0.4907, + "mean_token_accuracy": 0.8095988690853119, + "step": 820 + }, + { + "epoch": 0.34942820838627703, + "grad_norm": 1.2052540958169133, + "learning_rate": 1.6450508952248957e-05, + "loss": 0.4664, + "mean_token_accuracy": 0.8265933513641357, + "step": 825 + }, + { + "epoch": 0.35154595510376957, + "grad_norm": 1.5477552328113091, + "learning_rate": 1.6393821408193007e-05, + "loss": 0.4783, + "mean_token_accuracy": 0.8169477820396424, + "step": 830 + }, + { + "epoch": 0.35366370182126217, + "grad_norm": 1.8070494772139423, + "learning_rate": 1.6336784169275132e-05, + "loss": 0.454, + "mean_token_accuracy": 0.8248355984687805, + "step": 835 + }, + { + "epoch": 0.35578144853875476, + "grad_norm": 1.2257376390653825, + "learning_rate": 1.627940035501152e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8257219165563583, + "step": 840 + }, + { + "epoch": 0.35789919525624736, + "grad_norm": 1.3198794046839721, + "learning_rate": 1.6221673103873474e-05, + "loss": 0.4427, + "mean_token_accuracy": 0.8296634495258332, + "step": 845 + }, + { + "epoch": 0.36001694197373996, + "grad_norm": 2.109231295857473, + "learning_rate": 1.616360557311575e-05, + "loss": 0.489, + "mean_token_accuracy": 0.8102859228849411, + "step": 850 + }, + { + "epoch": 0.36213468869123255, + "grad_norm": 1.1872292152679083, + "learning_rate": 1.6105200938603917e-05, + "loss": 0.4681, + "mean_token_accuracy": 0.8261395335197449, + "step": 855 + }, + { + "epoch": 0.3642524354087251, + "grad_norm": 1.214005452933459, + "learning_rate": 1.60464623946406e-05, + "loss": 0.4852, + "mean_token_accuracy": 0.8179385870695114, + "step": 860 + }, + { + "epoch": 0.3663701821262177, + "grad_norm": 1.0907256335398452, + "learning_rate": 1.5987393153790832e-05, + "loss": 0.4623, + "mean_token_accuracy": 0.8248693764209747, + "step": 865 + }, + { + "epoch": 0.3684879288437103, + "grad_norm": 1.061691508146564, + "learning_rate": 1.5927996446706308e-05, + "loss": 0.4803, + "mean_token_accuracy": 0.8169174045324326, + "step": 870 + }, + { + "epoch": 0.3706056755612029, + "grad_norm": 1.1759352091149649, + "learning_rate": 1.5868275521948726e-05, + "loss": 0.4563, + "mean_token_accuracy": 0.8279780805110931, + "step": 875 + }, + { + "epoch": 0.3727234222786955, + "grad_norm": 1.2135030886876705, + "learning_rate": 1.5808233645812087e-05, + "loss": 0.4418, + "mean_token_accuracy": 0.8301020473241806, + "step": 880 + }, + { + "epoch": 0.3748411689961881, + "grad_norm": 1.1266881444254488, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.4626, + "mean_token_accuracy": 0.8214969336986542, + "step": 885 + }, + { + "epoch": 0.3769589157136806, + "grad_norm": 1.0911244736489776, + "learning_rate": 1.5687200192166424e-05, + "loss": 0.4635, + "mean_token_accuracy": 0.8221491903066636, + "step": 890 + }, + { + "epoch": 0.3790766624311732, + "grad_norm": 1.0852849507203284, + "learning_rate": 1.5626215234294416e-05, + "loss": 0.451, + "mean_token_accuracy": 0.8251518607139587, + "step": 895 + }, + { + "epoch": 0.3811944091486658, + "grad_norm": 1.1215853338868707, + "learning_rate": 1.5564922563955337e-05, + "loss": 0.4608, + "mean_token_accuracy": 0.8237892210483551, + "step": 900 + }, + { + "epoch": 0.3833121558661584, + "grad_norm": 0.9235255903522734, + "learning_rate": 1.5503325533406076e-05, + "loss": 0.4676, + "mean_token_accuracy": 0.8222286731004715, + "step": 905 + }, + { + "epoch": 0.385429902583651, + "grad_norm": 1.0494173037764836, + "learning_rate": 1.5441427511549795e-05, + "loss": 0.4652, + "mean_token_accuracy": 0.8235789179801941, + "step": 910 + }, + { + "epoch": 0.3875476493011436, + "grad_norm": 1.2934333868332708, + "learning_rate": 1.537923188375164e-05, + "loss": 0.459, + "mean_token_accuracy": 0.8253506690263748, + "step": 915 + }, + { + "epoch": 0.38966539601863615, + "grad_norm": 1.045643086378396, + "learning_rate": 1.5316742051653624e-05, + "loss": 0.4487, + "mean_token_accuracy": 0.8300421804189682, + "step": 920 + }, + { + "epoch": 0.39178314273612874, + "grad_norm": 1.0549731687620314, + "learning_rate": 1.5253961432988548e-05, + "loss": 0.4756, + "mean_token_accuracy": 0.8141780078411103, + "step": 925 + }, + { + "epoch": 0.39390088945362134, + "grad_norm": 1.1263426428393677, + "learning_rate": 1.5190893461393108e-05, + "loss": 0.4698, + "mean_token_accuracy": 0.8173887878656387, + "step": 930 + }, + { + "epoch": 0.39601863617111394, + "grad_norm": 1.1982411204873675, + "learning_rate": 1.5127541586220077e-05, + "loss": 0.4595, + "mean_token_accuracy": 0.8246693462133408, + "step": 935 + }, + { + "epoch": 0.39813638288860653, + "grad_norm": 1.331125977750805, + "learning_rate": 1.5063909272349664e-05, + "loss": 0.466, + "mean_token_accuracy": 0.8266402333974838, + "step": 940 + }, + { + "epoch": 0.40025412960609913, + "grad_norm": 1.165754254305497, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.435, + "mean_token_accuracy": 0.8271202713251113, + "step": 945 + }, + { + "epoch": 0.4023718763235917, + "grad_norm": 1.1585938088360928, + "learning_rate": 1.4935817264536809e-05, + "loss": 0.4386, + "mean_token_accuracy": 0.8255492657423019, + "step": 950 + }, + { + "epoch": 0.40448962304108427, + "grad_norm": 1.1542702135186313, + "learning_rate": 1.4871364576282223e-05, + "loss": 0.4769, + "mean_token_accuracy": 0.8163278847932816, + "step": 955 + }, + { + "epoch": 0.40660736975857686, + "grad_norm": 1.1855267108232739, + "learning_rate": 1.4806645460322804e-05, + "loss": 0.4938, + "mean_token_accuracy": 0.8140994518995285, + "step": 960 + }, + { + "epoch": 0.40872511647606946, + "grad_norm": 1.0583179034757253, + "learning_rate": 1.4741663456316742e-05, + "loss": 0.4694, + "mean_token_accuracy": 0.8194496780633926, + "step": 965 + }, + { + "epoch": 0.41084286319356206, + "grad_norm": 1.2166297794886325, + "learning_rate": 1.4676422118300266e-05, + "loss": 0.4583, + "mean_token_accuracy": 0.8240072697401046, + "step": 970 + }, + { + "epoch": 0.41296060991105465, + "grad_norm": 1.2077033819076497, + "learning_rate": 1.461092501449326e-05, + "loss": 0.4683, + "mean_token_accuracy": 0.8127462983131408, + "step": 975 + }, + { + "epoch": 0.41507835662854725, + "grad_norm": 1.2024839451726628, + "learning_rate": 1.4545175727104113e-05, + "loss": 0.4746, + "mean_token_accuracy": 0.817327806353569, + "step": 980 + }, + { + "epoch": 0.4171961033460398, + "grad_norm": 43.895890122529586, + "learning_rate": 1.4479177852133787e-05, + "loss": 0.4339, + "mean_token_accuracy": 0.83043053150177, + "step": 985 + }, + { + "epoch": 0.4193138500635324, + "grad_norm": 1.3761452239892333, + "learning_rate": 1.4412934999179169e-05, + "loss": 0.4682, + "mean_token_accuracy": 0.82216075360775, + "step": 990 + }, + { + "epoch": 0.421431596781025, + "grad_norm": 9.553572882081992, + "learning_rate": 1.4346450791235611e-05, + "loss": 0.425, + "mean_token_accuracy": 0.8346862554550171, + "step": 995 + }, + { + "epoch": 0.4235493434985176, + "grad_norm": 1.19535922636142, + "learning_rate": 1.427972886449882e-05, + "loss": 0.4916, + "mean_token_accuracy": 0.8201052099466324, + "step": 1000 + }, + { + "epoch": 0.4256670902160102, + "grad_norm": 1.487407000401354, + "learning_rate": 1.4212772868165957e-05, + "loss": 0.4759, + "mean_token_accuracy": 0.8201690822839737, + "step": 1005 + }, + { + "epoch": 0.4277848369335028, + "grad_norm": 1.2209557581398112, + "learning_rate": 1.4145586464236074e-05, + "loss": 0.4776, + "mean_token_accuracy": 0.8144995361566544, + "step": 1010 + }, + { + "epoch": 0.4299025836509953, + "grad_norm": 1.4175123984588354, + "learning_rate": 1.4078173327309807e-05, + "loss": 0.4697, + "mean_token_accuracy": 0.820775744318962, + "step": 1015 + }, + { + "epoch": 0.4320203303684879, + "grad_norm": 1.2129818965934513, + "learning_rate": 1.4010537144388416e-05, + "loss": 0.463, + "mean_token_accuracy": 0.8259893089532853, + "step": 1020 + }, + { + "epoch": 0.4341380770859805, + "grad_norm": 1.12010970838833, + "learning_rate": 1.3942681614672144e-05, + "loss": 0.4629, + "mean_token_accuracy": 0.8218669801950454, + "step": 1025 + }, + { + "epoch": 0.4362558238034731, + "grad_norm": 1.1464961804103622, + "learning_rate": 1.3874610449357873e-05, + "loss": 0.4238, + "mean_token_accuracy": 0.8335713416337966, + "step": 1030 + }, + { + "epoch": 0.4383735705209657, + "grad_norm": 1.1351310993680606, + "learning_rate": 1.3806327371436159e-05, + "loss": 0.4394, + "mean_token_accuracy": 0.8307629436254501, + "step": 1035 + }, + { + "epoch": 0.4404913172384583, + "grad_norm": 1.1188266853744508, + "learning_rate": 1.3737836115487624e-05, + "loss": 0.4663, + "mean_token_accuracy": 0.8193978488445282, + "step": 1040 + }, + { + "epoch": 0.44260906395595084, + "grad_norm": 1.1620199858915772, + "learning_rate": 1.3669140427478693e-05, + "loss": 0.4705, + "mean_token_accuracy": 0.8229668527841568, + "step": 1045 + }, + { + "epoch": 0.44472681067344344, + "grad_norm": 1.110101616240863, + "learning_rate": 1.3600244064556702e-05, + "loss": 0.4747, + "mean_token_accuracy": 0.8179006308317185, + "step": 1050 + }, + { + "epoch": 0.44684455739093604, + "grad_norm": 1.2783615446297392, + "learning_rate": 1.353115079484444e-05, + "loss": 0.4458, + "mean_token_accuracy": 0.8308207571506501, + "step": 1055 + }, + { + "epoch": 0.44896230410842863, + "grad_norm": 1.1007302332610067, + "learning_rate": 1.3461864397234041e-05, + "loss": 0.4598, + "mean_token_accuracy": 0.8242943733930588, + "step": 1060 + }, + { + "epoch": 0.45108005082592123, + "grad_norm": 1.2199483732995027, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.445, + "mean_token_accuracy": 0.824502220749855, + "step": 1065 + }, + { + "epoch": 0.4531977975434138, + "grad_norm": 1.0010509955815885, + "learning_rate": 1.332272738649345e-05, + "loss": 0.4583, + "mean_token_accuracy": 0.8303744524717331, + "step": 1070 + }, + { + "epoch": 0.45531554426090637, + "grad_norm": 1.918284418636839, + "learning_rate": 1.325288438313129e-05, + "loss": 0.4269, + "mean_token_accuracy": 0.8296439230442048, + "step": 1075 + }, + { + "epoch": 0.45743329097839897, + "grad_norm": 1.1887902164021535, + "learning_rate": 1.318286347099086e-05, + "loss": 0.4625, + "mean_token_accuracy": 0.8217881232500076, + "step": 1080 + }, + { + "epoch": 0.45955103769589156, + "grad_norm": 1.1360766453253965, + "learning_rate": 1.3112668479699486e-05, + "loss": 0.4589, + "mean_token_accuracy": 0.8269425123929978, + "step": 1085 + }, + { + "epoch": 0.46166878441338416, + "grad_norm": 1.2399254503178083, + "learning_rate": 1.3042303248405346e-05, + "loss": 0.4555, + "mean_token_accuracy": 0.8309968203306198, + "step": 1090 + }, + { + "epoch": 0.46378653113087676, + "grad_norm": 1.0508779611719044, + "learning_rate": 1.297177162556748e-05, + "loss": 0.4545, + "mean_token_accuracy": 0.824161484837532, + "step": 1095 + }, + { + "epoch": 0.46590427784836935, + "grad_norm": 1.0822262810815348, + "learning_rate": 1.2901077468745329e-05, + "loss": 0.4571, + "mean_token_accuracy": 0.8281063556671142, + "step": 1100 + }, + { + "epoch": 0.46802202456586195, + "grad_norm": 1.0744745429140576, + "learning_rate": 1.2830224644387742e-05, + "loss": 0.471, + "mean_token_accuracy": 0.8183866649866104, + "step": 1105 + }, + { + "epoch": 0.4701397712833545, + "grad_norm": 1.2108459211634035, + "learning_rate": 1.2759217027621507e-05, + "loss": 0.4445, + "mean_token_accuracy": 0.8313823521137238, + "step": 1110 + }, + { + "epoch": 0.4722575180008471, + "grad_norm": 1.1385271166035913, + "learning_rate": 1.2688058502039416e-05, + "loss": 0.4724, + "mean_token_accuracy": 0.8208224922418594, + "step": 1115 + }, + { + "epoch": 0.4743752647183397, + "grad_norm": 1.1608922255857643, + "learning_rate": 1.261675295948786e-05, + "loss": 0.4402, + "mean_token_accuracy": 0.8260656505823135, + "step": 1120 + }, + { + "epoch": 0.4764930114358323, + "grad_norm": 1.2001870807148136, + "learning_rate": 1.2545304299853977e-05, + "loss": 0.4676, + "mean_token_accuracy": 0.8217555999755859, + "step": 1125 + }, + { + "epoch": 0.4786107581533249, + "grad_norm": 1.099496727008847, + "learning_rate": 1.2473716430852353e-05, + "loss": 0.436, + "mean_token_accuracy": 0.8312188684940338, + "step": 1130 + }, + { + "epoch": 0.4807285048708175, + "grad_norm": 2.032998570634967, + "learning_rate": 1.2401993267811293e-05, + "loss": 0.4317, + "mean_token_accuracy": 0.8295620054006576, + "step": 1135 + }, + { + "epoch": 0.48284625158831, + "grad_norm": 1.1812725212971202, + "learning_rate": 1.2330138733458693e-05, + "loss": 0.4156, + "mean_token_accuracy": 0.8353513538837433, + "step": 1140 + }, + { + "epoch": 0.4849639983058026, + "grad_norm": 1.138821301405385, + "learning_rate": 1.2258156757707496e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8284595161676407, + "step": 1145 + }, + { + "epoch": 0.4870817450232952, + "grad_norm": 1.039456646381961, + "learning_rate": 1.2186051277440739e-05, + "loss": 0.4281, + "mean_token_accuracy": 0.8340547412633896, + "step": 1150 + }, + { + "epoch": 0.4891994917407878, + "grad_norm": 1.0935441587184827, + "learning_rate": 1.2113826236296245e-05, + "loss": 0.4368, + "mean_token_accuracy": 0.8294982463121414, + "step": 1155 + }, + { + "epoch": 0.4913172384582804, + "grad_norm": 1.0601849025025707, + "learning_rate": 1.2041485584450945e-05, + "loss": 0.4496, + "mean_token_accuracy": 0.8288684636354446, + "step": 1160 + }, + { + "epoch": 0.493434985175773, + "grad_norm": 1.1432826242197904, + "learning_rate": 1.1969033278404816e-05, + "loss": 0.472, + "mean_token_accuracy": 0.8184500396251678, + "step": 1165 + }, + { + "epoch": 0.49555273189326554, + "grad_norm": 1.178255399480397, + "learning_rate": 1.1896473280764498e-05, + "loss": 0.453, + "mean_token_accuracy": 0.82464899122715, + "step": 1170 + }, + { + "epoch": 0.49767047861075814, + "grad_norm": 1.2123556499205794, + "learning_rate": 1.1823809560026558e-05, + "loss": 0.442, + "mean_token_accuracy": 0.8262520909309388, + "step": 1175 + }, + { + "epoch": 0.49978822532825073, + "grad_norm": 1.490671459887953, + "learning_rate": 1.175104609036047e-05, + "loss": 0.4493, + "mean_token_accuracy": 0.8295370072126389, + "step": 1180 + }, + { + "epoch": 0.5019059720457433, + "grad_norm": 3.5058816478434993, + "learning_rate": 1.1678186851391218e-05, + "loss": 0.4593, + "mean_token_accuracy": 0.8269213020801545, + "step": 1185 + }, + { + "epoch": 0.5040237187632359, + "grad_norm": 1.1384716073513477, + "learning_rate": 1.1605235827981673e-05, + "loss": 0.4463, + "mean_token_accuracy": 0.8314786165952682, + "step": 1190 + }, + { + "epoch": 0.5061414654807285, + "grad_norm": 1.1752572701433124, + "learning_rate": 1.1532197010014636e-05, + "loss": 0.4453, + "mean_token_accuracy": 0.8288865953683853, + "step": 1195 + }, + { + "epoch": 0.5082592121982211, + "grad_norm": 1.0006379736398943, + "learning_rate": 1.1459074392174619e-05, + "loss": 0.4293, + "mean_token_accuracy": 0.8350226402282714, + "step": 1200 + }, + { + "epoch": 0.5103769589157137, + "grad_norm": 1.1784455736187447, + "learning_rate": 1.138587197372937e-05, + "loss": 0.4612, + "mean_token_accuracy": 0.8215854614973068, + "step": 1205 + }, + { + "epoch": 0.5124947056332063, + "grad_norm": 1.1048766566547503, + "learning_rate": 1.1312593758311143e-05, + "loss": 0.4279, + "mean_token_accuracy": 0.8407860666513443, + "step": 1210 + }, + { + "epoch": 0.5146124523506989, + "grad_norm": 1.0718700385713946, + "learning_rate": 1.1239243753697728e-05, + "loss": 0.4288, + "mean_token_accuracy": 0.8378984898328781, + "step": 1215 + }, + { + "epoch": 0.5167301990681914, + "grad_norm": 1.558568433227081, + "learning_rate": 1.1165825971593251e-05, + "loss": 0.4678, + "mean_token_accuracy": 0.825000548362732, + "step": 1220 + }, + { + "epoch": 0.518847945785684, + "grad_norm": 1.082392246698731, + "learning_rate": 1.1092344427408767e-05, + "loss": 0.4276, + "mean_token_accuracy": 0.8359992414712906, + "step": 1225 + }, + { + "epoch": 0.5209656925031766, + "grad_norm": 1.256334909576375, + "learning_rate": 1.1018803140042651e-05, + "loss": 0.4633, + "mean_token_accuracy": 0.8229638338088989, + "step": 1230 + }, + { + "epoch": 0.5230834392206692, + "grad_norm": 1.303814596864245, + "learning_rate": 1.0945206131660787e-05, + "loss": 0.469, + "mean_token_accuracy": 0.8193328499794006, + "step": 1235 + }, + { + "epoch": 0.5252011859381618, + "grad_norm": 1.0507039996160834, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.4414, + "mean_token_accuracy": 0.8317544460296631, + "step": 1240 + }, + { + "epoch": 0.5273189326556544, + "grad_norm": 1.015866344156703, + "learning_rate": 1.0797861055530832e-05, + "loss": 0.428, + "mean_token_accuracy": 0.8305379122495651, + "step": 1245 + }, + { + "epoch": 0.529436679373147, + "grad_norm": 1.1624992956977676, + "learning_rate": 1.07241210464714e-05, + "loss": 0.467, + "mean_token_accuracy": 0.820591053366661, + "step": 1250 + }, + { + "epoch": 0.5315544260906395, + "grad_norm": 1.2782647412686758, + "learning_rate": 1.0650341433332778e-05, + "loss": 0.4689, + "mean_token_accuracy": 0.8219984292984008, + "step": 1255 + }, + { + "epoch": 0.5336721728081322, + "grad_norm": 1.1784870838731618, + "learning_rate": 1.0576526251315515e-05, + "loss": 0.4596, + "mean_token_accuracy": 0.8260756641626358, + "step": 1260 + }, + { + "epoch": 0.5357899195256247, + "grad_norm": 1.1204805080469906, + "learning_rate": 1.0502679537565507e-05, + "loss": 0.442, + "mean_token_accuracy": 0.8296466141939163, + "step": 1265 + }, + { + "epoch": 0.5379076662431174, + "grad_norm": 1.0718296420595828, + "learning_rate": 1.0428805330953209e-05, + "loss": 0.4215, + "mean_token_accuracy": 0.8308669030666351, + "step": 1270 + }, + { + "epoch": 0.5400254129606099, + "grad_norm": 1.1125024136410944, + "learning_rate": 1.0354907671852733e-05, + "loss": 0.4363, + "mean_token_accuracy": 0.8332655102014541, + "step": 1275 + }, + { + "epoch": 0.5421431596781024, + "grad_norm": 1.090167844275342, + "learning_rate": 1.0280990601920863e-05, + "loss": 0.4435, + "mean_token_accuracy": 0.8282716870307922, + "step": 1280 + }, + { + "epoch": 0.5442609063955951, + "grad_norm": 1.0290238619990948, + "learning_rate": 1.0207058163876021e-05, + "loss": 0.4413, + "mean_token_accuracy": 0.8311887979507446, + "step": 1285 + }, + { + "epoch": 0.5463786531130876, + "grad_norm": 1.0778232888370207, + "learning_rate": 1.013311440127714e-05, + "loss": 0.4386, + "mean_token_accuracy": 0.8266764581203461, + "step": 1290 + }, + { + "epoch": 0.5484963998305803, + "grad_norm": 1.1219731141973122, + "learning_rate": 1.0059163358302537e-05, + "loss": 0.4103, + "mean_token_accuracy": 0.8391000181436539, + "step": 1295 + }, + { + "epoch": 0.5506141465480728, + "grad_norm": 1.1468466517999107, + "learning_rate": 9.9852090795287e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8361193478107453, + "step": 1300 + }, + { + "epoch": 0.5527318932655655, + "grad_norm": 1.0284132663014267, + "learning_rate": 9.911255609709089e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8269284754991532, + "step": 1305 + }, + { + "epoch": 0.554849639983058, + "grad_norm": 1.0310999165822667, + "learning_rate": 9.83730699355294e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.835135304927826, + "step": 1310 + }, + { + "epoch": 0.5569673867005506, + "grad_norm": 1.2728900066425748, + "learning_rate": 9.76336727550401e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8267913639545441, + "step": 1315 + }, + { + "epoch": 0.5590851334180432, + "grad_norm": 1.2269899407592741, + "learning_rate": 9.689440499519395e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8314703017473221, + "step": 1320 + }, + { + "epoch": 0.5612028801355358, + "grad_norm": 1.1418757049837882, + "learning_rate": 9.615530708848373e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8340400338172913, + "step": 1325 + }, + { + "epoch": 0.5633206268530284, + "grad_norm": 1.1108149486798655, + "learning_rate": 9.541641945811233e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8232677519321442, + "step": 1330 + }, + { + "epoch": 0.565438373570521, + "grad_norm": 1.1088127297572268, + "learning_rate": 9.467778251578217e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8236530691385269, + "step": 1335 + }, + { + "epoch": 0.5675561202880135, + "grad_norm": 0.9179664771961787, + "learning_rate": 9.393943665948478e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8244054973125458, + "step": 1340 + }, + { + "epoch": 0.5696738670055062, + "grad_norm": 1.1777867866273308, + "learning_rate": 9.320142227129158e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8331925332546234, + "step": 1345 + }, + { + "epoch": 0.5717916137229987, + "grad_norm": 1.0020743360016087, + "learning_rate": 9.246377971514504e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8360674440860748, + "step": 1350 + }, + { + "epoch": 0.5739093604404913, + "grad_norm": 1.346066080223308, + "learning_rate": 9.172654933465114e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8250635206699372, + "step": 1355 + }, + { + "epoch": 0.5760271071579839, + "grad_norm": 1.3221207747875352, + "learning_rate": 9.0989771450873e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8357968628406525, + "step": 1360 + }, + { + "epoch": 0.5781448538754765, + "grad_norm": 1.1501989319658534, + "learning_rate": 9.025348636012537e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8290417343378067, + "step": 1365 + }, + { + "epoch": 0.5802626005929691, + "grad_norm": 1.1694331116554113, + "learning_rate": 8.951773433177095e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8303040146827698, + "step": 1370 + }, + { + "epoch": 0.5823803473104616, + "grad_norm": 1.2089472872967426, + "learning_rate": 8.878255560601781e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8339911371469497, + "step": 1375 + }, + { + "epoch": 0.5844980940279543, + "grad_norm": 1.1555334960481487, + "learning_rate": 8.804799039171863e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8346673488616944, + "step": 1380 + }, + { + "epoch": 0.5866158407454468, + "grad_norm": 0.9976941601020334, + "learning_rate": 8.731407886417155e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8272438108921051, + "step": 1385 + }, + { + "epoch": 0.5887335874629395, + "grad_norm": 1.0977726966561636, + "learning_rate": 8.658086116292283e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8334219962358475, + "step": 1390 + }, + { + "epoch": 0.590851334180432, + "grad_norm": 2.0194878160007987, + "learning_rate": 8.584837738957155e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8283408343791961, + "step": 1395 + }, + { + "epoch": 0.5929690808979247, + "grad_norm": 1.2186719145281468, + "learning_rate": 8.511666760557638e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8232256740331649, + "step": 1400 + }, + { + "epoch": 0.5950868276154172, + "grad_norm": 1.1198588684752515, + "learning_rate": 8.438577183006448e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8324928849935531, + "step": 1405 + }, + { + "epoch": 0.5972045743329097, + "grad_norm": 1.1215071963961742, + "learning_rate": 8.36557300376427e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8286356210708619, + "step": 1410 + }, + { + "epoch": 0.5993223210504024, + "grad_norm": 1.107475266800191, + "learning_rate": 8.292658215621139e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8313880443572998, + "step": 1415 + }, + { + "epoch": 0.6014400677678949, + "grad_norm": 1.1686631557802003, + "learning_rate": 8.219836806478049e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8312123149633408, + "step": 1420 + }, + { + "epoch": 0.6035578144853876, + "grad_norm": 1.230978585871069, + "learning_rate": 8.147112759128859e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8231993585824966, + "step": 1425 + }, + { + "epoch": 0.6056755612028801, + "grad_norm": 1.0717890273842352, + "learning_rate": 8.074490051042447e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8321529895067215, + "step": 1430 + }, + { + "epoch": 0.6077933079203727, + "grad_norm": 1.085108371368418, + "learning_rate": 8.001972654145194e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8277548223733902, + "step": 1435 + }, + { + "epoch": 0.6099110546378653, + "grad_norm": 1.2119593900205077, + "learning_rate": 7.929564534603722e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8255878984928131, + "step": 1440 + }, + { + "epoch": 0.6120288013553579, + "grad_norm": 1.1055437345283827, + "learning_rate": 7.857269652607995e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8275179982185363, + "step": 1445 + }, + { + "epoch": 0.6141465480728505, + "grad_norm": 1.1275451956189597, + "learning_rate": 7.78509196215472e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8301453530788422, + "step": 1450 + }, + { + "epoch": 0.6162642947903431, + "grad_norm": 1.2886494426253579, + "learning_rate": 7.713035410831086e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8251194447278977, + "step": 1455 + }, + { + "epoch": 0.6183820415078357, + "grad_norm": 1.1109768793864798, + "learning_rate": 7.64110393959887e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8380070447921752, + "step": 1460 + }, + { + "epoch": 0.6204997882253283, + "grad_norm": 1.0182035864318235, + "learning_rate": 7.569301482578885e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8316156834363937, + "step": 1465 + }, + { + "epoch": 0.6226175349428208, + "grad_norm": 1.2074345207100396, + "learning_rate": 7.497631966835828e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8231601238250732, + "step": 1470 + }, + { + "epoch": 0.6247352816603134, + "grad_norm": 0.991329003303421, + "learning_rate": 7.42609931216348e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8327670186758042, + "step": 1475 + }, + { + "epoch": 0.626853028377806, + "grad_norm": 1.38024365126256, + "learning_rate": 7.354707430870332e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8324557185173035, + "step": 1480 + }, + { + "epoch": 0.6289707750952986, + "grad_norm": 1.2263457500699402, + "learning_rate": 7.283460227565614e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8289420217275619, + "step": 1485 + }, + { + "epoch": 0.6310885218127912, + "grad_norm": 1.1601375730316865, + "learning_rate": 7.2123615989457364e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.832300814986229, + "step": 1490 + }, + { + "epoch": 0.6332062685302838, + "grad_norm": 1.3029839142463893, + "learning_rate": 7.141415433581169e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8393772184848786, + "step": 1495 + }, + { + "epoch": 0.6353240152477764, + "grad_norm": 1.0421344337402514, + "learning_rate": 7.070625611703762e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8257811456918717, + "step": 1500 + }, + { + "epoch": 0.6374417619652689, + "grad_norm": 1.1352186472493642, + "learning_rate": 6.9999960049945406e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8368300348520279, + "step": 1505 + }, + { + "epoch": 0.6395595086827616, + "grad_norm": 0.9884985072070904, + "learning_rate": 6.929530476371935e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8349219173192978, + "step": 1510 + }, + { + "epoch": 0.6416772554002541, + "grad_norm": 1.7766008455284357, + "learning_rate": 6.859232879780515e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8374936401844024, + "step": 1515 + }, + { + "epoch": 0.6437950021177468, + "grad_norm": 1.012934970024209, + "learning_rate": 6.7891070599802045e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8239244252443314, + "step": 1520 + }, + { + "epoch": 0.6459127488352393, + "grad_norm": 0.9859441855867837, + "learning_rate": 6.719156852336015e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8353272944688797, + "step": 1525 + }, + { + "epoch": 0.6480304955527318, + "grad_norm": 1.261329902420831, + "learning_rate": 6.649386082608256e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8329044044017792, + "step": 1530 + }, + { + "epoch": 0.6501482422702245, + "grad_norm": 1.2457535519058567, + "learning_rate": 6.579798566743314e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8307075470685958, + "step": 1535 + }, + { + "epoch": 0.652265988987717, + "grad_norm": 1.213114456712863, + "learning_rate": 6.510398110664939e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8351607590913772, + "step": 1540 + }, + { + "epoch": 0.6543837357052097, + "grad_norm": 1.155264435257233, + "learning_rate": 6.441188510066092e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8374445289373398, + "step": 1545 + }, + { + "epoch": 0.6565014824227022, + "grad_norm": 1.1756119576548756, + "learning_rate": 6.372173550201346e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8390755444765091, + "step": 1550 + }, + { + "epoch": 0.6586192291401949, + "grad_norm": 1.0243897900651528, + "learning_rate": 6.303357005679858e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8277173846960068, + "step": 1555 + }, + { + "epoch": 0.6607369758576874, + "grad_norm": 1.0868676429874986, + "learning_rate": 6.234742640258938e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.827509269118309, + "step": 1560 + }, + { + "epoch": 0.66285472257518, + "grad_norm": 1.1792649536698685, + "learning_rate": 6.166334206638186e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8288001954555512, + "step": 1565 + }, + { + "epoch": 0.6649724692926726, + "grad_norm": 1.171894663481444, + "learning_rate": 6.0981354462542456e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8315492898225785, + "step": 1570 + }, + { + "epoch": 0.6670902160101652, + "grad_norm": 1.1333037764256397, + "learning_rate": 6.030150089076199e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8316318243741989, + "step": 1575 + }, + { + "epoch": 0.6692079627276578, + "grad_norm": 1.1892286300854609, + "learning_rate": 5.9623818534015275e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8352140128612519, + "step": 1580 + }, + { + "epoch": 0.6713257094451504, + "grad_norm": 4.250523219515856, + "learning_rate": 5.894834445652777e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8329778879880905, + "step": 1585 + }, + { + "epoch": 0.6734434561626429, + "grad_norm": 1.157008090047474, + "learning_rate": 5.827511560174835e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.832972839474678, + "step": 1590 + }, + { + "epoch": 0.6755612028801355, + "grad_norm": 1.1834078816860993, + "learning_rate": 5.7604168790328774e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8443128287792205, + "step": 1595 + }, + { + "epoch": 0.6776789495976281, + "grad_norm": 1.0766345733639675, + "learning_rate": 5.693554071810987e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8282081812620163, + "step": 1600 + }, + { + "epoch": 0.6797966963151207, + "grad_norm": 1.0314594529031804, + "learning_rate": 5.626926795411447e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8321157455444336, + "step": 1605 + }, + { + "epoch": 0.6819144430326133, + "grad_norm": 1.055274137880832, + "learning_rate": 5.560538693854751e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8316533505916596, + "step": 1610 + }, + { + "epoch": 0.6840321897501059, + "grad_norm": 1.1972782090907812, + "learning_rate": 5.494393398080292e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.834712353348732, + "step": 1615 + }, + { + "epoch": 0.6861499364675985, + "grad_norm": 1.0962501568970522, + "learning_rate": 5.428494525747769e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8248083680868149, + "step": 1620 + }, + { + "epoch": 0.688267683185091, + "grad_norm": 1.0751444988160856, + "learning_rate": 5.362845681039348e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8374727904796601, + "step": 1625 + }, + { + "epoch": 0.6903854299025837, + "grad_norm": 1.1471090324016462, + "learning_rate": 5.297450454462526e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8296476870775222, + "step": 1630 + }, + { + "epoch": 0.6925031766200762, + "grad_norm": 0.962534660265453, + "learning_rate": 5.23231242265375e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.83418510556221, + "step": 1635 + }, + { + "epoch": 0.6946209233375689, + "grad_norm": 1.1168651450432128, + "learning_rate": 5.167435148182824e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8372534781694412, + "step": 1640 + }, + { + "epoch": 0.6967386700550614, + "grad_norm": 1.2186341287706137, + "learning_rate": 5.102822179358037e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8409687280654907, + "step": 1645 + }, + { + "epoch": 0.6988564167725541, + "grad_norm": 0.9820636174800459, + "learning_rate": 5.0384770500321175e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8384972155094147, + "step": 1650 + }, + { + "epoch": 0.7009741634900466, + "grad_norm": 0.943830506781205, + "learning_rate": 4.97440327940895e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8365049093961716, + "step": 1655 + }, + { + "epoch": 0.7030919102075391, + "grad_norm": 1.0574783345670844, + "learning_rate": 4.910604371851091e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8333552926778793, + "step": 1660 + }, + { + "epoch": 0.7052096569250318, + "grad_norm": 1.103380699456734, + "learning_rate": 4.847083816688123e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8425119102001191, + "step": 1665 + }, + { + "epoch": 0.7073274036425243, + "grad_norm": 1.117253769501395, + "learning_rate": 4.783845088025807e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8330845534801483, + "step": 1670 + }, + { + "epoch": 0.709445150360017, + "grad_norm": 1.4108563780024128, + "learning_rate": 4.7208916445560625e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8379091322422028, + "step": 1675 + }, + { + "epoch": 0.7115628970775095, + "grad_norm": 1.031565575748758, + "learning_rate": 4.658226929367826e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8240681082010269, + "step": 1680 + }, + { + "epoch": 0.7136806437950021, + "grad_norm": 1.2248996065912452, + "learning_rate": 4.595854369758727e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8363937050104141, + "step": 1685 + }, + { + "epoch": 0.7157983905124947, + "grad_norm": 1.1049025661918381, + "learning_rate": 4.5337773770476245e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8340339243412018, + "step": 1690 + }, + { + "epoch": 0.7179161372299873, + "grad_norm": 1.1244170950870136, + "learning_rate": 4.4719993463880695e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8225684702396393, + "step": 1695 + }, + { + "epoch": 0.7200338839474799, + "grad_norm": 1.1969285633316296, + "learning_rate": 4.410523656582576e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8440569192171097, + "step": 1700 + }, + { + "epoch": 0.7221516306649725, + "grad_norm": 1.122866308313561, + "learning_rate": 4.349353669897856e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.837623131275177, + "step": 1705 + }, + { + "epoch": 0.7242693773824651, + "grad_norm": 1.0173115464088704, + "learning_rate": 4.288492731880917e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8388867497444152, + "step": 1710 + }, + { + "epoch": 0.7263871240999576, + "grad_norm": 1.1018457774189827, + "learning_rate": 4.227944171176072e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8392677456140518, + "step": 1715 + }, + { + "epoch": 0.7285048708174502, + "grad_norm": 1.2471156860459571, + "learning_rate": 4.167711299342909e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8256678134202957, + "step": 1720 + }, + { + "epoch": 0.7306226175349428, + "grad_norm": 1.1273568017592417, + "learning_rate": 4.107797410675166e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8386416286230087, + "step": 1725 + }, + { + "epoch": 0.7327403642524354, + "grad_norm": 1.20918067568615, + "learning_rate": 4.048205782020544e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8220532357692718, + "step": 1730 + }, + { + "epoch": 0.734858110969928, + "grad_norm": 1.1573583276355073, + "learning_rate": 3.988939672601509e-06, + "loss": 0.395, + "mean_token_accuracy": 0.844212406873703, + "step": 1735 + }, + { + "epoch": 0.7369758576874206, + "grad_norm": 1.1516374922245958, + "learning_rate": 3.930002323837026e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8371291518211365, + "step": 1740 + }, + { + "epoch": 0.7390936044049131, + "grad_norm": 1.274643963255776, + "learning_rate": 3.871396959165267e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8348165363073349, + "step": 1745 + }, + { + "epoch": 0.7412113511224058, + "grad_norm": 1.025583507042276, + "learning_rate": 3.8131267838673336e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8343986541032791, + "step": 1750 + }, + { + "epoch": 0.7433290978398983, + "grad_norm": 1.1299748085754966, + "learning_rate": 3.755194984891943e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8430469453334808, + "step": 1755 + }, + { + "epoch": 0.745446844557391, + "grad_norm": 1.0603027089656643, + "learning_rate": 3.6976047306811115e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8382641762495041, + "step": 1760 + }, + { + "epoch": 0.7475645912748835, + "grad_norm": 1.1281590494510496, + "learning_rate": 3.6403591709968924e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8320927768945694, + "step": 1765 + }, + { + "epoch": 0.7496823379923762, + "grad_norm": 1.0367839611389602, + "learning_rate": 3.5834614367490706e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.835366889834404, + "step": 1770 + }, + { + "epoch": 0.7518000847098687, + "grad_norm": 1.0958827736818129, + "learning_rate": 3.526914639823973e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8301591634750366, + "step": 1775 + }, + { + "epoch": 0.7539178314273612, + "grad_norm": 1.0559223618431266, + "learning_rate": 3.4707218729142224e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8316712707281113, + "step": 1780 + }, + { + "epoch": 0.7560355781448539, + "grad_norm": 1.0792688197107765, + "learning_rate": 3.414886209349615e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.835688841342926, + "step": 1785 + }, + { + "epoch": 0.7581533248623464, + "grad_norm": 1.1979681287726258, + "learning_rate": 3.3594107029290347e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8371979027986527, + "step": 1790 + }, + { + "epoch": 0.7602710715798391, + "grad_norm": 1.1468783022113433, + "learning_rate": 3.304298387753426e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8341523915529251, + "step": 1795 + }, + { + "epoch": 0.7623888182973316, + "grad_norm": 1.142335742385377, + "learning_rate": 3.2495522780598442e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8298469454050064, + "step": 1800 + }, + { + "epoch": 0.7645065650148243, + "grad_norm": 1.1968773332651736, + "learning_rate": 3.1951753680566143e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8313175171613694, + "step": 1805 + }, + { + "epoch": 0.7666243117323168, + "grad_norm": 1.0804618708583653, + "learning_rate": 3.141170631759558e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8373444229364395, + "step": 1810 + }, + { + "epoch": 0.7687420584498094, + "grad_norm": 1.0872538790077677, + "learning_rate": 3.087541022829347e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8371105402708053, + "step": 1815 + }, + { + "epoch": 0.770859805167302, + "grad_norm": 0.9905135006363225, + "learning_rate": 3.034289474409943e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8365035742521286, + "step": 1820 + }, + { + "epoch": 0.7729775518847946, + "grad_norm": 1.0890914888672922, + "learning_rate": 2.981418898968186e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.838862606883049, + "step": 1825 + }, + { + "epoch": 0.7750952986022872, + "grad_norm": 1.1417209565486737, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.833648070693016, + "step": 1830 + }, + { + "epoch": 0.7772130453197797, + "grad_norm": 1.1684616910176908, + "learning_rate": 2.8768322125448265e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.83056038916111, + "step": 1835 + }, + { + "epoch": 0.7793307920372723, + "grad_norm": 1.1845681597767028, + "learning_rate": 2.825121821683391e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8353413581848145, + "step": 1840 + }, + { + "epoch": 0.7814485387547649, + "grad_norm": 1.1732126933903428, + "learning_rate": 2.7738038437271288e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.842677703499794, + "step": 1845 + }, + { + "epoch": 0.7835662854722575, + "grad_norm": 1.0292583187860371, + "learning_rate": 2.7228810853908406e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8447476714849472, + "step": 1850 + }, + { + "epoch": 0.7856840321897501, + "grad_norm": 0.9892030702997285, + "learning_rate": 2.67235633177373e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8288900941610337, + "step": 1855 + }, + { + "epoch": 0.7878017789072427, + "grad_norm": 1.0050687986582967, + "learning_rate": 2.6222323462070897e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.828187745809555, + "step": 1860 + }, + { + "epoch": 0.7899195256247353, + "grad_norm": 1.1304197153376732, + "learning_rate": 2.572511870103149e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8425087302923202, + "step": 1865 + }, + { + "epoch": 0.7920372723422279, + "grad_norm": 1.0444576639344187, + "learning_rate": 2.5231976228051526e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8337043792009353, + "step": 1870 + }, + { + "epoch": 0.7941550190597204, + "grad_norm": 1.0875080317220023, + "learning_rate": 2.4742923014386154e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8368022799491882, + "step": 1875 + }, + { + "epoch": 0.7962727657772131, + "grad_norm": 1.1517129093084153, + "learning_rate": 2.4257985807638294e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8356128752231597, + "step": 1880 + }, + { + "epoch": 0.7983905124947056, + "grad_norm": 1.2213468844119533, + "learning_rate": 2.3777191130295673e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8373890697956086, + "step": 1885 + }, + { + "epoch": 0.8005082592121983, + "grad_norm": 1.1105462272187794, + "learning_rate": 2.330056527828013e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8282926619052887, + "step": 1890 + }, + { + "epoch": 0.8026260059296908, + "grad_norm": 1.1626653178571262, + "learning_rate": 2.282813431950952e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8333282887935638, + "step": 1895 + }, + { + "epoch": 0.8047437526471835, + "grad_norm": 1.1195581942328177, + "learning_rate": 2.235992409247214e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8319763302803039, + "step": 1900 + }, + { + "epoch": 0.806861499364676, + "grad_norm": 1.026868168904022, + "learning_rate": 2.1895960204813194e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8370046824216842, + "step": 1905 + }, + { + "epoch": 0.8089792460821685, + "grad_norm": 1.0639569641896143, + "learning_rate": 2.1436268031934602e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8297486454248428, + "step": 1910 + }, + { + "epoch": 0.8110969927996612, + "grad_norm": 1.0385740186847223, + "learning_rate": 2.098087271560687e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8370089381933212, + "step": 1915 + }, + { + "epoch": 0.8132147395171537, + "grad_norm": 1.1169845772777505, + "learning_rate": 2.0529799162594242e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.839673039317131, + "step": 1920 + }, + { + "epoch": 0.8153324862346464, + "grad_norm": 1.0745546751170598, + "learning_rate": 2.0083072043292406e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8379459470510483, + "step": 1925 + }, + { + "epoch": 0.8174502329521389, + "grad_norm": 1.206429363415916, + "learning_rate": 1.9640715790379084e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8345289677381516, + "step": 1930 + }, + { + "epoch": 0.8195679796696315, + "grad_norm": 1.0452292636568519, + "learning_rate": 1.920275459747796e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8368586808443069, + "step": 1935 + }, + { + "epoch": 0.8216857263871241, + "grad_norm": 1.0706189800647066, + "learning_rate": 1.8769212417835314e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8513321369886399, + "step": 1940 + }, + { + "epoch": 0.8238034731046167, + "grad_norm": 1.0974535637452612, + "learning_rate": 1.8340112963009993e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8337271898984909, + "step": 1945 + }, + { + "epoch": 0.8259212198221093, + "grad_norm": 1.0867209847341632, + "learning_rate": 1.7915479701576577e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8291646331548691, + "step": 1950 + }, + { + "epoch": 0.8280389665396019, + "grad_norm": 1.1993569416921062, + "learning_rate": 1.7495335857841855e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8385995358228684, + "step": 1955 + }, + { + "epoch": 0.8301567132570945, + "grad_norm": 1.1414883228473476, + "learning_rate": 1.7079704410574505e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8459228605031968, + "step": 1960 + }, + { + "epoch": 0.832274459974587, + "grad_norm": 1.048311577922366, + "learning_rate": 1.6668608091748495e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8357879251241684, + "step": 1965 + }, + { + "epoch": 0.8343922066920796, + "grad_norm": 1.0617438922962255, + "learning_rate": 1.6262069385299694e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8343731433153152, + "step": 1970 + }, + { + "epoch": 0.8365099534095722, + "grad_norm": 1.1279209328755353, + "learning_rate": 1.5860110525896143e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.835442116856575, + "step": 1975 + }, + { + "epoch": 0.8386277001270648, + "grad_norm": 0.9640338154076892, + "learning_rate": 1.5462753497722139e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8363285154104233, + "step": 1980 + }, + { + "epoch": 0.8407454468445574, + "grad_norm": 1.065476222817932, + "learning_rate": 1.5070020033275655e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8427035689353943, + "step": 1985 + }, + { + "epoch": 0.84286319356205, + "grad_norm": 1.055480105683973, + "learning_rate": 1.4681931612179901e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8340502351522445, + "step": 1990 + }, + { + "epoch": 0.8449809402795425, + "grad_norm": 1.0690985831761302, + "learning_rate": 1.4298509460008491e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8402904689311981, + "step": 1995 + }, + { + "epoch": 0.8470986869970352, + "grad_norm": 1.0063164183000968, + "learning_rate": 1.39197745471245e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8361636906862259, + "step": 2000 + }, + { + "epoch": 0.8492164337145277, + "grad_norm": 1.0247616494577987, + "learning_rate": 1.354574758753363e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8310322672128677, + "step": 2005 + }, + { + "epoch": 0.8513341804320204, + "grad_norm": 1.044860588344852, + "learning_rate": 1.3176449037751294e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8303707420825959, + "step": 2010 + }, + { + "epoch": 0.8534519271495129, + "grad_norm": 2.4537559889629694, + "learning_rate": 1.28118990956837e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.835821408033371, + "step": 2015 + }, + { + "epoch": 0.8555696738670056, + "grad_norm": 1.0972489520800874, + "learning_rate": 1.2452117699523303e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8460766285657882, + "step": 2020 + }, + { + "epoch": 0.8576874205844981, + "grad_norm": 1.2309045137234433, + "learning_rate": 1.2097124526658277e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8366678208112717, + "step": 2025 + }, + { + "epoch": 0.8598051673019906, + "grad_norm": 1.0849048269411365, + "learning_rate": 1.1746938992596257e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8296289384365082, + "step": 2030 + }, + { + "epoch": 0.8619229140194833, + "grad_norm": 0.989974221522167, + "learning_rate": 1.1401580249902566e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8379861056804657, + "step": 2035 + }, + { + "epoch": 0.8640406607369758, + "grad_norm": 1.0066596115748891, + "learning_rate": 1.1061067187152584e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8417060792446136, + "step": 2040 + }, + { + "epoch": 0.8661584074544685, + "grad_norm": 1.0526259070425423, + "learning_rate": 1.0725418427898792e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8398545056581497, + "step": 2045 + }, + { + "epoch": 0.868276154171961, + "grad_norm": 1.134470581551777, + "learning_rate": 1.0394652329652165e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8354752600193024, + "step": 2050 + }, + { + "epoch": 0.8703939008894537, + "grad_norm": 1.130864865166622, + "learning_rate": 1.0068786982878087e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8398678600788116, + "step": 2055 + }, + { + "epoch": 0.8725116476069462, + "grad_norm": 1.1500087879964977, + "learning_rate": 9.747840210007021e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.8322781622409821, + "step": 2060 + }, + { + "epoch": 0.8746293943244388, + "grad_norm": 0.9770307768209092, + "learning_rate": 9.43182956445976e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.8416966944932938, + "step": 2065 + }, + { + "epoch": 0.8767471410419314, + "grad_norm": 1.2583818143393242, + "learning_rate": 9.120772329687278e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8354076951742172, + "step": 2070 + }, + { + "epoch": 0.878864887759424, + "grad_norm": 1.0618576291439479, + "learning_rate": 8.814685518225552e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8308704495429993, + "step": 2075 + }, + { + "epoch": 0.8809826344769166, + "grad_norm": 1.1180367611245425, + "learning_rate": 8.513585870765118e-07, + "loss": 0.3907, + "mean_token_accuracy": 0.8452890306711197, + "step": 2080 + }, + { + "epoch": 0.8831003811944091, + "grad_norm": 1.230123212504081, + "learning_rate": 8.217489855235338e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.8392110764980316, + "step": 2085 + }, + { + "epoch": 0.8852181279119017, + "grad_norm": 1.1108948475484288, + "learning_rate": 7.926413665903931e-07, + "loss": 0.4151, + "mean_token_accuracy": 0.8380868971347809, + "step": 2090 + }, + { + "epoch": 0.8873358746293943, + "grad_norm": 1.098761542271965, + "learning_rate": 7.640373222491038e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8407029449939728, + "step": 2095 + }, + { + "epoch": 0.8894536213468869, + "grad_norm": 1.0940803341605705, + "learning_rate": 7.359384169298744e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8401619613170623, + "step": 2100 + }, + { + "epoch": 0.8915713680643795, + "grad_norm": 0.9066347453646844, + "learning_rate": 7.083461874355335e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8362819194793701, + "step": 2105 + }, + { + "epoch": 0.8936891147818721, + "grad_norm": 1.0448023766882066, + "learning_rate": 6.81262142857475e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.8459620922803879, + "step": 2110 + }, + { + "epoch": 0.8958068614993647, + "grad_norm": 1.0611643496346475, + "learning_rate": 6.546877644931315e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8312031596899032, + "step": 2115 + }, + { + "epoch": 0.8979246082168573, + "grad_norm": 1.1224663985096108, + "learning_rate": 6.286245057649542e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.8465497404336929, + "step": 2120 + }, + { + "epoch": 0.9000423549343498, + "grad_norm": 1.0832056476567533, + "learning_rate": 6.030737921409169e-07, + "loss": 0.3867, + "mean_token_accuracy": 0.8440623044967651, + "step": 2125 + }, + { + "epoch": 0.9021601016518425, + "grad_norm": 1.0523110523954844, + "learning_rate": 5.7803702105656e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8366563141345977, + "step": 2130 + }, + { + "epoch": 0.904277848369335, + "grad_norm": 1.0105232913792406, + "learning_rate": 5.535155618385612e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8335390537977219, + "step": 2135 + }, + { + "epoch": 0.9063955950868277, + "grad_norm": 1.1129917485868344, + "learning_rate": 5.295107556298329e-07, + "loss": 0.3928, + "mean_token_accuracy": 0.8431670844554902, + "step": 2140 + }, + { + "epoch": 0.9085133418043202, + "grad_norm": 1.145719574659648, + "learning_rate": 5.060239153161872e-07, + "loss": 0.4019, + "mean_token_accuracy": 0.8419764310121536, + "step": 2145 + }, + { + "epoch": 0.9106310885218127, + "grad_norm": 1.443628282269306, + "learning_rate": 4.830563254545207e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8361739784479141, + "step": 2150 + }, + { + "epoch": 0.9127488352393054, + "grad_norm": 1.1691030241329559, + "learning_rate": 4.6060924220255654e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8305665761232376, + "step": 2155 + }, + { + "epoch": 0.9148665819567979, + "grad_norm": 1.2424085660240223, + "learning_rate": 4.386838932501547e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8358988225460052, + "step": 2160 + }, + { + "epoch": 0.9169843286742906, + "grad_norm": 1.0258262640063769, + "learning_rate": 4.172814777521483e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8366893321275711, + "step": 2165 + }, + { + "epoch": 0.9191020753917831, + "grad_norm": 1.0932401792673323, + "learning_rate": 3.9640316626277654e-07, + "loss": 0.4172, + "mean_token_accuracy": 0.836585283279419, + "step": 2170 + }, + { + "epoch": 0.9212198221092758, + "grad_norm": 1.0881178279493329, + "learning_rate": 3.7605010067165216e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8352493315935134, + "step": 2175 + }, + { + "epoch": 0.9233375688267683, + "grad_norm": 1.057750886441079, + "learning_rate": 3.562233941413096e-07, + "loss": 0.3975, + "mean_token_accuracy": 0.8412194460630417, + "step": 2180 + }, + { + "epoch": 0.9254553155442609, + "grad_norm": 1.1056774030421723, + "learning_rate": 3.3692413104633226e-07, + "loss": 0.3976, + "mean_token_accuracy": 0.840697067975998, + "step": 2185 + }, + { + "epoch": 0.9275730622617535, + "grad_norm": 1.163101779598673, + "learning_rate": 3.1815336691403464e-07, + "loss": 0.3751, + "mean_token_accuracy": 0.8496327966451644, + "step": 2190 + }, + { + "epoch": 0.929690808979246, + "grad_norm": 0.9755793569303719, + "learning_rate": 2.999121283667339e-07, + "loss": 0.4079, + "mean_token_accuracy": 0.8418219208717346, + "step": 2195 + }, + { + "epoch": 0.9318085556967387, + "grad_norm": 1.021358583461123, + "learning_rate": 2.8220141306561034e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8352805793285369, + "step": 2200 + }, + { + "epoch": 0.9339263024142312, + "grad_norm": 1.0396837778560488, + "learning_rate": 2.6502218965613335e-07, + "loss": 0.4225, + "mean_token_accuracy": 0.8338442891836166, + "step": 2205 + }, + { + "epoch": 0.9360440491317239, + "grad_norm": 1.1742052357618658, + "learning_rate": 2.483753977150882e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.8387827515602112, + "step": 2210 + }, + { + "epoch": 0.9381617958492164, + "grad_norm": 1.0739901995137444, + "learning_rate": 2.3226194769918497e-07, + "loss": 0.4041, + "mean_token_accuracy": 0.837730023264885, + "step": 2215 + }, + { + "epoch": 0.940279542566709, + "grad_norm": 1.0246012489566791, + "learning_rate": 2.1668272089526377e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8399739652872086, + "step": 2220 + }, + { + "epoch": 0.9423972892842016, + "grad_norm": 1.0463273467785923, + "learning_rate": 2.0163856937210236e-07, + "loss": 0.4245, + "mean_token_accuracy": 0.8379955619573594, + "step": 2225 + }, + { + "epoch": 0.9445150360016942, + "grad_norm": 1.13493837929642, + "learning_rate": 1.8713031593380116e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8368137925863266, + "step": 2230 + }, + { + "epoch": 0.9466327827191868, + "grad_norm": 1.1326422720007092, + "learning_rate": 1.731587540747903e-07, + "loss": 0.4164, + "mean_token_accuracy": 0.839913833141327, + "step": 2235 + }, + { + "epoch": 0.9487505294366794, + "grad_norm": 1.1288581153860058, + "learning_rate": 1.597246479364345e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8263521671295166, + "step": 2240 + }, + { + "epoch": 0.9508682761541719, + "grad_norm": 1.0618048867408285, + "learning_rate": 1.4682873226523064e-07, + "loss": 0.4116, + "mean_token_accuracy": 0.8380947977304458, + "step": 2245 + }, + { + "epoch": 0.9529860228716646, + "grad_norm": 1.0089748524009554, + "learning_rate": 1.3447171237262912e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.8311914891004563, + "step": 2250 + }, + { + "epoch": 0.9551037695891571, + "grad_norm": 1.1718653607363838, + "learning_rate": 1.2265426409645676e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8367854833602906, + "step": 2255 + }, + { + "epoch": 0.9572215163066498, + "grad_norm": 1.009709808393184, + "learning_rate": 1.1137703376395304e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8332184463739395, + "step": 2260 + }, + { + "epoch": 0.9593392630241423, + "grad_norm": 1.0456672123180084, + "learning_rate": 1.0064063815642178e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8407183200120926, + "step": 2265 + }, + { + "epoch": 0.961457009741635, + "grad_norm": 1.4564232381895734, + "learning_rate": 9.044566447549697e-08, + "loss": 0.3935, + "mean_token_accuracy": 0.843877837061882, + "step": 2270 + }, + { + "epoch": 0.9635747564591275, + "grad_norm": 1.006395133879737, + "learning_rate": 8.079267031102844e-08, + "loss": 0.4379, + "mean_token_accuracy": 0.8322035163640976, + "step": 2275 + }, + { + "epoch": 0.96569250317662, + "grad_norm": 1.0451381392295622, + "learning_rate": 7.16821836105841e-08, + "loss": 0.3998, + "mean_token_accuracy": 0.8473025262355804, + "step": 2280 + }, + { + "epoch": 0.9678102498941127, + "grad_norm": 1.0472971428422386, + "learning_rate": 6.311470265057518e-08, + "loss": 0.423, + "mean_token_accuracy": 0.8354467749595642, + "step": 2285 + }, + { + "epoch": 0.9699279966116052, + "grad_norm": 1.1605199240395647, + "learning_rate": 5.5090696009004744e-08, + "loss": 0.4257, + "mean_token_accuracy": 0.8360013753175736, + "step": 2290 + }, + { + "epoch": 0.9720457433290979, + "grad_norm": 0.9898182837486158, + "learning_rate": 4.761060253984151e-08, + "loss": 0.4204, + "mean_token_accuracy": 0.8367842882871628, + "step": 2295 + }, + { + "epoch": 0.9741634900465904, + "grad_norm": 1.088987157568079, + "learning_rate": 4.067483134901573e-08, + "loss": 0.4134, + "mean_token_accuracy": 0.83856400847435, + "step": 2300 + }, + { + "epoch": 0.976281236764083, + "grad_norm": 1.0295706774122013, + "learning_rate": 3.4283761772042623e-08, + "loss": 0.4224, + "mean_token_accuracy": 0.8354990780353546, + "step": 2305 + }, + { + "epoch": 0.9783989834815756, + "grad_norm": 1.0694735478921555, + "learning_rate": 2.84377433532812e-08, + "loss": 0.4305, + "mean_token_accuracy": 0.8316824287176132, + "step": 2310 + }, + { + "epoch": 0.9805167301990682, + "grad_norm": 1.0827744380795652, + "learning_rate": 2.3137095826809564e-08, + "loss": 0.402, + "mean_token_accuracy": 0.8404913783073426, + "step": 2315 + }, + { + "epoch": 0.9826344769165608, + "grad_norm": 1.0960538876783272, + "learning_rate": 1.8382109098944444e-08, + "loss": 0.4352, + "mean_token_accuracy": 0.8338410943746567, + "step": 2320 + }, + { + "epoch": 0.9847522236340533, + "grad_norm": 1.1206569874331853, + "learning_rate": 1.4173043232380557e-08, + "loss": 0.4076, + "mean_token_accuracy": 0.8435803085565567, + "step": 2325 + }, + { + "epoch": 0.986869970351546, + "grad_norm": 1.0708342349134583, + "learning_rate": 1.0510128431968635e-08, + "loss": 0.4041, + "mean_token_accuracy": 0.8435177773237228, + "step": 2330 + }, + { + "epoch": 0.9889877170690385, + "grad_norm": 1.0195754299190762, + "learning_rate": 7.3935650321255156e-09, + "loss": 0.4017, + "mean_token_accuracy": 0.8434190511703491, + "step": 2335 + }, + { + "epoch": 0.9911054637865311, + "grad_norm": 1.043562362927536, + "learning_rate": 4.823523485879556e-09, + "loss": 0.4441, + "mean_token_accuracy": 0.8331767469644547, + "step": 2340 + }, + { + "epoch": 0.9932232105040237, + "grad_norm": 0.9692528305370003, + "learning_rate": 2.800144355540324e-09, + "loss": 0.4112, + "mean_token_accuracy": 0.836205193400383, + "step": 2345 + }, + { + "epoch": 0.9953409572215163, + "grad_norm": 0.9767634882260735, + "learning_rate": 1.32353830502141e-09, + "loss": 0.4233, + "mean_token_accuracy": 0.8327444672584534, + "step": 2350 + }, + { + "epoch": 0.9974587039390089, + "grad_norm": 1.1074445733229596, + "learning_rate": 3.9378609377971335e-10, + "loss": 0.3959, + "mean_token_accuracy": 0.8446923106908798, + "step": 2355 + }, + { + "epoch": 0.9995764506565015, + "grad_norm": 1.0245959330198788, + "learning_rate": 1.0938572402308111e-11, + "loss": 0.4106, + "mean_token_accuracy": 0.8339618355035782, + "step": 2360 + }, + { + "epoch": 1.0, + "mean_token_accuracy": 0.890313521027565, + "step": 2361, + "total_flos": 451385831948288.0, + "train_loss": 0.48239256052181206, + "train_runtime": 37146.7848, + "train_samples_per_second": 1.017, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 5, + "max_steps": 2361, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 451385831948288.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}