diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4606 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999768695209678, + "eval_steps": 100, + "global_step": 2702, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018504383225776606, + "grad_norm": 3.2766246795654297, + "learning_rate": 3.690036900369004e-07, + "loss": 1.3084, + "mean_token_accuracy": 0.6738631311804741, + "step": 5 + }, + { + "epoch": 0.003700876645155321, + "grad_norm": 3.404125452041626, + "learning_rate": 7.380073800738008e-07, + "loss": 1.2822, + "mean_token_accuracy": 0.6823436709727585, + "step": 10 + }, + { + "epoch": 0.005551314967732981, + "grad_norm": 3.664184093475342, + "learning_rate": 1.1070110701107011e-06, + "loss": 1.3204, + "mean_token_accuracy": 0.6697103577274166, + "step": 15 + }, + { + "epoch": 0.007401753290310642, + "grad_norm": 2.8933751583099365, + "learning_rate": 1.4760147601476015e-06, + "loss": 1.3266, + "mean_token_accuracy": 0.6671803351578223, + "step": 20 + }, + { + "epoch": 0.009252191612888302, + "grad_norm": 2.534046173095703, + "learning_rate": 1.845018450184502e-06, + "loss": 1.2231, + "mean_token_accuracy": 0.6901344745535501, + "step": 25 + }, + { + "epoch": 0.011102629935465963, + "grad_norm": 2.2067267894744873, + "learning_rate": 2.2140221402214023e-06, + "loss": 1.2715, + "mean_token_accuracy": 0.6754532741145192, + "step": 30 + }, + { + "epoch": 0.012953068258043625, + "grad_norm": 1.7358348369598389, + "learning_rate": 2.5830258302583027e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6898413280984148, + "step": 35 + }, + { + "epoch": 0.014803506580621285, + "grad_norm": 1.7121713161468506, + "learning_rate": 2.952029520295203e-06, + "loss": 1.1642, + "mean_token_accuracy": 0.6958384680047558, + "step": 40 + }, + { + "epoch": 0.016653944903198947, + "grad_norm": 1.55677330493927, + "learning_rate": 3.3210332103321034e-06, + "loss": 1.131, + "mean_token_accuracy": 0.6990899712188562, + "step": 45 + }, + { + "epoch": 0.018504383225776605, + "grad_norm": 1.4929264783859253, + "learning_rate": 3.690036900369004e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.7033520375781326, + "step": 50 + }, + { + "epoch": 0.020354821548354267, + "grad_norm": 1.5617218017578125, + "learning_rate": 4.059040590405905e-06, + "loss": 1.103, + "mean_token_accuracy": 0.7004610115160694, + "step": 55 + }, + { + "epoch": 0.022205259870931925, + "grad_norm": 1.2556904554367065, + "learning_rate": 4.428044280442805e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.7098527718822583, + "step": 60 + }, + { + "epoch": 0.024055698193509587, + "grad_norm": 1.5663235187530518, + "learning_rate": 4.797047970479705e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7172534410927646, + "step": 65 + }, + { + "epoch": 0.02590613651608725, + "grad_norm": 1.2256712913513184, + "learning_rate": 5.166051660516605e-06, + "loss": 1.067, + "mean_token_accuracy": 0.7091192188708707, + "step": 70 + }, + { + "epoch": 0.027756574838664907, + "grad_norm": 1.2844516038894653, + "learning_rate": 5.535055350553506e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7196393054758531, + "step": 75 + }, + { + "epoch": 0.02960701316124257, + "grad_norm": 1.3047950267791748, + "learning_rate": 5.904059040590406e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7261756622816481, + "step": 80 + }, + { + "epoch": 0.03145745148382023, + "grad_norm": 1.1991015672683716, + "learning_rate": 6.273062730627307e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7153248429171789, + "step": 85 + }, + { + "epoch": 0.03330788980639789, + "grad_norm": 1.422570824623108, + "learning_rate": 6.642066420664207e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7279154357680314, + "step": 90 + }, + { + "epoch": 0.03515832812897555, + "grad_norm": 1.2546910047531128, + "learning_rate": 7.011070110701108e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7217563432093013, + "step": 95 + }, + { + "epoch": 0.03700876645155321, + "grad_norm": 1.2376253604888916, + "learning_rate": 7.380073800738008e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7268496288420679, + "step": 100 + }, + { + "epoch": 0.03700876645155321, + "eval_loss": 1.015299916267395, + "eval_mean_token_accuracy": 0.7169665389206827, + "eval_runtime": 50.7642, + "eval_samples_per_second": 10.106, + "eval_steps_per_second": 10.106, + "step": 100 + }, + { + "epoch": 0.03885920477413087, + "grad_norm": 1.1947935819625854, + "learning_rate": 7.749077490774908e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7237562053639273, + "step": 105 + }, + { + "epoch": 0.040709643096708534, + "grad_norm": 1.1547764539718628, + "learning_rate": 8.11808118081181e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7196395141478394, + "step": 110 + }, + { + "epoch": 0.042560081419286196, + "grad_norm": 1.1664645671844482, + "learning_rate": 8.48708487084871e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7334516951299597, + "step": 115 + }, + { + "epoch": 0.04441051974186385, + "grad_norm": 1.3240509033203125, + "learning_rate": 8.85608856088561e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7275700433987599, + "step": 120 + }, + { + "epoch": 0.04626095806444151, + "grad_norm": 1.388859748840332, + "learning_rate": 9.22509225092251e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7230461683516555, + "step": 125 + }, + { + "epoch": 0.048111396387019174, + "grad_norm": 1.271419882774353, + "learning_rate": 9.59409594095941e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7282658568793423, + "step": 130 + }, + { + "epoch": 0.049961834709596836, + "grad_norm": 1.343276023864746, + "learning_rate": 9.963099630996312e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7295744106084986, + "step": 135 + }, + { + "epoch": 0.0518122730321745, + "grad_norm": 1.2812855243682861, + "learning_rate": 1.033210332103321e-05, + "loss": 0.9761, + "mean_token_accuracy": 0.7243435776946641, + "step": 140 + }, + { + "epoch": 0.05366271135475216, + "grad_norm": 1.296567440032959, + "learning_rate": 1.0701107011070112e-05, + "loss": 0.9804, + "mean_token_accuracy": 0.7237954107958932, + "step": 145 + }, + { + "epoch": 0.055513149677329815, + "grad_norm": 1.3086339235305786, + "learning_rate": 1.1070110701107012e-05, + "loss": 0.9411, + "mean_token_accuracy": 0.7346382404393238, + "step": 150 + }, + { + "epoch": 0.05736358799990748, + "grad_norm": 1.3291301727294922, + "learning_rate": 1.1439114391143913e-05, + "loss": 0.9385, + "mean_token_accuracy": 0.7342997457804343, + "step": 155 + }, + { + "epoch": 0.05921402632248514, + "grad_norm": 1.1311614513397217, + "learning_rate": 1.1808118081180812e-05, + "loss": 0.9666, + "mean_token_accuracy": 0.7260505604675033, + "step": 160 + }, + { + "epoch": 0.0610644646450628, + "grad_norm": 1.2835899591445923, + "learning_rate": 1.2177121771217713e-05, + "loss": 0.977, + "mean_token_accuracy": 0.7244513931183378, + "step": 165 + }, + { + "epoch": 0.06291490296764046, + "grad_norm": 1.2636359930038452, + "learning_rate": 1.2546125461254614e-05, + "loss": 0.9551, + "mean_token_accuracy": 0.7282803329395334, + "step": 170 + }, + { + "epoch": 0.06476534129021812, + "grad_norm": 1.5195773839950562, + "learning_rate": 1.2915129151291515e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.7309458986105004, + "step": 175 + }, + { + "epoch": 0.06661577961279579, + "grad_norm": 1.256447196006775, + "learning_rate": 1.3284132841328414e-05, + "loss": 0.9232, + "mean_token_accuracy": 0.7378681775799953, + "step": 180 + }, + { + "epoch": 0.06846621793537344, + "grad_norm": 1.2798049449920654, + "learning_rate": 1.3653136531365315e-05, + "loss": 0.9897, + "mean_token_accuracy": 0.7213663838416546, + "step": 185 + }, + { + "epoch": 0.0703166562579511, + "grad_norm": 1.5569769144058228, + "learning_rate": 1.4022140221402215e-05, + "loss": 0.9585, + "mean_token_accuracy": 0.7270874163121077, + "step": 190 + }, + { + "epoch": 0.07216709458052877, + "grad_norm": 1.1950033903121948, + "learning_rate": 1.4391143911439116e-05, + "loss": 0.9519, + "mean_token_accuracy": 0.7320935494632173, + "step": 195 + }, + { + "epoch": 0.07401753290310642, + "grad_norm": 1.2576892375946045, + "learning_rate": 1.4760147601476015e-05, + "loss": 0.9207, + "mean_token_accuracy": 0.736455222510665, + "step": 200 + }, + { + "epoch": 0.07401753290310642, + "eval_loss": 0.9724639058113098, + "eval_mean_token_accuracy": 0.7245206744181131, + "eval_runtime": 50.6905, + "eval_samples_per_second": 10.12, + "eval_steps_per_second": 10.12, + "step": 200 + }, + { + "epoch": 0.07586797122568409, + "grad_norm": 1.32821524143219, + "learning_rate": 1.5129151291512916e-05, + "loss": 0.9343, + "mean_token_accuracy": 0.7352068673658746, + "step": 205 + }, + { + "epoch": 0.07771840954826174, + "grad_norm": 1.2804689407348633, + "learning_rate": 1.5498154981549817e-05, + "loss": 0.9867, + "mean_token_accuracy": 0.71895514692202, + "step": 210 + }, + { + "epoch": 0.0795688478708394, + "grad_norm": 1.3231711387634277, + "learning_rate": 1.5867158671586716e-05, + "loss": 0.9472, + "mean_token_accuracy": 0.7307138213343876, + "step": 215 + }, + { + "epoch": 0.08141928619341707, + "grad_norm": 1.4526623487472534, + "learning_rate": 1.623616236162362e-05, + "loss": 0.9821, + "mean_token_accuracy": 0.7217701773341699, + "step": 220 + }, + { + "epoch": 0.08326972451599472, + "grad_norm": 1.317412257194519, + "learning_rate": 1.6605166051660518e-05, + "loss": 0.9396, + "mean_token_accuracy": 0.7325251665277629, + "step": 225 + }, + { + "epoch": 0.08512016283857239, + "grad_norm": 1.4098377227783203, + "learning_rate": 1.697416974169742e-05, + "loss": 0.9651, + "mean_token_accuracy": 0.7255207585860529, + "step": 230 + }, + { + "epoch": 0.08697060116115005, + "grad_norm": 1.3292772769927979, + "learning_rate": 1.734317343173432e-05, + "loss": 0.9247, + "mean_token_accuracy": 0.7352377124328339, + "step": 235 + }, + { + "epoch": 0.0888210394837277, + "grad_norm": 1.2886641025543213, + "learning_rate": 1.771217712177122e-05, + "loss": 0.9342, + "mean_token_accuracy": 0.7347689630621619, + "step": 240 + }, + { + "epoch": 0.09067147780630537, + "grad_norm": 1.4062870740890503, + "learning_rate": 1.8081180811808117e-05, + "loss": 0.9255, + "mean_token_accuracy": 0.7344553522178484, + "step": 245 + }, + { + "epoch": 0.09252191612888302, + "grad_norm": 1.415042757987976, + "learning_rate": 1.845018450184502e-05, + "loss": 0.9222, + "mean_token_accuracy": 0.738081657358488, + "step": 250 + }, + { + "epoch": 0.0943723544514607, + "grad_norm": 1.1698943376541138, + "learning_rate": 1.8819188191881922e-05, + "loss": 0.9294, + "mean_token_accuracy": 0.7366268991041479, + "step": 255 + }, + { + "epoch": 0.09622279277403835, + "grad_norm": 1.2418172359466553, + "learning_rate": 1.918819188191882e-05, + "loss": 0.9143, + "mean_token_accuracy": 0.7391926803693939, + "step": 260 + }, + { + "epoch": 0.09807323109661602, + "grad_norm": 1.5692613124847412, + "learning_rate": 1.955719557195572e-05, + "loss": 0.926, + "mean_token_accuracy": 0.7368639311244666, + "step": 265 + }, + { + "epoch": 0.09992366941919367, + "grad_norm": 1.1764543056488037, + "learning_rate": 1.9926199261992623e-05, + "loss": 0.9358, + "mean_token_accuracy": 0.7338136905409297, + "step": 270 + }, + { + "epoch": 0.10177410774177133, + "grad_norm": 1.240388035774231, + "learning_rate": 1.9999866396188624e-05, + "loss": 0.9157, + "mean_token_accuracy": 0.7374722373906926, + "step": 275 + }, + { + "epoch": 0.103624546064349, + "grad_norm": 1.2100290060043335, + "learning_rate": 1.9999323636823398e-05, + "loss": 0.9179, + "mean_token_accuracy": 0.7363942630932592, + "step": 280 + }, + { + "epoch": 0.10547498438692665, + "grad_norm": 1.2169815301895142, + "learning_rate": 1.9998363394309497e-05, + "loss": 0.9379, + "mean_token_accuracy": 0.7341725861043149, + "step": 285 + }, + { + "epoch": 0.10732542270950432, + "grad_norm": 1.3079049587249756, + "learning_rate": 1.9996985708738146e-05, + "loss": 0.9318, + "mean_token_accuracy": 0.7340476100242543, + "step": 290 + }, + { + "epoch": 0.10917586103208198, + "grad_norm": 1.3396153450012207, + "learning_rate": 1.999519063762928e-05, + "loss": 0.9187, + "mean_token_accuracy": 0.7370102479106311, + "step": 295 + }, + { + "epoch": 0.11102629935465963, + "grad_norm": 1.274489402770996, + "learning_rate": 1.9992978255929168e-05, + "loss": 0.884, + "mean_token_accuracy": 0.7474725751891694, + "step": 300 + }, + { + "epoch": 0.11102629935465963, + "eval_loss": 0.9594149589538574, + "eval_mean_token_accuracy": 0.7260321389978355, + "eval_runtime": 78.825, + "eval_samples_per_second": 6.508, + "eval_steps_per_second": 6.508, + "step": 300 + }, + { + "epoch": 0.1128767376772373, + "grad_norm": 1.2882938385009766, + "learning_rate": 1.999034865600726e-05, + "loss": 0.8989, + "mean_token_accuracy": 0.7411822076550901, + "step": 305 + }, + { + "epoch": 0.11472717599981495, + "grad_norm": 1.2861721515655518, + "learning_rate": 1.9987301947652354e-05, + "loss": 0.9024, + "mean_token_accuracy": 0.7414158014701427, + "step": 310 + }, + { + "epoch": 0.11657761432239262, + "grad_norm": 1.4147839546203613, + "learning_rate": 1.998383825806799e-05, + "loss": 0.9163, + "mean_token_accuracy": 0.7390650543887309, + "step": 315 + }, + { + "epoch": 0.11842805264497028, + "grad_norm": 1.2372761964797974, + "learning_rate": 1.9979957731867143e-05, + "loss": 0.9133, + "mean_token_accuracy": 0.7386950204852115, + "step": 320 + }, + { + "epoch": 0.12027849096754793, + "grad_norm": 1.2764817476272583, + "learning_rate": 1.9975660531066215e-05, + "loss": 0.9377, + "mean_token_accuracy": 0.7348123426639266, + "step": 325 + }, + { + "epoch": 0.1221289292901256, + "grad_norm": 1.178112268447876, + "learning_rate": 1.9970946835078227e-05, + "loss": 0.9341, + "mean_token_accuracy": 0.7334139550957592, + "step": 330 + }, + { + "epoch": 0.12397936761270326, + "grad_norm": 1.2661017179489136, + "learning_rate": 1.9965816840705355e-05, + "loss": 0.9666, + "mean_token_accuracy": 0.723251078215526, + "step": 335 + }, + { + "epoch": 0.12582980593528093, + "grad_norm": 1.20822274684906, + "learning_rate": 1.9960270762130705e-05, + "loss": 0.9388, + "mean_token_accuracy": 0.7318659879034792, + "step": 340 + }, + { + "epoch": 0.1276802442578586, + "grad_norm": 1.260669469833374, + "learning_rate": 1.9954308830909372e-05, + "loss": 0.8983, + "mean_token_accuracy": 0.7420506330369535, + "step": 345 + }, + { + "epoch": 0.12953068258043623, + "grad_norm": 1.22865891456604, + "learning_rate": 1.9947931295958778e-05, + "loss": 0.8898, + "mean_token_accuracy": 0.74350299707088, + "step": 350 + }, + { + "epoch": 0.1313811209030139, + "grad_norm": 1.306064486503601, + "learning_rate": 1.9941138423548266e-05, + "loss": 0.9358, + "mean_token_accuracy": 0.7305222666688053, + "step": 355 + }, + { + "epoch": 0.13323155922559157, + "grad_norm": 1.2020059823989868, + "learning_rate": 1.9933930497287996e-05, + "loss": 0.9269, + "mean_token_accuracy": 0.7334965080243132, + "step": 360 + }, + { + "epoch": 0.1350819975481692, + "grad_norm": 1.274563193321228, + "learning_rate": 1.9926307818117098e-05, + "loss": 0.9225, + "mean_token_accuracy": 0.735283383137161, + "step": 365 + }, + { + "epoch": 0.13693243587074688, + "grad_norm": 1.2880144119262695, + "learning_rate": 1.9918270704291104e-05, + "loss": 0.9039, + "mean_token_accuracy": 0.7401982686077297, + "step": 370 + }, + { + "epoch": 0.13878287419332455, + "grad_norm": 1.5040298700332642, + "learning_rate": 1.9909819491368677e-05, + "loss": 0.9211, + "mean_token_accuracy": 0.7366455081833134, + "step": 375 + }, + { + "epoch": 0.1406333125159022, + "grad_norm": 1.1860566139221191, + "learning_rate": 1.990095453219757e-05, + "loss": 0.9008, + "mean_token_accuracy": 0.7395652659782839, + "step": 380 + }, + { + "epoch": 0.14248375083847986, + "grad_norm": 1.2104592323303223, + "learning_rate": 1.989167619689993e-05, + "loss": 0.9402, + "mean_token_accuracy": 0.7305055021182448, + "step": 385 + }, + { + "epoch": 0.14433418916105753, + "grad_norm": 1.305290937423706, + "learning_rate": 1.988198487285682e-05, + "loss": 0.9542, + "mean_token_accuracy": 0.7249303930518376, + "step": 390 + }, + { + "epoch": 0.1461846274836352, + "grad_norm": 1.2977765798568726, + "learning_rate": 1.9871880964692055e-05, + "loss": 0.9075, + "mean_token_accuracy": 0.7387939424587044, + "step": 395 + }, + { + "epoch": 0.14803506580621284, + "grad_norm": 1.309501051902771, + "learning_rate": 1.9861364894255306e-05, + "loss": 0.9207, + "mean_token_accuracy": 0.7343055812075197, + "step": 400 + }, + { + "epoch": 0.14803506580621284, + "eval_loss": 0.9451568126678467, + "eval_mean_token_accuracy": 0.72881764551406, + "eval_runtime": 79.77, + "eval_samples_per_second": 6.431, + "eval_steps_per_second": 6.431, + "step": 400 + }, + { + "epoch": 0.1498855041287905, + "grad_norm": 1.2279026508331299, + "learning_rate": 1.985043710060449e-05, + "loss": 0.8866, + "mean_token_accuracy": 0.7445145321342795, + "step": 405 + }, + { + "epoch": 0.15173594245136818, + "grad_norm": 1.4455894231796265, + "learning_rate": 1.9839098039987435e-05, + "loss": 0.9268, + "mean_token_accuracy": 0.7340181375673809, + "step": 410 + }, + { + "epoch": 0.15358638077394582, + "grad_norm": 1.2705790996551514, + "learning_rate": 1.9827348185822834e-05, + "loss": 0.8996, + "mean_token_accuracy": 0.7409869253740823, + "step": 415 + }, + { + "epoch": 0.1554368190965235, + "grad_norm": 1.206468939781189, + "learning_rate": 1.981518802868048e-05, + "loss": 0.9142, + "mean_token_accuracy": 0.7350925363737065, + "step": 420 + }, + { + "epoch": 0.15728725741910116, + "grad_norm": 1.1508530378341675, + "learning_rate": 1.9802618076260784e-05, + "loss": 0.9287, + "mean_token_accuracy": 0.7330718566128385, + "step": 425 + }, + { + "epoch": 0.1591376957416788, + "grad_norm": 1.3081907033920288, + "learning_rate": 1.9789638853373563e-05, + "loss": 0.9385, + "mean_token_accuracy": 0.731924339293188, + "step": 430 + }, + { + "epoch": 0.16098813406425647, + "grad_norm": 1.292937994003296, + "learning_rate": 1.9776250901916168e-05, + "loss": 0.8869, + "mean_token_accuracy": 0.7459759129187684, + "step": 435 + }, + { + "epoch": 0.16283857238683414, + "grad_norm": 1.196614384651184, + "learning_rate": 1.9762454780850807e-05, + "loss": 0.9365, + "mean_token_accuracy": 0.7308315683480648, + "step": 440 + }, + { + "epoch": 0.1646890107094118, + "grad_norm": 1.2768101692199707, + "learning_rate": 1.9748251066181247e-05, + "loss": 0.9458, + "mean_token_accuracy": 0.7291328350939127, + "step": 445 + }, + { + "epoch": 0.16653944903198944, + "grad_norm": 1.28200364112854, + "learning_rate": 1.973364035092875e-05, + "loss": 0.9296, + "mean_token_accuracy": 0.7317795343677007, + "step": 450 + }, + { + "epoch": 0.1683898873545671, + "grad_norm": 1.200141429901123, + "learning_rate": 1.971862324510732e-05, + "loss": 0.9127, + "mean_token_accuracy": 0.7369081696753893, + "step": 455 + }, + { + "epoch": 0.17024032567714478, + "grad_norm": 1.2196623086929321, + "learning_rate": 1.9703200375698223e-05, + "loss": 0.9347, + "mean_token_accuracy": 0.7317238542786955, + "step": 460 + }, + { + "epoch": 0.17209076399972242, + "grad_norm": 1.1578480005264282, + "learning_rate": 1.968737238662382e-05, + "loss": 0.9266, + "mean_token_accuracy": 0.7334499338025752, + "step": 465 + }, + { + "epoch": 0.1739412023223001, + "grad_norm": 1.2814745903015137, + "learning_rate": 1.9671139938720678e-05, + "loss": 0.9161, + "mean_token_accuracy": 0.7351944384479114, + "step": 470 + }, + { + "epoch": 0.17579164064487776, + "grad_norm": 1.2746986150741577, + "learning_rate": 1.9654503709711984e-05, + "loss": 0.9144, + "mean_token_accuracy": 0.738140942384358, + "step": 475 + }, + { + "epoch": 0.1776420789674554, + "grad_norm": 1.1708147525787354, + "learning_rate": 1.963746439417924e-05, + "loss": 0.906, + "mean_token_accuracy": 0.7386923400408494, + "step": 480 + }, + { + "epoch": 0.17949251729003307, + "grad_norm": 1.220799207687378, + "learning_rate": 1.962002270353328e-05, + "loss": 0.9308, + "mean_token_accuracy": 0.7312089465302491, + "step": 485 + }, + { + "epoch": 0.18134295561261074, + "grad_norm": 1.379558801651001, + "learning_rate": 1.960217936598454e-05, + "loss": 0.8983, + "mean_token_accuracy": 0.741063724019383, + "step": 490 + }, + { + "epoch": 0.1831933939351884, + "grad_norm": 1.2207688093185425, + "learning_rate": 1.958393512651269e-05, + "loss": 0.9266, + "mean_token_accuracy": 0.7335562049097655, + "step": 495 + }, + { + "epoch": 0.18504383225776605, + "grad_norm": 1.1398905515670776, + "learning_rate": 1.956529074683551e-05, + "loss": 0.9342, + "mean_token_accuracy": 0.7320654199930504, + "step": 500 + }, + { + "epoch": 0.18504383225776605, + "eval_loss": 0.9367051720619202, + "eval_mean_token_accuracy": 0.7303092480029469, + "eval_runtime": 66.3213, + "eval_samples_per_second": 7.735, + "eval_steps_per_second": 7.735, + "step": 500 + }, + { + "epoch": 0.18689427058034372, + "grad_norm": 1.2544001340866089, + "learning_rate": 1.9546247005377065e-05, + "loss": 0.8923, + "mean_token_accuracy": 0.7418732998401973, + "step": 505 + }, + { + "epoch": 0.1887447089029214, + "grad_norm": 1.2007049322128296, + "learning_rate": 1.952680469723526e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7390339266062526, + "step": 510 + }, + { + "epoch": 0.19059514722549903, + "grad_norm": 1.3158226013183594, + "learning_rate": 1.9506964634148597e-05, + "loss": 0.8859, + "mean_token_accuracy": 0.7457988652681784, + "step": 515 + }, + { + "epoch": 0.1924455855480767, + "grad_norm": 1.191190242767334, + "learning_rate": 1.9486727644462306e-05, + "loss": 0.907, + "mean_token_accuracy": 0.7398091503885992, + "step": 520 + }, + { + "epoch": 0.19429602387065437, + "grad_norm": 1.1855626106262207, + "learning_rate": 1.9466094573093744e-05, + "loss": 0.913, + "mean_token_accuracy": 0.7377695160507135, + "step": 525 + }, + { + "epoch": 0.19614646219323204, + "grad_norm": 1.226499080657959, + "learning_rate": 1.9445066281497144e-05, + "loss": 0.9199, + "mean_token_accuracy": 0.7334141588016627, + "step": 530 + }, + { + "epoch": 0.19799690051580968, + "grad_norm": 1.1852086782455444, + "learning_rate": 1.9423643647627625e-05, + "loss": 0.8999, + "mean_token_accuracy": 0.7383552793509549, + "step": 535 + }, + { + "epoch": 0.19984733883838735, + "grad_norm": 1.1205965280532837, + "learning_rate": 1.940182756590454e-05, + "loss": 0.9184, + "mean_token_accuracy": 0.736318010130394, + "step": 540 + }, + { + "epoch": 0.201697777160965, + "grad_norm": 1.2531499862670898, + "learning_rate": 1.9379618947174155e-05, + "loss": 0.9443, + "mean_token_accuracy": 0.7257061607299684, + "step": 545 + }, + { + "epoch": 0.20354821548354265, + "grad_norm": 1.2362135648727417, + "learning_rate": 1.935701871867158e-05, + "loss": 0.8954, + "mean_token_accuracy": 0.7395217784179058, + "step": 550 + }, + { + "epoch": 0.20539865380612032, + "grad_norm": 1.1606743335723877, + "learning_rate": 1.9334027823982103e-05, + "loss": 0.891, + "mean_token_accuracy": 0.743093979178, + "step": 555 + }, + { + "epoch": 0.207249092128698, + "grad_norm": 1.2009831666946411, + "learning_rate": 1.9310647223001752e-05, + "loss": 0.9474, + "mean_token_accuracy": 0.7266491757165843, + "step": 560 + }, + { + "epoch": 0.20909953045127563, + "grad_norm": 1.1111478805541992, + "learning_rate": 1.9286877891897244e-05, + "loss": 0.8886, + "mean_token_accuracy": 0.7429377197565084, + "step": 565 + }, + { + "epoch": 0.2109499687738533, + "grad_norm": 1.199816107749939, + "learning_rate": 1.9262720823065217e-05, + "loss": 0.9071, + "mean_token_accuracy": 0.7380534542801875, + "step": 570 + }, + { + "epoch": 0.21280040709643097, + "grad_norm": 1.2556802034378052, + "learning_rate": 1.923817702509081e-05, + "loss": 0.9063, + "mean_token_accuracy": 0.7376037582197961, + "step": 575 + }, + { + "epoch": 0.21465084541900864, + "grad_norm": 1.175026535987854, + "learning_rate": 1.9213247522705532e-05, + "loss": 0.9126, + "mean_token_accuracy": 0.7368979682438305, + "step": 580 + }, + { + "epoch": 0.21650128374158628, + "grad_norm": 1.1675595045089722, + "learning_rate": 1.9187933356744504e-05, + "loss": 0.876, + "mean_token_accuracy": 0.7452829314493784, + "step": 585 + }, + { + "epoch": 0.21835172206416395, + "grad_norm": 1.2250741720199585, + "learning_rate": 1.9162235584102973e-05, + "loss": 0.9056, + "mean_token_accuracy": 0.7382800839384349, + "step": 590 + }, + { + "epoch": 0.22020216038674162, + "grad_norm": 1.2157166004180908, + "learning_rate": 1.9136155277692215e-05, + "loss": 0.9289, + "mean_token_accuracy": 0.7296502066251958, + "step": 595 + }, + { + "epoch": 0.22205259870931926, + "grad_norm": 1.2010993957519531, + "learning_rate": 1.9109693526394722e-05, + "loss": 0.9288, + "mean_token_accuracy": 0.7325398892814555, + "step": 600 + }, + { + "epoch": 0.22205259870931926, + "eval_loss": 0.9283245801925659, + "eval_mean_token_accuracy": 0.7320316075391832, + "eval_runtime": 50.6599, + "eval_samples_per_second": 10.126, + "eval_steps_per_second": 10.126, + "step": 600 + }, + { + "epoch": 0.22390303703189693, + "grad_norm": 1.1503756046295166, + "learning_rate": 1.9082851435018743e-05, + "loss": 0.8932, + "mean_token_accuracy": 0.7414583125820019, + "step": 605 + }, + { + "epoch": 0.2257534753544746, + "grad_norm": 1.2121210098266602, + "learning_rate": 1.905563012425216e-05, + "loss": 0.91, + "mean_token_accuracy": 0.7360977430386327, + "step": 610 + }, + { + "epoch": 0.22760391367705224, + "grad_norm": 1.2150719165802002, + "learning_rate": 1.9028030730615696e-05, + "loss": 0.9003, + "mean_token_accuracy": 0.738964053140992, + "step": 615 + }, + { + "epoch": 0.2294543519996299, + "grad_norm": 1.1619393825531006, + "learning_rate": 1.9000054406415467e-05, + "loss": 0.8475, + "mean_token_accuracy": 0.754003586658783, + "step": 620 + }, + { + "epoch": 0.23130479032220758, + "grad_norm": 1.1259123086929321, + "learning_rate": 1.897170231969486e-05, + "loss": 0.8836, + "mean_token_accuracy": 0.7426343946369743, + "step": 625 + }, + { + "epoch": 0.23315522864478525, + "grad_norm": 1.2872394323349, + "learning_rate": 1.8942975654185788e-05, + "loss": 0.9003, + "mean_token_accuracy": 0.7400158833605734, + "step": 630 + }, + { + "epoch": 0.2350056669673629, + "grad_norm": 1.1954014301300049, + "learning_rate": 1.8913875609259246e-05, + "loss": 0.9009, + "mean_token_accuracy": 0.7401640259366237, + "step": 635 + }, + { + "epoch": 0.23685610528994056, + "grad_norm": 1.3242663145065308, + "learning_rate": 1.8884403399875252e-05, + "loss": 0.9173, + "mean_token_accuracy": 0.7366328368067789, + "step": 640 + }, + { + "epoch": 0.23870654361251822, + "grad_norm": 1.1375986337661743, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.9079, + "mean_token_accuracy": 0.738805148743965, + "step": 645 + }, + { + "epoch": 0.24055698193509586, + "grad_norm": 1.1969835758209229, + "learning_rate": 1.8824347425215016e-05, + "loss": 0.9001, + "mean_token_accuracy": 0.7413721409763886, + "step": 650 + }, + { + "epoch": 0.24240742025767353, + "grad_norm": 1.1655569076538086, + "learning_rate": 1.8793766167344115e-05, + "loss": 0.8596, + "mean_token_accuracy": 0.7499591110003503, + "step": 655 + }, + { + "epoch": 0.2442578585802512, + "grad_norm": 1.2159533500671387, + "learning_rate": 1.8762817759721735e-05, + "loss": 0.9074, + "mean_token_accuracy": 0.7375846045002021, + "step": 660 + }, + { + "epoch": 0.24610829690282887, + "grad_norm": 1.1925801038742065, + "learning_rate": 1.8731503494479132e-05, + "loss": 0.8976, + "mean_token_accuracy": 0.7398126959719362, + "step": 665 + }, + { + "epoch": 0.2479587352254065, + "grad_norm": 1.2198364734649658, + "learning_rate": 1.869982467902255e-05, + "loss": 0.8633, + "mean_token_accuracy": 0.7489514762175022, + "step": 670 + }, + { + "epoch": 0.24980917354798418, + "grad_norm": 1.0977855920791626, + "learning_rate": 1.8667782635978597e-05, + "loss": 0.8873, + "mean_token_accuracy": 0.7421541213702059, + "step": 675 + }, + { + "epoch": 0.25165961187056185, + "grad_norm": 1.2008975744247437, + "learning_rate": 1.8635378703139066e-05, + "loss": 0.9017, + "mean_token_accuracy": 0.7381848400310442, + "step": 680 + }, + { + "epoch": 0.2535100501931395, + "grad_norm": 1.2168858051300049, + "learning_rate": 1.8602614233405047e-05, + "loss": 0.9115, + "mean_token_accuracy": 0.7354954553001021, + "step": 685 + }, + { + "epoch": 0.2553604885157172, + "grad_norm": 1.3037954568862915, + "learning_rate": 1.8569490594730474e-05, + "loss": 0.8864, + "mean_token_accuracy": 0.7435983539382823, + "step": 690 + }, + { + "epoch": 0.25721092683829483, + "grad_norm": 1.2055126428604126, + "learning_rate": 1.853600917006497e-05, + "loss": 0.8766, + "mean_token_accuracy": 0.7440811039197984, + "step": 695 + }, + { + "epoch": 0.25906136516087247, + "grad_norm": 1.1308127641677856, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.8857, + "mean_token_accuracy": 0.7449071002901505, + "step": 700 + }, + { + "epoch": 0.25906136516087247, + "eval_loss": 0.921642541885376, + "eval_mean_token_accuracy": 0.7332576670109641, + "eval_runtime": 50.7893, + "eval_samples_per_second": 10.101, + "eval_steps_per_second": 10.101, + "step": 700 + }, + { + "epoch": 0.26091180348345017, + "grad_norm": 1.186762809753418, + "learning_rate": 1.8467978569191216e-05, + "loss": 0.9174, + "mean_token_accuracy": 0.7333227510987468, + "step": 705 + }, + { + "epoch": 0.2627622418060278, + "grad_norm": 1.2207229137420654, + "learning_rate": 1.8433432233338027e-05, + "loss": 0.8746, + "mean_token_accuracy": 0.7463131881651428, + "step": 710 + }, + { + "epoch": 0.26461268012860545, + "grad_norm": 1.196184754371643, + "learning_rate": 1.8398533792085436e-05, + "loss": 0.88, + "mean_token_accuracy": 0.7460801215672447, + "step": 715 + }, + { + "epoch": 0.26646311845118315, + "grad_norm": 1.1278691291809082, + "learning_rate": 1.8363284702483106e-05, + "loss": 0.9095, + "mean_token_accuracy": 0.7360591613918188, + "step": 720 + }, + { + "epoch": 0.2683135567737608, + "grad_norm": 1.1802078485488892, + "learning_rate": 1.832768643622067e-05, + "loss": 0.9267, + "mean_token_accuracy": 0.7342043041294353, + "step": 725 + }, + { + "epoch": 0.2701639950963384, + "grad_norm": 1.1267465353012085, + "learning_rate": 1.8291740479566286e-05, + "loss": 0.8988, + "mean_token_accuracy": 0.7399242844986644, + "step": 730 + }, + { + "epoch": 0.2720144334189161, + "grad_norm": 1.0495985746383667, + "learning_rate": 1.825544833330457e-05, + "loss": 0.8687, + "mean_token_accuracy": 0.7469224892009212, + "step": 735 + }, + { + "epoch": 0.27386487174149377, + "grad_norm": 1.146759271621704, + "learning_rate": 1.8218811512673958e-05, + "loss": 0.8858, + "mean_token_accuracy": 0.744181844578751, + "step": 740 + }, + { + "epoch": 0.2757153100640714, + "grad_norm": 1.1623131036758423, + "learning_rate": 1.818183154730344e-05, + "loss": 0.8777, + "mean_token_accuracy": 0.7464787908985644, + "step": 745 + }, + { + "epoch": 0.2775657483866491, + "grad_norm": 1.1983212232589722, + "learning_rate": 1.8144509981148675e-05, + "loss": 0.8857, + "mean_token_accuracy": 0.74380218229527, + "step": 750 + }, + { + "epoch": 0.27941618670922674, + "grad_norm": 1.202833652496338, + "learning_rate": 1.810684837242755e-05, + "loss": 0.9102, + "mean_token_accuracy": 0.7358472675370848, + "step": 755 + }, + { + "epoch": 0.2812666250318044, + "grad_norm": 1.2793760299682617, + "learning_rate": 1.8068848293555118e-05, + "loss": 0.8961, + "mean_token_accuracy": 0.7392355986645086, + "step": 760 + }, + { + "epoch": 0.2831170633543821, + "grad_norm": 1.1184508800506592, + "learning_rate": 1.8030511331077945e-05, + "loss": 0.8868, + "mean_token_accuracy": 0.7439213670336557, + "step": 765 + }, + { + "epoch": 0.2849675016769597, + "grad_norm": 1.1284053325653076, + "learning_rate": 1.799183908560787e-05, + "loss": 0.8959, + "mean_token_accuracy": 0.7389006136503797, + "step": 770 + }, + { + "epoch": 0.28681793999953736, + "grad_norm": 1.1895016431808472, + "learning_rate": 1.795283317175518e-05, + "loss": 0.8829, + "mean_token_accuracy": 0.7442627836085055, + "step": 775 + }, + { + "epoch": 0.28866837832211506, + "grad_norm": 1.0703201293945312, + "learning_rate": 1.7913495218061202e-05, + "loss": 0.8947, + "mean_token_accuracy": 0.7412916509680103, + "step": 780 + }, + { + "epoch": 0.2905188166446927, + "grad_norm": 1.1624069213867188, + "learning_rate": 1.787382686693029e-05, + "loss": 0.862, + "mean_token_accuracy": 0.7471440251545716, + "step": 785 + }, + { + "epoch": 0.2923692549672704, + "grad_norm": 1.2251648902893066, + "learning_rate": 1.783382977456128e-05, + "loss": 0.8923, + "mean_token_accuracy": 0.7410979902586569, + "step": 790 + }, + { + "epoch": 0.29421969328984804, + "grad_norm": 1.143458366394043, + "learning_rate": 1.779350561087833e-05, + "loss": 0.8853, + "mean_token_accuracy": 0.7423898317827671, + "step": 795 + }, + { + "epoch": 0.2960701316124257, + "grad_norm": 1.1026256084442139, + "learning_rate": 1.775285605946119e-05, + "loss": 0.8586, + "mean_token_accuracy": 0.751576924914566, + "step": 800 + }, + { + "epoch": 0.2960701316124257, + "eval_loss": 0.9158464670181274, + "eval_mean_token_accuracy": 0.7348994079566449, + "eval_runtime": 51.1725, + "eval_samples_per_second": 10.025, + "eval_steps_per_second": 10.025, + "step": 800 + }, + { + "epoch": 0.2979205699350034, + "grad_norm": 1.1115987300872803, + "learning_rate": 1.7711882817474922e-05, + "loss": 0.867, + "mean_token_accuracy": 0.7480320992990822, + "step": 805 + }, + { + "epoch": 0.299771008257581, + "grad_norm": 1.2489161491394043, + "learning_rate": 1.7670587595599034e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.7421057530949784, + "step": 810 + }, + { + "epoch": 0.30162144658015866, + "grad_norm": 1.1725574731826782, + "learning_rate": 1.762897211795607e-05, + "loss": 0.8758, + "mean_token_accuracy": 0.744083576959311, + "step": 815 + }, + { + "epoch": 0.30347188490273636, + "grad_norm": 1.2797213792800903, + "learning_rate": 1.758703812203961e-05, + "loss": 0.89, + "mean_token_accuracy": 0.7418882377620613, + "step": 820 + }, + { + "epoch": 0.305322323225314, + "grad_norm": 1.2301912307739258, + "learning_rate": 1.7544787358641735e-05, + "loss": 0.8787, + "mean_token_accuracy": 0.7447037499906078, + "step": 825 + }, + { + "epoch": 0.30717276154789164, + "grad_norm": 1.285537600517273, + "learning_rate": 1.7502221591779932e-05, + "loss": 0.88, + "mean_token_accuracy": 0.744204531409647, + "step": 830 + }, + { + "epoch": 0.30902319987046933, + "grad_norm": 1.1410529613494873, + "learning_rate": 1.7459342598623438e-05, + "loss": 0.8914, + "mean_token_accuracy": 0.7435018667641985, + "step": 835 + }, + { + "epoch": 0.310873638193047, + "grad_norm": 1.1405855417251587, + "learning_rate": 1.741615216941905e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7407222726883036, + "step": 840 + }, + { + "epoch": 0.3127240765156246, + "grad_norm": 1.0677855014801025, + "learning_rate": 1.7372652107416364e-05, + "loss": 0.9012, + "mean_token_accuracy": 0.7392550453159372, + "step": 845 + }, + { + "epoch": 0.3145745148382023, + "grad_norm": 1.1801300048828125, + "learning_rate": 1.7328844228792513e-05, + "loss": 0.8887, + "mean_token_accuracy": 0.7418257843405092, + "step": 850 + }, + { + "epoch": 0.31642495316077995, + "grad_norm": 1.101582407951355, + "learning_rate": 1.7284730362576308e-05, + "loss": 0.88, + "mean_token_accuracy": 0.7453135253974471, + "step": 855 + }, + { + "epoch": 0.3182753914833576, + "grad_norm": 1.1998122930526733, + "learning_rate": 1.7240312350571905e-05, + "loss": 0.8864, + "mean_token_accuracy": 0.7427352454319888, + "step": 860 + }, + { + "epoch": 0.3201258298059353, + "grad_norm": 1.13681161403656, + "learning_rate": 1.719559204728188e-05, + "loss": 0.901, + "mean_token_accuracy": 0.7380651694454986, + "step": 865 + }, + { + "epoch": 0.32197626812851293, + "grad_norm": 1.216718316078186, + "learning_rate": 1.715057131982983e-05, + "loss": 0.8885, + "mean_token_accuracy": 0.7411097870617352, + "step": 870 + }, + { + "epoch": 0.32382670645109063, + "grad_norm": 1.1352806091308594, + "learning_rate": 1.710525204788239e-05, + "loss": 0.8727, + "mean_token_accuracy": 0.7462809426313219, + "step": 875 + }, + { + "epoch": 0.32567714477366827, + "grad_norm": 1.2423046827316284, + "learning_rate": 1.7059636123570767e-05, + "loss": 0.9074, + "mean_token_accuracy": 0.7375009900783651, + "step": 880 + }, + { + "epoch": 0.3275275830962459, + "grad_norm": 1.0711945295333862, + "learning_rate": 1.7013725451411757e-05, + "loss": 0.8866, + "mean_token_accuracy": 0.7420681995170615, + "step": 885 + }, + { + "epoch": 0.3293780214188236, + "grad_norm": 1.147957444190979, + "learning_rate": 1.696752194822819e-05, + "loss": 0.8758, + "mean_token_accuracy": 0.7470981950404637, + "step": 890 + }, + { + "epoch": 0.33122845974140125, + "grad_norm": 1.1426104307174683, + "learning_rate": 1.692102754306895e-05, + "loss": 0.888, + "mean_token_accuracy": 0.7420897783947114, + "step": 895 + }, + { + "epoch": 0.3330788980639789, + "grad_norm": 1.12646484375, + "learning_rate": 1.6874244177128395e-05, + "loss": 0.891, + "mean_token_accuracy": 0.7410190186584571, + "step": 900 + }, + { + "epoch": 0.3330788980639789, + "eval_loss": 0.9115377068519592, + "eval_mean_token_accuracy": 0.7353900755746844, + "eval_runtime": 78.6969, + "eval_samples_per_second": 6.519, + "eval_steps_per_second": 6.519, + "step": 900 + }, + { + "epoch": 0.3349293363865566, + "grad_norm": 1.1655060052871704, + "learning_rate": 1.6827173803665328e-05, + "loss": 0.8861, + "mean_token_accuracy": 0.7420408174595587, + "step": 905 + }, + { + "epoch": 0.3367797747091342, + "grad_norm": 1.189650058746338, + "learning_rate": 1.677981838792144e-05, + "loss": 0.8939, + "mean_token_accuracy": 0.741134526619554, + "step": 910 + }, + { + "epoch": 0.33863021303171187, + "grad_norm": 1.301318645477295, + "learning_rate": 1.6732179907039266e-05, + "loss": 0.9091, + "mean_token_accuracy": 0.7390976253952309, + "step": 915 + }, + { + "epoch": 0.34048065135428957, + "grad_norm": 1.250311255455017, + "learning_rate": 1.6684260349979637e-05, + "loss": 0.8621, + "mean_token_accuracy": 0.748405369402981, + "step": 920 + }, + { + "epoch": 0.3423310896768672, + "grad_norm": 1.1436880826950073, + "learning_rate": 1.6636061717438626e-05, + "loss": 0.8591, + "mean_token_accuracy": 0.7498006647906157, + "step": 925 + }, + { + "epoch": 0.34418152799944485, + "grad_norm": 1.0761877298355103, + "learning_rate": 1.6587586021764022e-05, + "loss": 0.8765, + "mean_token_accuracy": 0.7454029311737017, + "step": 930 + }, + { + "epoch": 0.34603196632202254, + "grad_norm": 1.1089153289794922, + "learning_rate": 1.653883528687133e-05, + "loss": 0.8434, + "mean_token_accuracy": 0.7529633045613144, + "step": 935 + }, + { + "epoch": 0.3478824046446002, + "grad_norm": 1.1343047618865967, + "learning_rate": 1.6489811548159245e-05, + "loss": 0.8618, + "mean_token_accuracy": 0.747673238978996, + "step": 940 + }, + { + "epoch": 0.3497328429671778, + "grad_norm": 1.1737910509109497, + "learning_rate": 1.6440516852424678e-05, + "loss": 0.8918, + "mean_token_accuracy": 0.741031903005348, + "step": 945 + }, + { + "epoch": 0.3515832812897555, + "grad_norm": 1.3350353240966797, + "learning_rate": 1.6390953257777324e-05, + "loss": 0.908, + "mean_token_accuracy": 0.7361824438406707, + "step": 950 + }, + { + "epoch": 0.35343371961233316, + "grad_norm": 1.179598331451416, + "learning_rate": 1.634112283355369e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7387367278989166, + "step": 955 + }, + { + "epoch": 0.3552841579349108, + "grad_norm": 1.1995991468429565, + "learning_rate": 1.6291027660230735e-05, + "loss": 0.8618, + "mean_token_accuracy": 0.7483973267665744, + "step": 960 + }, + { + "epoch": 0.3571345962574885, + "grad_norm": 1.1567256450653076, + "learning_rate": 1.6240669829338992e-05, + "loss": 0.8953, + "mean_token_accuracy": 0.736539362307222, + "step": 965 + }, + { + "epoch": 0.35898503458006614, + "grad_norm": 1.1351323127746582, + "learning_rate": 1.6190051443375248e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.7417752874148211, + "step": 970 + }, + { + "epoch": 0.36083547290264384, + "grad_norm": 1.1404730081558228, + "learning_rate": 1.6139174615714753e-05, + "loss": 0.8763, + "mean_token_accuracy": 0.7457316258582162, + "step": 975 + }, + { + "epoch": 0.3626859112252215, + "grad_norm": 1.2140825986862183, + "learning_rate": 1.6088041470523005e-05, + "loss": 0.8694, + "mean_token_accuracy": 0.745424525779393, + "step": 980 + }, + { + "epoch": 0.3645363495477991, + "grad_norm": 1.0882492065429688, + "learning_rate": 1.6036654142667043e-05, + "loss": 0.8524, + "mean_token_accuracy": 0.7514694551764137, + "step": 985 + }, + { + "epoch": 0.3663867878703768, + "grad_norm": 1.2724220752716064, + "learning_rate": 1.598501477762632e-05, + "loss": 0.9048, + "mean_token_accuracy": 0.7358279308224847, + "step": 990 + }, + { + "epoch": 0.36823722619295446, + "grad_norm": 1.059874176979065, + "learning_rate": 1.5933125531403135e-05, + "loss": 0.8849, + "mean_token_accuracy": 0.7424272766741411, + "step": 995 + }, + { + "epoch": 0.3700876645155321, + "grad_norm": 1.0729840993881226, + "learning_rate": 1.5880988570432603e-05, + "loss": 0.8809, + "mean_token_accuracy": 0.7437243222926472, + "step": 1000 + }, + { + "epoch": 0.3700876645155321, + "eval_loss": 0.9052200317382812, + "eval_mean_token_accuracy": 0.7371176382735689, + "eval_runtime": 78.2052, + "eval_samples_per_second": 6.56, + "eval_steps_per_second": 6.56, + "step": 1000 + }, + { + "epoch": 0.3719381028381098, + "grad_norm": 1.1344876289367676, + "learning_rate": 1.582860607149222e-05, + "loss": 0.9101, + "mean_token_accuracy": 0.7372656047371745, + "step": 1005 + }, + { + "epoch": 0.37378854116068744, + "grad_norm": 1.1230980157852173, + "learning_rate": 1.5775980221610966e-05, + "loss": 0.8475, + "mean_token_accuracy": 0.750859103854946, + "step": 1010 + }, + { + "epoch": 0.3756389794832651, + "grad_norm": 1.2423635721206665, + "learning_rate": 1.5723113217978e-05, + "loss": 0.8619, + "mean_token_accuracy": 0.7491803187809288, + "step": 1015 + }, + { + "epoch": 0.3774894178058428, + "grad_norm": 1.1783347129821777, + "learning_rate": 1.567000726785093e-05, + "loss": 0.8685, + "mean_token_accuracy": 0.7480539254792099, + "step": 1020 + }, + { + "epoch": 0.3793398561284204, + "grad_norm": 1.1289883852005005, + "learning_rate": 1.561666458846365e-05, + "loss": 0.8302, + "mean_token_accuracy": 0.7585016645128863, + "step": 1025 + }, + { + "epoch": 0.38119029445099806, + "grad_norm": 1.1546114683151245, + "learning_rate": 1.5563087406933762e-05, + "loss": 0.8562, + "mean_token_accuracy": 0.7499629460804305, + "step": 1030 + }, + { + "epoch": 0.38304073277357575, + "grad_norm": 1.1813424825668335, + "learning_rate": 1.550927796016961e-05, + "loss": 0.8805, + "mean_token_accuracy": 0.7437648212454886, + "step": 1035 + }, + { + "epoch": 0.3848911710961534, + "grad_norm": 1.0651295185089111, + "learning_rate": 1.5455238494776876e-05, + "loss": 0.897, + "mean_token_accuracy": 0.7403527790535602, + "step": 1040 + }, + { + "epoch": 0.38674160941873104, + "grad_norm": 1.1787759065628052, + "learning_rate": 1.5400971266964772e-05, + "loss": 0.8274, + "mean_token_accuracy": 0.7571744736657067, + "step": 1045 + }, + { + "epoch": 0.38859204774130873, + "grad_norm": 1.1317715644836426, + "learning_rate": 1.5346478542451862e-05, + "loss": 0.8492, + "mean_token_accuracy": 0.7520489518464395, + "step": 1050 + }, + { + "epoch": 0.3904424860638864, + "grad_norm": 1.1802644729614258, + "learning_rate": 1.529176259637145e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7416859799704518, + "step": 1055 + }, + { + "epoch": 0.39229292438646407, + "grad_norm": 1.1477546691894531, + "learning_rate": 1.5236825713176584e-05, + "loss": 0.8453, + "mean_token_accuracy": 0.7534802458771865, + "step": 1060 + }, + { + "epoch": 0.3941433627090417, + "grad_norm": 1.0698646306991577, + "learning_rate": 1.5181670186544706e-05, + "loss": 0.8841, + "mean_token_accuracy": 0.7430932063263405, + "step": 1065 + }, + { + "epoch": 0.39599380103161935, + "grad_norm": 1.133300542831421, + "learning_rate": 1.5126298319281859e-05, + "loss": 0.8591, + "mean_token_accuracy": 0.7490520840317914, + "step": 1070 + }, + { + "epoch": 0.39784423935419705, + "grad_norm": 1.1771512031555176, + "learning_rate": 1.5070712423226552e-05, + "loss": 0.8835, + "mean_token_accuracy": 0.742742449439321, + "step": 1075 + }, + { + "epoch": 0.3996946776767747, + "grad_norm": 1.183058500289917, + "learning_rate": 1.5014914819153252e-05, + "loss": 0.8647, + "mean_token_accuracy": 0.7484671674745523, + "step": 1080 + }, + { + "epoch": 0.40154511599935233, + "grad_norm": 1.1038291454315186, + "learning_rate": 1.4958907836675467e-05, + "loss": 0.898, + "mean_token_accuracy": 0.7388261469205153, + "step": 1085 + }, + { + "epoch": 0.40339555432193, + "grad_norm": 1.2172880172729492, + "learning_rate": 1.490269381414849e-05, + "loss": 0.9214, + "mean_token_accuracy": 0.7315794003843903, + "step": 1090 + }, + { + "epoch": 0.40524599264450767, + "grad_norm": 1.1312109231948853, + "learning_rate": 1.484627509857178e-05, + "loss": 0.8548, + "mean_token_accuracy": 0.7498190106815769, + "step": 1095 + }, + { + "epoch": 0.4070964309670853, + "grad_norm": 1.172813892364502, + "learning_rate": 1.4789654045490957e-05, + "loss": 0.8523, + "mean_token_accuracy": 0.7516241507062726, + "step": 1100 + }, + { + "epoch": 0.4070964309670853, + "eval_loss": 0.9006803631782532, + "eval_mean_token_accuracy": 0.7380126150724604, + "eval_runtime": 78.0529, + "eval_samples_per_second": 6.572, + "eval_steps_per_second": 6.572, + "step": 1100 + }, + { + "epoch": 0.408946869289663, + "grad_norm": 1.2298842668533325, + "learning_rate": 1.4732833018899468e-05, + "loss": 0.9118, + "mean_token_accuracy": 0.7363720229201065, + "step": 1105 + }, + { + "epoch": 0.41079730761224065, + "grad_norm": 1.2025749683380127, + "learning_rate": 1.4675814391139875e-05, + "loss": 0.8789, + "mean_token_accuracy": 0.7419271920026719, + "step": 1110 + }, + { + "epoch": 0.4126477459348183, + "grad_norm": 1.237095832824707, + "learning_rate": 1.4618600542804819e-05, + "loss": 0.8407, + "mean_token_accuracy": 0.7536702060776138, + "step": 1115 + }, + { + "epoch": 0.414498184257396, + "grad_norm": 1.15714430809021, + "learning_rate": 1.4561193862637621e-05, + "loss": 0.8755, + "mean_token_accuracy": 0.7445992346991004, + "step": 1120 + }, + { + "epoch": 0.4163486225799736, + "grad_norm": 1.1363252401351929, + "learning_rate": 1.4503596747432554e-05, + "loss": 0.8493, + "mean_token_accuracy": 0.7512692116507991, + "step": 1125 + }, + { + "epoch": 0.41819906090255127, + "grad_norm": 1.219039797782898, + "learning_rate": 1.4445811601934763e-05, + "loss": 0.8591, + "mean_token_accuracy": 0.7476566205055524, + "step": 1130 + }, + { + "epoch": 0.42004949922512896, + "grad_norm": 1.061074137687683, + "learning_rate": 1.4387840838739875e-05, + "loss": 0.8537, + "mean_token_accuracy": 0.7499305110493293, + "step": 1135 + }, + { + "epoch": 0.4218999375477066, + "grad_norm": 1.1493220329284668, + "learning_rate": 1.4329686878193271e-05, + "loss": 0.8465, + "mean_token_accuracy": 0.7527445329146073, + "step": 1140 + }, + { + "epoch": 0.42375037587028425, + "grad_norm": 1.1958634853363037, + "learning_rate": 1.4271352148289025e-05, + "loss": 0.8885, + "mean_token_accuracy": 0.7418003324662512, + "step": 1145 + }, + { + "epoch": 0.42560081419286194, + "grad_norm": 1.1059014797210693, + "learning_rate": 1.421283908456854e-05, + "loss": 0.8861, + "mean_token_accuracy": 0.7397866706192172, + "step": 1150 + }, + { + "epoch": 0.4274512525154396, + "grad_norm": 1.0388813018798828, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.8482, + "mean_token_accuracy": 0.750769571523241, + "step": 1155 + }, + { + "epoch": 0.4293016908380173, + "grad_norm": 1.25522780418396, + "learning_rate": 1.4095287734970678e-05, + "loss": 0.8886, + "mean_token_accuracy": 0.7420644725182851, + "step": 1160 + }, + { + "epoch": 0.4311521291605949, + "grad_norm": 1.2089345455169678, + "learning_rate": 1.4036254356996004e-05, + "loss": 0.8429, + "mean_token_accuracy": 0.7526113236357822, + "step": 1165 + }, + { + "epoch": 0.43300256748317256, + "grad_norm": 1.1411499977111816, + "learning_rate": 1.3977052460805597e-05, + "loss": 0.8612, + "mean_token_accuracy": 0.7494721167484096, + "step": 1170 + }, + { + "epoch": 0.43485300580575026, + "grad_norm": 1.062648057937622, + "learning_rate": 1.3917684518146044e-05, + "loss": 0.9042, + "mean_token_accuracy": 0.7372648541482008, + "step": 1175 + }, + { + "epoch": 0.4367034441283279, + "grad_norm": 1.1319317817687988, + "learning_rate": 1.3858153007696552e-05, + "loss": 0.88, + "mean_token_accuracy": 0.7426588688482566, + "step": 1180 + }, + { + "epoch": 0.43855388245090554, + "grad_norm": 1.2135462760925293, + "learning_rate": 1.3798460414965475e-05, + "loss": 0.8617, + "mean_token_accuracy": 0.7475439228361497, + "step": 1185 + }, + { + "epoch": 0.44040432077348324, + "grad_norm": 1.1821489334106445, + "learning_rate": 1.3738609232186537e-05, + "loss": 0.8774, + "mean_token_accuracy": 0.7442542768849871, + "step": 1190 + }, + { + "epoch": 0.4422547590960609, + "grad_norm": 1.1596920490264893, + "learning_rate": 1.3678601958214779e-05, + "loss": 0.9043, + "mean_token_accuracy": 0.737392939549257, + "step": 1195 + }, + { + "epoch": 0.4441051974186385, + "grad_norm": 1.05031418800354, + "learning_rate": 1.3618441098422215e-05, + "loss": 0.858, + "mean_token_accuracy": 0.7495626407835908, + "step": 1200 + }, + { + "epoch": 0.4441051974186385, + "eval_loss": 0.8962021470069885, + "eval_mean_token_accuracy": 0.7389938838082026, + "eval_runtime": 50.4621, + "eval_samples_per_second": 10.166, + "eval_steps_per_second": 10.166, + "step": 1200 + }, + { + "epoch": 0.4459556357412162, + "grad_norm": 1.1134624481201172, + "learning_rate": 1.3558129164593256e-05, + "loss": 0.8615, + "mean_token_accuracy": 0.7486654072277983, + "step": 1205 + }, + { + "epoch": 0.44780607406379386, + "grad_norm": 1.1625741720199585, + "learning_rate": 1.349766867481982e-05, + "loss": 0.8784, + "mean_token_accuracy": 0.7430844560579396, + "step": 1210 + }, + { + "epoch": 0.4496565123863715, + "grad_norm": 1.2227801084518433, + "learning_rate": 1.3437062153396201e-05, + "loss": 0.867, + "mean_token_accuracy": 0.7453914280475592, + "step": 1215 + }, + { + "epoch": 0.4515069507089492, + "grad_norm": 1.1665985584259033, + "learning_rate": 1.337631213071369e-05, + "loss": 0.862, + "mean_token_accuracy": 0.7485184284333548, + "step": 1220 + }, + { + "epoch": 0.45335738903152684, + "grad_norm": 1.1335411071777344, + "learning_rate": 1.331542114315491e-05, + "loss": 0.8621, + "mean_token_accuracy": 0.7484168861668221, + "step": 1225 + }, + { + "epoch": 0.4552078273541045, + "grad_norm": 1.1079161167144775, + "learning_rate": 1.325439173298793e-05, + "loss": 0.8932, + "mean_token_accuracy": 0.7378822601715721, + "step": 1230 + }, + { + "epoch": 0.4570582656766822, + "grad_norm": 1.0776060819625854, + "learning_rate": 1.3193226448260128e-05, + "loss": 0.8516, + "mean_token_accuracy": 0.7499922526524231, + "step": 1235 + }, + { + "epoch": 0.4589087039992598, + "grad_norm": 1.1441576480865479, + "learning_rate": 1.3131927842691793e-05, + "loss": 0.8608, + "mean_token_accuracy": 0.7493913958395965, + "step": 1240 + }, + { + "epoch": 0.4607591423218375, + "grad_norm": 1.1261563301086426, + "learning_rate": 1.3070498475569507e-05, + "loss": 0.8751, + "mean_token_accuracy": 0.7454891607846024, + "step": 1245 + }, + { + "epoch": 0.46260958064441515, + "grad_norm": 1.1878074407577515, + "learning_rate": 1.3008940911639302e-05, + "loss": 0.8354, + "mean_token_accuracy": 0.7543122392839603, + "step": 1250 + }, + { + "epoch": 0.4644600189669928, + "grad_norm": 1.1509381532669067, + "learning_rate": 1.2947257720999577e-05, + "loss": 0.8474, + "mean_token_accuracy": 0.7522147924985935, + "step": 1255 + }, + { + "epoch": 0.4663104572895705, + "grad_norm": 1.1772030591964722, + "learning_rate": 1.2885451478993777e-05, + "loss": 0.8666, + "mean_token_accuracy": 0.7461336524892079, + "step": 1260 + }, + { + "epoch": 0.46816089561214813, + "grad_norm": 1.0959590673446655, + "learning_rate": 1.282352476610289e-05, + "loss": 0.8095, + "mean_token_accuracy": 0.7617041479079839, + "step": 1265 + }, + { + "epoch": 0.4700113339347258, + "grad_norm": 1.0549659729003906, + "learning_rate": 1.2761480167837705e-05, + "loss": 0.8633, + "mean_token_accuracy": 0.7483238275951523, + "step": 1270 + }, + { + "epoch": 0.47186177225730347, + "grad_norm": 1.1503080129623413, + "learning_rate": 1.2699320274630847e-05, + "loss": 0.8889, + "mean_token_accuracy": 0.7395314122897056, + "step": 1275 + }, + { + "epoch": 0.4737122105798811, + "grad_norm": 1.1426583528518677, + "learning_rate": 1.263704768172864e-05, + "loss": 0.9038, + "mean_token_accuracy": 0.7375030812712198, + "step": 1280 + }, + { + "epoch": 0.47556264890245875, + "grad_norm": 1.1978209018707275, + "learning_rate": 1.257466498908276e-05, + "loss": 0.8658, + "mean_token_accuracy": 0.747567498713126, + "step": 1285 + }, + { + "epoch": 0.47741308722503645, + "grad_norm": 1.0632010698318481, + "learning_rate": 1.2512174801241657e-05, + "loss": 0.8186, + "mean_token_accuracy": 0.7578140334225285, + "step": 1290 + }, + { + "epoch": 0.4792635255476141, + "grad_norm": 1.0424400568008423, + "learning_rate": 1.2449579727241834e-05, + "loss": 0.8369, + "mean_token_accuracy": 0.7558098493900247, + "step": 1295 + }, + { + "epoch": 0.48111396387019173, + "grad_norm": 1.134675145149231, + "learning_rate": 1.2386882380498918e-05, + "loss": 0.8464, + "mean_token_accuracy": 0.7537183149776496, + "step": 1300 + }, + { + "epoch": 0.48111396387019173, + "eval_loss": 0.8923500180244446, + "eval_mean_token_accuracy": 0.7399605975475906, + "eval_runtime": 50.3806, + "eval_samples_per_second": 10.182, + "eval_steps_per_second": 10.182, + "step": 1300 + }, + { + "epoch": 0.4829644021927694, + "grad_norm": 1.1513569355010986, + "learning_rate": 1.2324085378698529e-05, + "loss": 0.8676, + "mean_token_accuracy": 0.748164952205539, + "step": 1305 + }, + { + "epoch": 0.48481484051534707, + "grad_norm": 1.0311802625656128, + "learning_rate": 1.2261191343687e-05, + "loss": 0.8518, + "mean_token_accuracy": 0.7521871498641179, + "step": 1310 + }, + { + "epoch": 0.4866652788379247, + "grad_norm": 1.1396102905273438, + "learning_rate": 1.219820290136192e-05, + "loss": 0.8839, + "mean_token_accuracy": 0.7428744239445215, + "step": 1315 + }, + { + "epoch": 0.4885157171605024, + "grad_norm": 1.1476742029190063, + "learning_rate": 1.2135122681562481e-05, + "loss": 0.8913, + "mean_token_accuracy": 0.7383458968011796, + "step": 1320 + }, + { + "epoch": 0.49036615548308005, + "grad_norm": 1.2438527345657349, + "learning_rate": 1.2071953317959692e-05, + "loss": 0.8744, + "mean_token_accuracy": 0.7452624657333512, + "step": 1325 + }, + { + "epoch": 0.49221659380565774, + "grad_norm": 1.1306873559951782, + "learning_rate": 1.2008697447946421e-05, + "loss": 0.8751, + "mean_token_accuracy": 0.7424721560380638, + "step": 1330 + }, + { + "epoch": 0.4940670321282354, + "grad_norm": 1.1581846475601196, + "learning_rate": 1.1945357712527273e-05, + "loss": 0.8585, + "mean_token_accuracy": 0.7497287108747003, + "step": 1335 + }, + { + "epoch": 0.495917470450813, + "grad_norm": 1.3308736085891724, + "learning_rate": 1.1881936756208329e-05, + "loss": 0.8868, + "mean_token_accuracy": 0.7392369943901189, + "step": 1340 + }, + { + "epoch": 0.4977679087733907, + "grad_norm": 1.143922209739685, + "learning_rate": 1.1818437226886738e-05, + "loss": 0.8286, + "mean_token_accuracy": 0.7565615599698585, + "step": 1345 + }, + { + "epoch": 0.49961834709596836, + "grad_norm": 1.3555456399917603, + "learning_rate": 1.1754861775740163e-05, + "loss": 0.8799, + "mean_token_accuracy": 0.7441001926506251, + "step": 1350 + }, + { + "epoch": 0.501468785418546, + "grad_norm": 1.0333168506622314, + "learning_rate": 1.1691213057116082e-05, + "loss": 0.833, + "mean_token_accuracy": 0.7563766714351905, + "step": 1355 + }, + { + "epoch": 0.5033192237411237, + "grad_norm": 1.1685550212860107, + "learning_rate": 1.1627493728420978e-05, + "loss": 0.8312, + "mean_token_accuracy": 0.7566255940877488, + "step": 1360 + }, + { + "epoch": 0.5051696620637013, + "grad_norm": 1.2081339359283447, + "learning_rate": 1.1563706450009391e-05, + "loss": 0.8407, + "mean_token_accuracy": 0.754443525932113, + "step": 1365 + }, + { + "epoch": 0.507020100386279, + "grad_norm": 1.0863213539123535, + "learning_rate": 1.1499853885072827e-05, + "loss": 0.8339, + "mean_token_accuracy": 0.7545843545749367, + "step": 1370 + }, + { + "epoch": 0.5088705387088567, + "grad_norm": 1.1328675746917725, + "learning_rate": 1.1435938699528586e-05, + "loss": 0.8911, + "mean_token_accuracy": 0.7379121479666565, + "step": 1375 + }, + { + "epoch": 0.5107209770314344, + "grad_norm": 1.1014608144760132, + "learning_rate": 1.137196356190845e-05, + "loss": 0.8486, + "mean_token_accuracy": 0.7507691158185413, + "step": 1380 + }, + { + "epoch": 0.512571415354012, + "grad_norm": 1.1279276609420776, + "learning_rate": 1.1307931143247268e-05, + "loss": 0.846, + "mean_token_accuracy": 0.7543217557206249, + "step": 1385 + }, + { + "epoch": 0.5144218536765897, + "grad_norm": 1.2000517845153809, + "learning_rate": 1.1243844116971433e-05, + "loss": 0.8628, + "mean_token_accuracy": 0.7468883900958883, + "step": 1390 + }, + { + "epoch": 0.5162722919991674, + "grad_norm": 1.4059293270111084, + "learning_rate": 1.1179705158787276e-05, + "loss": 0.8491, + "mean_token_accuracy": 0.7522363707168316, + "step": 1395 + }, + { + "epoch": 0.5181227303217449, + "grad_norm": 1.1212759017944336, + "learning_rate": 1.1115516946569333e-05, + "loss": 0.8497, + "mean_token_accuracy": 0.7499685858393633, + "step": 1400 + }, + { + "epoch": 0.5181227303217449, + "eval_loss": 0.8875036239624023, + "eval_mean_token_accuracy": 0.7405125948791582, + "eval_runtime": 50.4671, + "eval_samples_per_second": 10.165, + "eval_steps_per_second": 10.165, + "step": 1400 + }, + { + "epoch": 0.5199731686443226, + "grad_norm": 1.0802546739578247, + "learning_rate": 1.105128216024857e-05, + "loss": 0.8321, + "mean_token_accuracy": 0.7570983833156995, + "step": 1405 + }, + { + "epoch": 0.5218236069669003, + "grad_norm": 1.1517388820648193, + "learning_rate": 1.0987003481700456e-05, + "loss": 0.8682, + "mean_token_accuracy": 0.7435588169757222, + "step": 1410 + }, + { + "epoch": 0.5236740452894779, + "grad_norm": 1.335225224494934, + "learning_rate": 1.092268359463302e-05, + "loss": 0.8709, + "mean_token_accuracy": 0.7452917373238905, + "step": 1415 + }, + { + "epoch": 0.5255244836120556, + "grad_norm": 1.1167656183242798, + "learning_rate": 1.0858325184474796e-05, + "loss": 0.8927, + "mean_token_accuracy": 0.7391699468887288, + "step": 1420 + }, + { + "epoch": 0.5273749219346333, + "grad_norm": 1.127293348312378, + "learning_rate": 1.0793930938262689e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7431836462390942, + "step": 1425 + }, + { + "epoch": 0.5292253602572109, + "grad_norm": 1.219251275062561, + "learning_rate": 1.0729503544529814e-05, + "loss": 0.8919, + "mean_token_accuracy": 0.7401217708520834, + "step": 1430 + }, + { + "epoch": 0.5310757985797886, + "grad_norm": 1.0866413116455078, + "learning_rate": 1.0665045693193226e-05, + "loss": 0.8561, + "mean_token_accuracy": 0.7487249142355408, + "step": 1435 + }, + { + "epoch": 0.5329262369023663, + "grad_norm": 1.0730805397033691, + "learning_rate": 1.0600560075441617e-05, + "loss": 0.8763, + "mean_token_accuracy": 0.7433546825854954, + "step": 1440 + }, + { + "epoch": 0.5347766752249439, + "grad_norm": 1.1865015029907227, + "learning_rate": 1.0536049383622966e-05, + "loss": 0.873, + "mean_token_accuracy": 0.744337296630846, + "step": 1445 + }, + { + "epoch": 0.5366271135475216, + "grad_norm": 1.058233618736267, + "learning_rate": 1.047151631113212e-05, + "loss": 0.8574, + "mean_token_accuracy": 0.7489177692249259, + "step": 1450 + }, + { + "epoch": 0.5384775518700993, + "grad_norm": 1.0962151288986206, + "learning_rate": 1.0406963552298332e-05, + "loss": 0.8577, + "mean_token_accuracy": 0.7472454520686317, + "step": 1455 + }, + { + "epoch": 0.5403279901926769, + "grad_norm": 1.0905910730361938, + "learning_rate": 1.034239380227281e-05, + "loss": 0.8406, + "mean_token_accuracy": 0.7517492389021212, + "step": 1460 + }, + { + "epoch": 0.5421784285152546, + "grad_norm": 1.0674951076507568, + "learning_rate": 1.0277809756916134e-05, + "loss": 0.884, + "mean_token_accuracy": 0.7403210456441213, + "step": 1465 + }, + { + "epoch": 0.5440288668378322, + "grad_norm": 0.9999077916145325, + "learning_rate": 1.0213214112685747e-05, + "loss": 0.8357, + "mean_token_accuracy": 0.7536248701632074, + "step": 1470 + }, + { + "epoch": 0.5458793051604098, + "grad_norm": 1.1569849252700806, + "learning_rate": 1.0148609566523358e-05, + "loss": 0.8252, + "mean_token_accuracy": 0.7566011284124972, + "step": 1475 + }, + { + "epoch": 0.5477297434829875, + "grad_norm": 1.1695085763931274, + "learning_rate": 1.0083998815742335e-05, + "loss": 0.8526, + "mean_token_accuracy": 0.7503246035344221, + "step": 1480 + }, + { + "epoch": 0.5495801818055652, + "grad_norm": 1.2002400159835815, + "learning_rate": 1.0019384557915099e-05, + "loss": 0.8481, + "mean_token_accuracy": 0.7512016801534671, + "step": 1485 + }, + { + "epoch": 0.5514306201281428, + "grad_norm": 1.0182387828826904, + "learning_rate": 9.9547694907605e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7493889074191754, + "step": 1490 + }, + { + "epoch": 0.5532810584507205, + "grad_norm": 1.1398547887802124, + "learning_rate": 9.890156312031165e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7465289067053387, + "step": 1495 + }, + { + "epoch": 0.5551314967732982, + "grad_norm": 1.0890473127365112, + "learning_rate": 9.825547719400889e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7527299833547442, + "step": 1500 + }, + { + "epoch": 0.5551314967732982, + "eval_loss": 0.883466899394989, + "eval_mean_token_accuracy": 0.7415391964540806, + "eval_runtime": 78.5634, + "eval_samples_per_second": 6.53, + "eval_steps_per_second": 6.53, + "step": 1500 + }, + { + "epoch": 0.5569819350958758, + "grad_norm": 1.083844780921936, + "learning_rate": 9.760946410351988e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7577779781862539, + "step": 1505 + }, + { + "epoch": 0.5588323734184535, + "grad_norm": 1.12702214717865, + "learning_rate": 9.696355082062679e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7457827779769488, + "step": 1510 + }, + { + "epoch": 0.5606828117410312, + "grad_norm": 1.1723198890686035, + "learning_rate": 9.631776431294475e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7553337331891314, + "step": 1515 + }, + { + "epoch": 0.5625332500636088, + "grad_norm": 1.0552607774734497, + "learning_rate": 9.567213154279582e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7580377257670661, + "step": 1520 + }, + { + "epoch": 0.5643836883861865, + "grad_norm": 1.1943864822387695, + "learning_rate": 9.502667946608332e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7412889558685449, + "step": 1525 + }, + { + "epoch": 0.5662341267087642, + "grad_norm": 1.0926896333694458, + "learning_rate": 9.43814350311666e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7485775551693337, + "step": 1530 + }, + { + "epoch": 0.5680845650313417, + "grad_norm": 1.1257902383804321, + "learning_rate": 9.37364251777355e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.755104972794591, + "step": 1535 + }, + { + "epoch": 0.5699350033539194, + "grad_norm": 1.184830904006958, + "learning_rate": 9.309167683568597e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7347233994686599, + "step": 1540 + }, + { + "epoch": 0.5717854416764971, + "grad_norm": 1.0849738121032715, + "learning_rate": 9.244721692399545e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.754807061603084, + "step": 1545 + }, + { + "epoch": 0.5736358799990747, + "grad_norm": 1.211175560951233, + "learning_rate": 9.180307234959918e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7446549598340149, + "step": 1550 + }, + { + "epoch": 0.5754863183216524, + "grad_norm": 1.1779320240020752, + "learning_rate": 9.115927000626665e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7515610439271313, + "step": 1555 + }, + { + "epoch": 0.5773367566442301, + "grad_norm": 1.085847020149231, + "learning_rate": 9.051583677347879e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7409961502294228, + "step": 1560 + }, + { + "epoch": 0.5791871949668078, + "grad_norm": 1.0653111934661865, + "learning_rate": 8.987279951530586e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7426297316689073, + "step": 1565 + }, + { + "epoch": 0.5810376332893854, + "grad_norm": 1.0567981004714966, + "learning_rate": 8.923018507928564e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7550816699978279, + "step": 1570 + }, + { + "epoch": 0.5828880716119631, + "grad_norm": 1.0699466466903687, + "learning_rate": 8.85880202953026e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7621896696985008, + "step": 1575 + }, + { + "epoch": 0.5847385099345408, + "grad_norm": 1.1082216501235962, + "learning_rate": 8.79463319744677e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7515386166583966, + "step": 1580 + }, + { + "epoch": 0.5865889482571184, + "grad_norm": 1.1345179080963135, + "learning_rate": 8.730514690799916e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7446916177566473, + "step": 1585 + }, + { + "epoch": 0.5884393865796961, + "grad_norm": 1.1079617738723755, + "learning_rate": 8.666449186610353e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7437692859659788, + "step": 1590 + }, + { + "epoch": 0.5902898249022738, + "grad_norm": 1.1664999723434448, + "learning_rate": 8.60243935968585e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7435284059896061, + "step": 1595 + }, + { + "epoch": 0.5921402632248514, + "grad_norm": 1.0710796117782593, + "learning_rate": 8.538487882509568e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7522821982666854, + "step": 1600 + }, + { + "epoch": 0.5921402632248514, + "eval_loss": 0.8806459307670593, + "eval_mean_token_accuracy": 0.7420680891691662, + "eval_runtime": 78.0774, + "eval_samples_per_second": 6.57, + "eval_steps_per_second": 6.57, + "step": 1600 + }, + { + "epoch": 0.5939907015474291, + "grad_norm": 1.0592025518417358, + "learning_rate": 8.474597425128501e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7485613520965848, + "step": 1605 + }, + { + "epoch": 0.5958411398700068, + "grad_norm": 1.1598796844482422, + "learning_rate": 8.410770655042003e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7454409843236658, + "step": 1610 + }, + { + "epoch": 0.5976915781925843, + "grad_norm": 1.1448596715927124, + "learning_rate": 8.347010237090408e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.744198405162078, + "step": 1615 + }, + { + "epoch": 0.599542016515162, + "grad_norm": 1.1301459074020386, + "learning_rate": 8.283318833343773e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7457724650975702, + "step": 1620 + }, + { + "epoch": 0.6013924548377397, + "grad_norm": 1.0771557092666626, + "learning_rate": 8.219699102990735e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7596411077898149, + "step": 1625 + }, + { + "epoch": 0.6032428931603173, + "grad_norm": 1.100756049156189, + "learning_rate": 8.156153702227484e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7558995024863349, + "step": 1630 + }, + { + "epoch": 0.605093331482895, + "grad_norm": 1.0815366506576538, + "learning_rate": 8.092685284146865e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7451292739524256, + "step": 1635 + }, + { + "epoch": 0.6069437698054727, + "grad_norm": 1.01080322265625, + "learning_rate": 8.029296498627608e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7460930718951544, + "step": 1640 + }, + { + "epoch": 0.6087942081280503, + "grad_norm": 1.0338646173477173, + "learning_rate": 7.965989992223693e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7530607004502294, + "step": 1645 + }, + { + "epoch": 0.610644646450628, + "grad_norm": 1.0714292526245117, + "learning_rate": 7.90276840805385e-06, + "loss": 0.821, + "mean_token_accuracy": 0.759004615934364, + "step": 1650 + }, + { + "epoch": 0.6124950847732057, + "grad_norm": 1.0714163780212402, + "learning_rate": 7.839634385691214e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7480340127823741, + "step": 1655 + }, + { + "epoch": 0.6143455230957833, + "grad_norm": 1.1181557178497314, + "learning_rate": 7.776590561053117e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7490940834342857, + "step": 1660 + }, + { + "epoch": 0.616195961418361, + "grad_norm": 1.2162024974822998, + "learning_rate": 7.713639566291028e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7524457191655578, + "step": 1665 + }, + { + "epoch": 0.6180463997409387, + "grad_norm": 1.132047176361084, + "learning_rate": 7.650784029680662e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7589660088987882, + "step": 1670 + }, + { + "epoch": 0.6198968380635163, + "grad_norm": 1.0826492309570312, + "learning_rate": 7.58802657551225e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7521249389316524, + "step": 1675 + }, + { + "epoch": 0.621747276386094, + "grad_norm": 1.0509703159332275, + "learning_rate": 7.52536982398097e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7483188436822825, + "step": 1680 + }, + { + "epoch": 0.6235977147086716, + "grad_norm": 1.0793825387954712, + "learning_rate": 7.46281639107755e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7428339069332885, + "step": 1685 + }, + { + "epoch": 0.6254481530312492, + "grad_norm": 1.1245028972625732, + "learning_rate": 7.400368888479048e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.738876062207339, + "step": 1690 + }, + { + "epoch": 0.6272985913538269, + "grad_norm": 1.0940346717834473, + "learning_rate": 7.3380299234398076e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7450572147899929, + "step": 1695 + }, + { + "epoch": 0.6291490296764046, + "grad_norm": 1.1423110961914062, + "learning_rate": 7.275802098682612e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.750851552690575, + "step": 1700 + }, + { + "epoch": 0.6291490296764046, + "eval_loss": 0.8765305280685425, + "eval_mean_token_accuracy": 0.7429402834830536, + "eval_runtime": 79.7639, + "eval_samples_per_second": 6.431, + "eval_steps_per_second": 6.431, + "step": 1700 + }, + { + "epoch": 0.6309994679989822, + "grad_norm": 1.0875543355941772, + "learning_rate": 7.213688012290004e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7555179092653466, + "step": 1705 + }, + { + "epoch": 0.6328499063215599, + "grad_norm": 1.1560087203979492, + "learning_rate": 7.151690257595826e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7575116156195778, + "step": 1710 + }, + { + "epoch": 0.6347003446441376, + "grad_norm": 1.1857202053070068, + "learning_rate": 7.089811423076936e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7540097382022544, + "step": 1715 + }, + { + "epoch": 0.6365507829667152, + "grad_norm": 1.0499897003173828, + "learning_rate": 7.028054092245134e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7516812101912956, + "step": 1720 + }, + { + "epoch": 0.6384012212892929, + "grad_norm": 1.1187822818756104, + "learning_rate": 6.966420843539321e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7528163038917939, + "step": 1725 + }, + { + "epoch": 0.6402516596118706, + "grad_norm": 1.066787838935852, + "learning_rate": 6.90491425021781e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.749689118169066, + "step": 1730 + }, + { + "epoch": 0.6421020979344482, + "grad_norm": 1.076682209968567, + "learning_rate": 6.843536880250914e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7507021768711251, + "step": 1735 + }, + { + "epoch": 0.6439525362570259, + "grad_norm": 1.1074074506759644, + "learning_rate": 6.7822912962137225e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.749668775072545, + "step": 1740 + }, + { + "epoch": 0.6458029745796036, + "grad_norm": 1.1533567905426025, + "learning_rate": 6.721180055179113e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7447913980843776, + "step": 1745 + }, + { + "epoch": 0.6476534129021813, + "grad_norm": 1.170899510383606, + "learning_rate": 6.660205708610987e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7510611028124827, + "step": 1750 + }, + { + "epoch": 0.6495038512247588, + "grad_norm": 1.1935901641845703, + "learning_rate": 6.599370802257755e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7451695366987459, + "step": 1755 + }, + { + "epoch": 0.6513542895473365, + "grad_norm": 1.1270606517791748, + "learning_rate": 6.5386778760460316e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7491003127984905, + "step": 1760 + }, + { + "epoch": 0.6532047278699142, + "grad_norm": 1.0685646533966064, + "learning_rate": 6.478129463974598e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7494648164169282, + "step": 1765 + }, + { + "epoch": 0.6550551661924918, + "grad_norm": 1.1324131488800049, + "learning_rate": 6.417728094008613e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7496021909362418, + "step": 1770 + }, + { + "epoch": 0.6569056045150695, + "grad_norm": 1.2056822776794434, + "learning_rate": 6.357476287974051e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7528655783760423, + "step": 1775 + }, + { + "epoch": 0.6587560428376472, + "grad_norm": 1.0940866470336914, + "learning_rate": 6.297376561452428e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7591131075443316, + "step": 1780 + }, + { + "epoch": 0.6606064811602248, + "grad_norm": 1.097103476524353, + "learning_rate": 6.237431423675764e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7475563002250533, + "step": 1785 + }, + { + "epoch": 0.6624569194828025, + "grad_norm": 1.071036458015442, + "learning_rate": 6.177643377421827e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7490388522721844, + "step": 1790 + }, + { + "epoch": 0.6643073578053802, + "grad_norm": 1.1054531335830688, + "learning_rate": 6.118014918909633e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7503596366057965, + "step": 1795 + }, + { + "epoch": 0.6661577961279578, + "grad_norm": 1.1277004480361938, + "learning_rate": 6.058548537695225e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7482101706448133, + "step": 1800 + }, + { + "epoch": 0.6661577961279578, + "eval_loss": 0.8728470206260681, + "eval_mean_token_accuracy": 0.7436341718378576, + "eval_runtime": 78.5895, + "eval_samples_per_second": 6.528, + "eval_steps_per_second": 6.528, + "step": 1800 + }, + { + "epoch": 0.6680082344505355, + "grad_norm": 1.2152836322784424, + "learning_rate": 5.999246716567737e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7482331663974422, + "step": 1805 + }, + { + "epoch": 0.6698586727731132, + "grad_norm": 1.0343924760818481, + "learning_rate": 5.940111931445731e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7477439117392353, + "step": 1810 + }, + { + "epoch": 0.6717091110956908, + "grad_norm": 1.1265689134597778, + "learning_rate": 5.881146651273825e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7555190617342097, + "step": 1815 + }, + { + "epoch": 0.6735595494182685, + "grad_norm": 1.0546154975891113, + "learning_rate": 5.822353337919616e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7420459758922366, + "step": 1820 + }, + { + "epoch": 0.6754099877408462, + "grad_norm": 1.1439138650894165, + "learning_rate": 5.763734446070892e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7410674912058572, + "step": 1825 + }, + { + "epoch": 0.6772604260634237, + "grad_norm": 1.083448052406311, + "learning_rate": 5.705292423133133e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7535584571887595, + "step": 1830 + }, + { + "epoch": 0.6791108643860014, + "grad_norm": 1.1094322204589844, + "learning_rate": 5.647029709127355e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7444246110938043, + "step": 1835 + }, + { + "epoch": 0.6809613027085791, + "grad_norm": 1.0760412216186523, + "learning_rate": 5.5889487365882065e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7574787063158566, + "step": 1840 + }, + { + "epoch": 0.6828117410311567, + "grad_norm": 1.108798861503601, + "learning_rate": 5.531051930462437e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7418184447228666, + "step": 1845 + }, + { + "epoch": 0.6846621793537344, + "grad_norm": 1.0983977317810059, + "learning_rate": 5.4733417080076325e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7576316725845673, + "step": 1850 + }, + { + "epoch": 0.6865126176763121, + "grad_norm": 1.0830743312835693, + "learning_rate": 5.415820478691301e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7521840490748731, + "step": 1855 + }, + { + "epoch": 0.6883630559988897, + "grad_norm": 1.115321159362793, + "learning_rate": 5.358490644090263e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7493569745174777, + "step": 1860 + }, + { + "epoch": 0.6902134943214674, + "grad_norm": 1.0904319286346436, + "learning_rate": 5.3013545977904005e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7450953030774816, + "step": 1865 + }, + { + "epoch": 0.6920639326440451, + "grad_norm": 1.0365054607391357, + "learning_rate": 5.244414725286717e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7560889898260071, + "step": 1870 + }, + { + "epoch": 0.6939143709666227, + "grad_norm": 1.0902795791625977, + "learning_rate": 5.187673403883721e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7514938359735968, + "step": 1875 + }, + { + "epoch": 0.6957648092892004, + "grad_norm": 1.0894972085952759, + "learning_rate": 5.131133002596199e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7497225973502779, + "step": 1880 + }, + { + "epoch": 0.6976152476117781, + "grad_norm": 1.0798072814941406, + "learning_rate": 5.074795882050293e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7528341392472259, + "step": 1885 + }, + { + "epoch": 0.6994656859343557, + "grad_norm": 1.0907217264175415, + "learning_rate": 5.018664394384942e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7494421319615481, + "step": 1890 + }, + { + "epoch": 0.7013161242569333, + "grad_norm": 1.0164741277694702, + "learning_rate": 4.9627408831536705e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7529435144223482, + "step": 1895 + }, + { + "epoch": 0.703166562579511, + "grad_norm": 1.142519474029541, + "learning_rate": 4.907027683226761e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7567480300956836, + "step": 1900 + }, + { + "epoch": 0.703166562579511, + "eval_loss": 0.8700699806213379, + "eval_mean_token_accuracy": 0.7442610694529287, + "eval_runtime": 50.1711, + "eval_samples_per_second": 10.225, + "eval_steps_per_second": 10.225, + "step": 1900 + }, + { + "epoch": 0.7050170009020886, + "grad_norm": 1.0815749168395996, + "learning_rate": 4.85152712069375e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7466736465102184, + "step": 1905 + }, + { + "epoch": 0.7068674392246663, + "grad_norm": 1.016316533088684, + "learning_rate": 4.7962415127663265e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7478070670678898, + "step": 1910 + }, + { + "epoch": 0.708717877547244, + "grad_norm": 1.15891432762146, + "learning_rate": 4.74117316768158e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.759968391663386, + "step": 1915 + }, + { + "epoch": 0.7105683158698216, + "grad_norm": 1.0917257070541382, + "learning_rate": 4.686324384605629e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7553093824763246, + "step": 1920 + }, + { + "epoch": 0.7124187541923993, + "grad_norm": 1.0935016870498657, + "learning_rate": 4.631697453537623e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7561149061155692, + "step": 1925 + }, + { + "epoch": 0.714269192514977, + "grad_norm": 1.049316167831421, + "learning_rate": 4.577294655214144e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7433473744990514, + "step": 1930 + }, + { + "epoch": 0.7161196308375547, + "grad_norm": 1.1667238473892212, + "learning_rate": 4.523118261013969e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7534449112393112, + "step": 1935 + }, + { + "epoch": 0.7179700691601323, + "grad_norm": 1.144909381866455, + "learning_rate": 4.469170532863254e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7536943727299822, + "step": 1940 + }, + { + "epoch": 0.71982050748271, + "grad_norm": 1.1312308311462402, + "learning_rate": 4.415453723141081e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7519524749356737, + "step": 1945 + }, + { + "epoch": 0.7216709458052877, + "grad_norm": 1.1640957593917847, + "learning_rate": 4.361970074585426e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7592206858230149, + "step": 1950 + }, + { + "epoch": 0.7235213841278653, + "grad_norm": 1.1180189847946167, + "learning_rate": 4.308721820199529e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7509569768017925, + "step": 1955 + }, + { + "epoch": 0.725371822450443, + "grad_norm": 1.16630220413208, + "learning_rate": 4.255711183158635e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7560554269072497, + "step": 1960 + }, + { + "epoch": 0.7272222607730207, + "grad_norm": 1.0474036931991577, + "learning_rate": 4.2029403767172175e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7623219816059803, + "step": 1965 + }, + { + "epoch": 0.7290726990955982, + "grad_norm": 1.136406421661377, + "learning_rate": 4.150411604116531e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7511481450162742, + "step": 1970 + }, + { + "epoch": 0.7309231374181759, + "grad_norm": 1.0562430620193481, + "learning_rate": 4.098127058492652e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7573706945444444, + "step": 1975 + }, + { + "epoch": 0.7327735757407536, + "grad_norm": 1.1478077173233032, + "learning_rate": 4.0460889227849e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7569546880750191, + "step": 1980 + }, + { + "epoch": 0.7346240140633312, + "grad_norm": 1.00325608253479, + "learning_rate": 3.9942993696447045e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7506293642917294, + "step": 1985 + }, + { + "epoch": 0.7364744523859089, + "grad_norm": 1.0663394927978516, + "learning_rate": 3.942760561344877e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7502340169200126, + "step": 1990 + }, + { + "epoch": 0.7383248907084866, + "grad_norm": 1.130427360534668, + "learning_rate": 3.891474649689362e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7618501171744374, + "step": 1995 + }, + { + "epoch": 0.7401753290310642, + "grad_norm": 1.0855380296707153, + "learning_rate": 3.840443775923365e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7534221021997818, + "step": 2000 + }, + { + "epoch": 0.7401753290310642, + "eval_loss": 0.8671350479125977, + "eval_mean_token_accuracy": 0.7454383935250886, + "eval_runtime": 51.5876, + "eval_samples_per_second": 9.944, + "eval_steps_per_second": 9.944, + "step": 2000 + }, + { + "epoch": 0.7420257673536419, + "grad_norm": 1.209213137626648, + "learning_rate": 3.7896700706439826e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7606564087590786, + "step": 2005 + }, + { + "epoch": 0.7438762056762196, + "grad_norm": 1.148069977760315, + "learning_rate": 3.7391556537112282e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7669905418548161, + "step": 2010 + }, + { + "epoch": 0.7457266439987972, + "grad_norm": 1.1433311700820923, + "learning_rate": 3.6889026341595378e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7535976661109324, + "step": 2015 + }, + { + "epoch": 0.7475770823213749, + "grad_norm": 1.129437804222107, + "learning_rate": 3.6389131101096953e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7475487510668153, + "step": 2020 + }, + { + "epoch": 0.7494275206439526, + "grad_norm": 1.0358542203903198, + "learning_rate": 3.5891891686812597e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7502852908167374, + "step": 2025 + }, + { + "epoch": 0.7512779589665302, + "grad_norm": 1.1142020225524902, + "learning_rate": 3.5397328859054138e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.749887645218871, + "step": 2030 + }, + { + "epoch": 0.7531283972891079, + "grad_norm": 1.1345794200897217, + "learning_rate": 3.490546326638273e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7597895364859456, + "step": 2035 + }, + { + "epoch": 0.7549788356116856, + "grad_norm": 1.0666804313659668, + "learning_rate": 3.441631544474705e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7516444578066305, + "step": 2040 + }, + { + "epoch": 0.7568292739342631, + "grad_norm": 1.1180936098098755, + "learning_rate": 3.3929905816625653e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7454598959843268, + "step": 2045 + }, + { + "epoch": 0.7586797122568408, + "grad_norm": 1.1804002523422241, + "learning_rate": 3.344625469017445e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7508597223030974, + "step": 2050 + }, + { + "epoch": 0.7605301505794185, + "grad_norm": 1.0938820838928223, + "learning_rate": 3.2965382258378674e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7488317085016187, + "step": 2055 + }, + { + "epoch": 0.7623805889019961, + "grad_norm": 1.055139422416687, + "learning_rate": 3.248730859821002e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.764882704807944, + "step": 2060 + }, + { + "epoch": 0.7642310272245738, + "grad_norm": 1.1458957195281982, + "learning_rate": 3.2012053669788136e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7570119050077394, + "step": 2065 + }, + { + "epoch": 0.7660814655471515, + "grad_norm": 1.0989375114440918, + "learning_rate": 3.1539637315547524e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7496361163745409, + "step": 2070 + }, + { + "epoch": 0.7679319038697291, + "grad_norm": 1.0754046440124512, + "learning_rate": 3.1070079259408934e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7616431298931425, + "step": 2075 + }, + { + "epoch": 0.7697823421923068, + "grad_norm": 1.02739417552948, + "learning_rate": 3.0603399105955966e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7544530392919, + "step": 2080 + }, + { + "epoch": 0.7716327805148845, + "grad_norm": 1.1344877481460571, + "learning_rate": 3.0139616339616394e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7536931725103136, + "step": 2085 + }, + { + "epoch": 0.7734832188374621, + "grad_norm": 1.112585186958313, + "learning_rate": 2.9678750323848893e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7586732704570804, + "step": 2090 + }, + { + "epoch": 0.7753336571600398, + "grad_norm": 0.986150324344635, + "learning_rate": 2.922082030033446e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7574742660162402, + "step": 2095 + }, + { + "epoch": 0.7771840954826175, + "grad_norm": 1.0907959938049316, + "learning_rate": 2.8765845388172955e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7595165878036109, + "step": 2100 + }, + { + "epoch": 0.7771840954826175, + "eval_loss": 0.865083634853363, + "eval_mean_token_accuracy": 0.7459035597008792, + "eval_runtime": 50.488, + "eval_samples_per_second": 10.161, + "eval_steps_per_second": 10.161, + "step": 2100 + }, + { + "epoch": 0.779034533805195, + "grad_norm": 1.1251463890075684, + "learning_rate": 2.831384458308518e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7561818495362099, + "step": 2105 + }, + { + "epoch": 0.7808849721277727, + "grad_norm": 1.0305039882659912, + "learning_rate": 2.7864836756619407e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.748632069328805, + "step": 2110 + }, + { + "epoch": 0.7827354104503504, + "grad_norm": 1.0406534671783447, + "learning_rate": 2.741884065536373e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7567806612457586, + "step": 2115 + }, + { + "epoch": 0.7845858487729281, + "grad_norm": 1.117790937423706, + "learning_rate": 2.6975874900163223e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7546855864916886, + "step": 2120 + }, + { + "epoch": 0.7864362870955057, + "grad_norm": 1.065445065498352, + "learning_rate": 2.6535957985342653e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7606240695791034, + "step": 2125 + }, + { + "epoch": 0.7882867254180834, + "grad_norm": 1.116972804069519, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.756568328742729, + "step": 2130 + }, + { + "epoch": 0.7901371637406611, + "grad_norm": 1.0490765571594238, + "learning_rate": 2.5665344016910367e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7554207633123468, + "step": 2135 + }, + { + "epoch": 0.7919876020632387, + "grad_norm": 1.2332850694656372, + "learning_rate": 2.523468331242329e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7584442633530236, + "step": 2140 + }, + { + "epoch": 0.7938380403858164, + "grad_norm": 1.1261318922042847, + "learning_rate": 2.4807144145047734e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7562805852137349, + "step": 2145 + }, + { + "epoch": 0.7956884787083941, + "grad_norm": 1.0403841733932495, + "learning_rate": 2.438274436503074e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7497747898973967, + "step": 2150 + }, + { + "epoch": 0.7975389170309717, + "grad_norm": 1.0887295007705688, + "learning_rate": 2.396150169154644e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7454726202967417, + "step": 2155 + }, + { + "epoch": 0.7993893553535494, + "grad_norm": 1.0545668601989746, + "learning_rate": 2.3543433711956197e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7603535982628888, + "step": 2160 + }, + { + "epoch": 0.8012397936761271, + "grad_norm": 1.0553205013275146, + "learning_rate": 2.3128557881074153e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7482917388282289, + "step": 2165 + }, + { + "epoch": 0.8030902319987047, + "grad_norm": 1.0758658647537231, + "learning_rate": 2.271689152043873e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7562723919351871, + "step": 2170 + }, + { + "epoch": 0.8049406703212824, + "grad_norm": 1.0912736654281616, + "learning_rate": 2.230845181758928e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7509281602598273, + "step": 2175 + }, + { + "epoch": 0.80679110864386, + "grad_norm": 1.091189980506897, + "learning_rate": 2.1903255825348533e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7585578303522367, + "step": 2180 + }, + { + "epoch": 0.8086415469664376, + "grad_norm": 1.1467081308364868, + "learning_rate": 2.150132046111054e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7596410948645677, + "step": 2185 + }, + { + "epoch": 0.8104919852890153, + "grad_norm": 1.0988317728042603, + "learning_rate": 2.1102662506134506e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7478351620581012, + "step": 2190 + }, + { + "epoch": 0.812342423611593, + "grad_norm": 1.0135034322738647, + "learning_rate": 2.0707298604843964e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7557867360545079, + "step": 2195 + }, + { + "epoch": 0.8141928619341706, + "grad_norm": 1.1506068706512451, + "learning_rate": 2.03152452641321e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7570490789110471, + "step": 2200 + }, + { + "epoch": 0.8141928619341706, + "eval_loss": 0.8633614182472229, + "eval_mean_token_accuracy": 0.7464084312231831, + "eval_runtime": 50.5455, + "eval_samples_per_second": 10.149, + "eval_steps_per_second": 10.149, + "step": 2200 + }, + { + "epoch": 0.8160433002567483, + "grad_norm": 1.1421040296554565, + "learning_rate": 1.9926518852672294e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7624462261672521, + "step": 2205 + }, + { + "epoch": 0.817893738579326, + "grad_norm": 1.1922048330307007, + "learning_rate": 1.9541135600234917e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7499927786124297, + "step": 2210 + }, + { + "epoch": 0.8197441769019036, + "grad_norm": 1.0640493631362915, + "learning_rate": 1.9159111597009584e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.749026967175465, + "step": 2215 + }, + { + "epoch": 0.8215946152244813, + "grad_norm": 1.0632871389389038, + "learning_rate": 1.8780462792933473e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7554019995751211, + "step": 2220 + }, + { + "epoch": 0.823445053547059, + "grad_norm": 1.0224618911743164, + "learning_rate": 1.8405204997025394e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7662021634887914, + "step": 2225 + }, + { + "epoch": 0.8252954918696366, + "grad_norm": 1.1255993843078613, + "learning_rate": 1.8033353876725578e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7588138965092439, + "step": 2230 + }, + { + "epoch": 0.8271459301922143, + "grad_norm": 1.0906031131744385, + "learning_rate": 1.766492495724178e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7669799425354095, + "step": 2235 + }, + { + "epoch": 0.828996368514792, + "grad_norm": 1.0577834844589233, + "learning_rate": 1.7299933620900945e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7578440794991569, + "step": 2240 + }, + { + "epoch": 0.8308468068373696, + "grad_norm": 1.1319166421890259, + "learning_rate": 1.6938395106507034e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7602561091479803, + "step": 2245 + }, + { + "epoch": 0.8326972451599473, + "grad_norm": 1.0294007062911987, + "learning_rate": 1.658032450870467e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7548117730645351, + "step": 2250 + }, + { + "epoch": 0.834547683482525, + "grad_norm": 1.0928364992141724, + "learning_rate": 1.622573677734911e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7495608047356359, + "step": 2255 + }, + { + "epoch": 0.8363981218051025, + "grad_norm": 1.0050818920135498, + "learning_rate": 1.587464671688187e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7487124174368831, + "step": 2260 + }, + { + "epoch": 0.8382485601276802, + "grad_norm": 1.1902474164962769, + "learning_rate": 1.552706898571288e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7522115625399438, + "step": 2265 + }, + { + "epoch": 0.8400989984502579, + "grad_norm": 1.0865709781646729, + "learning_rate": 1.5183018095608138e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.746335436420623, + "step": 2270 + }, + { + "epoch": 0.8419494367728355, + "grad_norm": 1.0647828578948975, + "learning_rate": 1.4842508411084145e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7510108224281564, + "step": 2275 + }, + { + "epoch": 0.8437998750954132, + "grad_norm": 1.049154281616211, + "learning_rate": 1.4505554148807954e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7497810896600887, + "step": 2280 + }, + { + "epoch": 0.8456503134179909, + "grad_norm": 1.0914316177368164, + "learning_rate": 1.4172169377003775e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7590832228798032, + "step": 2285 + }, + { + "epoch": 0.8475007517405685, + "grad_norm": 1.1115086078643799, + "learning_rate": 1.3842368014865414e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7552640437678644, + "step": 2290 + }, + { + "epoch": 0.8493511900631462, + "grad_norm": 1.0784647464752197, + "learning_rate": 1.3516163831975337e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7538963935240591, + "step": 2295 + }, + { + "epoch": 0.8512016283857239, + "grad_norm": 1.0417414903640747, + "learning_rate": 1.3193570447729642e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7568542404327636, + "step": 2300 + }, + { + "epoch": 0.8512016283857239, + "eval_loss": 0.8622255921363831, + "eval_mean_token_accuracy": 0.7465137017145697, + "eval_runtime": 79.4443, + "eval_samples_per_second": 6.457, + "eval_steps_per_second": 6.457, + "step": 2300 + }, + { + "epoch": 0.8530520667083016, + "grad_norm": 1.0113840103149414, + "learning_rate": 1.2874601330769488e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7557854286956263, + "step": 2305 + }, + { + "epoch": 0.8549025050308792, + "grad_norm": 1.0227932929992676, + "learning_rate": 1.255926979841876e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.759109213108344, + "step": 2310 + }, + { + "epoch": 0.8567529433534569, + "grad_norm": 1.0153535604476929, + "learning_rate": 1.224758901612796e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7530020536038673, + "step": 2315 + }, + { + "epoch": 0.8586033816760346, + "grad_norm": 1.023781657218933, + "learning_rate": 1.1939571996924738e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.76419716565575, + "step": 2320 + }, + { + "epoch": 0.8604538199986121, + "grad_norm": 1.0560698509216309, + "learning_rate": 1.1635231600870334e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7493070283055853, + "step": 2325 + }, + { + "epoch": 0.8623042583211898, + "grad_norm": 0.9805640578269958, + "learning_rate": 1.1334580534522932e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7544720834621528, + "step": 2330 + }, + { + "epoch": 0.8641546966437675, + "grad_norm": 1.0332831144332886, + "learning_rate": 1.1037631350406874e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7650346153959394, + "step": 2335 + }, + { + "epoch": 0.8660051349663451, + "grad_norm": 1.1042824983596802, + "learning_rate": 1.0744396446488781e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7516225687482119, + "step": 2340 + }, + { + "epoch": 0.8678555732889228, + "grad_norm": 1.0638376474380493, + "learning_rate": 1.0454888065659775e-06, + "loss": 0.836, + "mean_token_accuracy": 0.752746712682739, + "step": 2345 + }, + { + "epoch": 0.8697060116115005, + "grad_norm": 1.018282175064087, + "learning_rate": 1.0169118295224488e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7523867122596545, + "step": 2350 + }, + { + "epoch": 0.8715564499340781, + "grad_norm": 1.0781739950180054, + "learning_rate": 9.887099066396178e-07, + "loss": 0.8172, + "mean_token_accuracy": 0.7580928881316813, + "step": 2355 + }, + { + "epoch": 0.8734068882566558, + "grad_norm": 1.0491783618927002, + "learning_rate": 9.608842153798903e-07, + "loss": 0.819, + "mean_token_accuracy": 0.7578187936876695, + "step": 2360 + }, + { + "epoch": 0.8752573265792335, + "grad_norm": 1.1399493217468262, + "learning_rate": 9.33435917497556e-07, + "loss": 0.8229, + "mean_token_accuracy": 0.7576256207110993, + "step": 2365 + }, + { + "epoch": 0.8771077649018111, + "grad_norm": 1.0112969875335693, + "learning_rate": 9.063661589903116e-07, + "loss": 0.7902, + "mean_token_accuracy": 0.7659863187941902, + "step": 2370 + }, + { + "epoch": 0.8789582032243888, + "grad_norm": 1.130234718322754, + "learning_rate": 8.796760700513984e-07, + "loss": 0.8418, + "mean_token_accuracy": 0.7490887679748288, + "step": 2375 + }, + { + "epoch": 0.8808086415469665, + "grad_norm": 1.168599247932434, + "learning_rate": 8.533667650224253e-07, + "loss": 0.8041, + "mean_token_accuracy": 0.7634415195741673, + "step": 2380 + }, + { + "epoch": 0.8826590798695441, + "grad_norm": 1.1895118951797485, + "learning_rate": 8.274393423468385e-07, + "loss": 0.8365, + "mean_token_accuracy": 0.7523803448191964, + "step": 2385 + }, + { + "epoch": 0.8845095181921218, + "grad_norm": 1.0385193824768066, + "learning_rate": 8.018948845240538e-07, + "loss": 0.8287, + "mean_token_accuracy": 0.7545559225361973, + "step": 2390 + }, + { + "epoch": 0.8863599565146995, + "grad_norm": 0.9930892586708069, + "learning_rate": 7.767344580642821e-07, + "loss": 0.8429, + "mean_token_accuracy": 0.7506830450011294, + "step": 2395 + }, + { + "epoch": 0.888210394837277, + "grad_norm": 1.0381356477737427, + "learning_rate": 7.519591134439753e-07, + "loss": 0.8338, + "mean_token_accuracy": 0.7532646629408799, + "step": 2400 + }, + { + "epoch": 0.888210394837277, + "eval_loss": 0.8614717721939087, + "eval_mean_token_accuracy": 0.746557203419715, + "eval_runtime": 78.5694, + "eval_samples_per_second": 6.529, + "eval_steps_per_second": 6.529, + "step": 2400 + }, + { + "epoch": 0.8900608331598547, + "grad_norm": 1.0127090215682983, + "learning_rate": 7.275698850619861e-07, + "loss": 0.8342, + "mean_token_accuracy": 0.7525232539905243, + "step": 2405 + }, + { + "epoch": 0.8919112714824324, + "grad_norm": 1.0005366802215576, + "learning_rate": 7.035677911963712e-07, + "loss": 0.8323, + "mean_token_accuracy": 0.7529371812088047, + "step": 2410 + }, + { + "epoch": 0.89376170980501, + "grad_norm": 1.0537385940551758, + "learning_rate": 6.799538339618838e-07, + "loss": 0.8355, + "mean_token_accuracy": 0.7534756236115644, + "step": 2415 + }, + { + "epoch": 0.8956121481275877, + "grad_norm": 1.1355481147766113, + "learning_rate": 6.567289992681258e-07, + "loss": 0.847, + "mean_token_accuracy": 0.7503863788178711, + "step": 2420 + }, + { + "epoch": 0.8974625864501654, + "grad_norm": 1.00087571144104, + "learning_rate": 6.33894256778399e-07, + "loss": 0.8086, + "mean_token_accuracy": 0.7609406991600837, + "step": 2425 + }, + { + "epoch": 0.899313024772743, + "grad_norm": 1.063475251197815, + "learning_rate": 6.114505598692011e-07, + "loss": 0.801, + "mean_token_accuracy": 0.7642469955916888, + "step": 2430 + }, + { + "epoch": 0.9011634630953207, + "grad_norm": 1.0210872888565063, + "learning_rate": 5.893988455904387e-07, + "loss": 0.8469, + "mean_token_accuracy": 0.7497207039220398, + "step": 2435 + }, + { + "epoch": 0.9030139014178984, + "grad_norm": 1.10099196434021, + "learning_rate": 5.677400346262918e-07, + "loss": 0.8375, + "mean_token_accuracy": 0.7533247716063558, + "step": 2440 + }, + { + "epoch": 0.904864339740476, + "grad_norm": 1.0928220748901367, + "learning_rate": 5.464750312567835e-07, + "loss": 0.8053, + "mean_token_accuracy": 0.7616478061524402, + "step": 2445 + }, + { + "epoch": 0.9067147780630537, + "grad_norm": 1.1153428554534912, + "learning_rate": 5.256047233200201e-07, + "loss": 0.8256, + "mean_token_accuracy": 0.7550626247557867, + "step": 2450 + }, + { + "epoch": 0.9085652163856314, + "grad_norm": 1.173052430152893, + "learning_rate": 5.051299821751254e-07, + "loss": 0.8144, + "mean_token_accuracy": 0.7592750603434603, + "step": 2455 + }, + { + "epoch": 0.910415654708209, + "grad_norm": 1.0116368532180786, + "learning_rate": 4.850516626658585e-07, + "loss": 0.84, + "mean_token_accuracy": 0.754272834810296, + "step": 2460 + }, + { + "epoch": 0.9122660930307867, + "grad_norm": 1.0587445497512817, + "learning_rate": 4.653706030849214e-07, + "loss": 0.8268, + "mean_token_accuracy": 0.7556583265085172, + "step": 2465 + }, + { + "epoch": 0.9141165313533643, + "grad_norm": 1.144407868385315, + "learning_rate": 4.4608762513896455e-07, + "loss": 0.8311, + "mean_token_accuracy": 0.7554218387899451, + "step": 2470 + }, + { + "epoch": 0.9159669696759419, + "grad_norm": 1.0363072156906128, + "learning_rate": 4.2720353391427547e-07, + "loss": 0.8452, + "mean_token_accuracy": 0.7492191739469968, + "step": 2475 + }, + { + "epoch": 0.9178174079985196, + "grad_norm": 1.108762502670288, + "learning_rate": 4.087191178431682e-07, + "loss": 0.8436, + "mean_token_accuracy": 0.7492986192564005, + "step": 2480 + }, + { + "epoch": 0.9196678463210973, + "grad_norm": 1.2412290573120117, + "learning_rate": 3.9063514867105914e-07, + "loss": 0.8517, + "mean_token_accuracy": 0.7480052030779112, + "step": 2485 + }, + { + "epoch": 0.921518284643675, + "grad_norm": 1.16669499874115, + "learning_rate": 3.729523814242608e-07, + "loss": 0.843, + "mean_token_accuracy": 0.752875461916825, + "step": 2490 + }, + { + "epoch": 0.9233687229662526, + "grad_norm": 1.0161534547805786, + "learning_rate": 3.5567155437843725e-07, + "loss": 0.8345, + "mean_token_accuracy": 0.7507810240759982, + "step": 2495 + }, + { + "epoch": 0.9252191612888303, + "grad_norm": 1.0766394138336182, + "learning_rate": 3.3879338902779945e-07, + "loss": 0.8275, + "mean_token_accuracy": 0.7544474125593488, + "step": 2500 + }, + { + "epoch": 0.9252191612888303, + "eval_loss": 0.8608765602111816, + "eval_mean_token_accuracy": 0.7467540683531231, + "eval_runtime": 80.3695, + "eval_samples_per_second": 6.383, + "eval_steps_per_second": 6.383, + "step": 2500 + }, + { + "epoch": 0.927069599611408, + "grad_norm": 1.044554591178894, + "learning_rate": 3.223185900549686e-07, + "loss": 0.8351, + "mean_token_accuracy": 0.753569345529369, + "step": 2505 + }, + { + "epoch": 0.9289200379339856, + "grad_norm": 1.0807628631591797, + "learning_rate": 3.0624784530156384e-07, + "loss": 0.8297, + "mean_token_accuracy": 0.7554732138827583, + "step": 2510 + }, + { + "epoch": 0.9307704762565633, + "grad_norm": 1.0551187992095947, + "learning_rate": 2.905818257394799e-07, + "loss": 0.8208, + "mean_token_accuracy": 0.7557148210943935, + "step": 2515 + }, + { + "epoch": 0.932620914579141, + "grad_norm": 1.0487548112869263, + "learning_rate": 2.753211854428728e-07, + "loss": 0.8201, + "mean_token_accuracy": 0.7570401368075597, + "step": 2520 + }, + { + "epoch": 0.9344713529017186, + "grad_norm": 1.063825011253357, + "learning_rate": 2.604665615608526e-07, + "loss": 0.8632, + "mean_token_accuracy": 0.7460995869477525, + "step": 2525 + }, + { + "epoch": 0.9363217912242963, + "grad_norm": 1.0384564399719238, + "learning_rate": 2.460185742908816e-07, + "loss": 0.8312, + "mean_token_accuracy": 0.7550518429016114, + "step": 2530 + }, + { + "epoch": 0.938172229546874, + "grad_norm": 1.0318973064422607, + "learning_rate": 2.3197782685288385e-07, + "loss": 0.8246, + "mean_token_accuracy": 0.7562569357485306, + "step": 2535 + }, + { + "epoch": 0.9400226678694515, + "grad_norm": 1.0621023178100586, + "learning_rate": 2.1834490546405186e-07, + "loss": 0.8316, + "mean_token_accuracy": 0.7541125212942166, + "step": 2540 + }, + { + "epoch": 0.9418731061920292, + "grad_norm": 1.0686606168746948, + "learning_rate": 2.0512037931437855e-07, + "loss": 0.8402, + "mean_token_accuracy": 0.7520761903722292, + "step": 2545 + }, + { + "epoch": 0.9437235445146069, + "grad_norm": 1.0552746057510376, + "learning_rate": 1.9230480054288958e-07, + "loss": 0.8163, + "mean_token_accuracy": 0.7575913676064547, + "step": 2550 + }, + { + "epoch": 0.9455739828371845, + "grad_norm": 1.107093334197998, + "learning_rate": 1.7989870421459498e-07, + "loss": 0.8288, + "mean_token_accuracy": 0.7551395264348012, + "step": 2555 + }, + { + "epoch": 0.9474244211597622, + "grad_norm": 1.0550057888031006, + "learning_rate": 1.6790260829814053e-07, + "loss": 0.8283, + "mean_token_accuracy": 0.75225936122926, + "step": 2560 + }, + { + "epoch": 0.9492748594823399, + "grad_norm": 1.040071725845337, + "learning_rate": 1.5631701364419492e-07, + "loss": 0.8351, + "mean_token_accuracy": 0.7528520564952128, + "step": 2565 + }, + { + "epoch": 0.9511252978049175, + "grad_norm": 1.1225743293762207, + "learning_rate": 1.4514240396452438e-07, + "loss": 0.8297, + "mean_token_accuracy": 0.7534344284757897, + "step": 2570 + }, + { + "epoch": 0.9529757361274952, + "grad_norm": 1.0739967823028564, + "learning_rate": 1.3437924581181205e-07, + "loss": 0.821, + "mean_token_accuracy": 0.7576335268115326, + "step": 2575 + }, + { + "epoch": 0.9548261744500729, + "grad_norm": 1.108796238899231, + "learning_rate": 1.2402798856016474e-07, + "loss": 0.8542, + "mean_token_accuracy": 0.7467002667880294, + "step": 2580 + }, + { + "epoch": 0.9566766127726505, + "grad_norm": 1.180294394493103, + "learning_rate": 1.1408906438636236e-07, + "loss": 0.8843, + "mean_token_accuracy": 0.7412087449919116, + "step": 2585 + }, + { + "epoch": 0.9585270510952282, + "grad_norm": 1.0492500066757202, + "learning_rate": 1.045628882518046e-07, + "loss": 0.8091, + "mean_token_accuracy": 0.7590382032512298, + "step": 2590 + }, + { + "epoch": 0.9603774894178059, + "grad_norm": 1.1209532022476196, + "learning_rate": 9.544985788519589e-08, + "loss": 0.8384, + "mean_token_accuracy": 0.7516073324809519, + "step": 2595 + }, + { + "epoch": 0.9622279277403835, + "grad_norm": 1.0509525537490845, + "learning_rate": 8.675035376593088e-08, + "loss": 0.8496, + "mean_token_accuracy": 0.7503324838100137, + "step": 2600 + }, + { + "epoch": 0.9622279277403835, + "eval_loss": 0.8606518507003784, + "eval_mean_token_accuracy": 0.7467864840774796, + "eval_runtime": 50.4189, + "eval_samples_per_second": 10.175, + "eval_steps_per_second": 10.175, + "step": 2600 + }, + { + "epoch": 0.9640783660629612, + "grad_norm": 1.1271697282791138, + "learning_rate": 7.846473910821162e-08, + "loss": 0.8575, + "mean_token_accuracy": 0.7484827357208227, + "step": 2605 + }, + { + "epoch": 0.9659288043855389, + "grad_norm": 1.0665255784988403, + "learning_rate": 7.059335984588634e-08, + "loss": 0.8431, + "mean_token_accuracy": 0.7513135573046622, + "step": 2610 + }, + { + "epoch": 0.9677792427081164, + "grad_norm": 1.1128307580947876, + "learning_rate": 6.313654461800322e-08, + "loss": 0.8304, + "mean_token_accuracy": 0.7541726078785163, + "step": 2615 + }, + { + "epoch": 0.9696296810306941, + "grad_norm": 1.0116729736328125, + "learning_rate": 5.609460475509032e-08, + "loss": 0.8228, + "mean_token_accuracy": 0.7557024711248536, + "step": 2620 + }, + { + "epoch": 0.9714801193532718, + "grad_norm": 1.0868538618087769, + "learning_rate": 4.9467834266154756e-08, + "loss": 0.8236, + "mean_token_accuracy": 0.7560558219373958, + "step": 2625 + }, + { + "epoch": 0.9733305576758494, + "grad_norm": 1.0722366571426392, + "learning_rate": 4.325650982641039e-08, + "loss": 0.8459, + "mean_token_accuracy": 0.7509982117560366, + "step": 2630 + }, + { + "epoch": 0.9751809959984271, + "grad_norm": 1.075330376625061, + "learning_rate": 3.746089076572701e-08, + "loss": 0.7883, + "mean_token_accuracy": 0.7683721646194278, + "step": 2635 + }, + { + "epoch": 0.9770314343210048, + "grad_norm": 1.0114487409591675, + "learning_rate": 3.208121905779904e-08, + "loss": 0.8639, + "mean_token_accuracy": 0.746171101860165, + "step": 2640 + }, + { + "epoch": 0.9788818726435824, + "grad_norm": 1.048780083656311, + "learning_rate": 2.711771931004692e-08, + "loss": 0.7718, + "mean_token_accuracy": 0.7700529319379708, + "step": 2645 + }, + { + "epoch": 0.9807323109661601, + "grad_norm": 0.9830264449119568, + "learning_rate": 2.257059875423795e-08, + "loss": 0.8044, + "mean_token_accuracy": 0.7627527924691726, + "step": 2650 + }, + { + "epoch": 0.9825827492887378, + "grad_norm": 1.0851788520812988, + "learning_rate": 1.8440047237832105e-08, + "loss": 0.825, + "mean_token_accuracy": 0.7561628552480948, + "step": 2655 + }, + { + "epoch": 0.9844331876113155, + "grad_norm": 1.1513216495513916, + "learning_rate": 1.472623721606059e-08, + "loss": 0.8282, + "mean_token_accuracy": 0.7562615035996398, + "step": 2660 + }, + { + "epoch": 0.9862836259338931, + "grad_norm": 1.0593661069869995, + "learning_rate": 1.1429323744720499e-08, + "loss": 0.8275, + "mean_token_accuracy": 0.7561241815379223, + "step": 2665 + }, + { + "epoch": 0.9881340642564708, + "grad_norm": 1.0882458686828613, + "learning_rate": 8.549444473702207e-09, + "loss": 0.8509, + "mean_token_accuracy": 0.7492303053916414, + "step": 2670 + }, + { + "epoch": 0.9899845025790485, + "grad_norm": 1.0708626508712769, + "learning_rate": 6.086719641246186e-09, + "loss": 0.7946, + "mean_token_accuracy": 0.7647034307432874, + "step": 2675 + }, + { + "epoch": 0.991834940901626, + "grad_norm": 1.0481311082839966, + "learning_rate": 4.041252068918145e-09, + "loss": 0.838, + "mean_token_accuracy": 0.751368344670206, + "step": 2680 + }, + { + "epoch": 0.9936853792242037, + "grad_norm": 1.0070174932479858, + "learning_rate": 2.4131271573191172e-09, + "loss": 0.826, + "mean_token_accuracy": 0.7566141631737711, + "step": 2685 + }, + { + "epoch": 0.9955358175467814, + "grad_norm": 1.024017095565796, + "learning_rate": 1.2024128825172121e-09, + "loss": 0.8202, + "mean_token_accuracy": 0.7572512065390945, + "step": 2690 + }, + { + "epoch": 0.997386255869359, + "grad_norm": 1.0600864887237549, + "learning_rate": 4.0915979321320967e-10, + "loss": 0.7981, + "mean_token_accuracy": 0.7627408443007209, + "step": 2695 + }, + { + "epoch": 0.9992366941919367, + "grad_norm": 1.0456680059432983, + "learning_rate": 3.3401008625588706e-11, + "loss": 0.7911, + "mean_token_accuracy": 0.766019055501874, + "step": 2700 + }, + { + "epoch": 0.9992366941919367, + "eval_loss": 0.860656201839447, + "eval_mean_token_accuracy": 0.7467079128693144, + "eval_runtime": 50.3479, + "eval_samples_per_second": 10.189, + "eval_steps_per_second": 10.189, + "step": 2700 + }, + { + "epoch": 0.9999768695209678, + "mean_token_accuracy": 0.7699437678031534, + "step": 2702, + "total_flos": 76965426954240.0, + "train_loss": 0.8771114350247613, + "train_runtime": 110619.4732, + "train_samples_per_second": 0.782, + "train_steps_per_second": 0.024 + } + ], + "logging_steps": 5, + "max_steps": 2702, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 76965426954240.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}