diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11399 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9997052321296978, + "eval_steps": 100, + "global_step": 7632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013100794235650537, + "grad_norm": 65.25566997593643, + "learning_rate": 2.6178010471204188e-08, + "loss": 0.8756, + "step": 5 + }, + { + "epoch": 0.0026201588471301074, + "grad_norm": 62.55868359719172, + "learning_rate": 5.2356020942408376e-08, + "loss": 0.8355, + "step": 10 + }, + { + "epoch": 0.003930238270695161, + "grad_norm": 72.36291944323244, + "learning_rate": 7.853403141361257e-08, + "loss": 0.9024, + "step": 15 + }, + { + "epoch": 0.005240317694260215, + "grad_norm": 78.2927923163889, + "learning_rate": 1.0471204188481675e-07, + "loss": 0.9112, + "step": 20 + }, + { + "epoch": 0.006550397117825268, + "grad_norm": 73.20997266140758, + "learning_rate": 1.3089005235602092e-07, + "loss": 0.9038, + "step": 25 + }, + { + "epoch": 0.007860476541390321, + "grad_norm": 79.5698279926454, + "learning_rate": 1.5706806282722514e-07, + "loss": 0.7385, + "step": 30 + }, + { + "epoch": 0.009170555964955375, + "grad_norm": 62.25008705240314, + "learning_rate": 1.8324607329842932e-07, + "loss": 0.8842, + "step": 35 + }, + { + "epoch": 0.01048063538852043, + "grad_norm": 71.93212371078113, + "learning_rate": 2.094240837696335e-07, + "loss": 0.8456, + "step": 40 + }, + { + "epoch": 0.011790714812085483, + "grad_norm": 76.81854898396818, + "learning_rate": 2.356020942408377e-07, + "loss": 0.8637, + "step": 45 + }, + { + "epoch": 0.013100794235650536, + "grad_norm": 72.77223708616008, + "learning_rate": 2.6178010471204185e-07, + "loss": 0.8457, + "step": 50 + }, + { + "epoch": 0.01441087365921559, + "grad_norm": 82.53172515882576, + "learning_rate": 2.879581151832461e-07, + "loss": 0.809, + "step": 55 + }, + { + "epoch": 0.015720953082780643, + "grad_norm": 63.36114555923586, + "learning_rate": 3.1413612565445027e-07, + "loss": 0.7974, + "step": 60 + }, + { + "epoch": 0.017031032506345696, + "grad_norm": 67.13005941209079, + "learning_rate": 3.4031413612565446e-07, + "loss": 0.7941, + "step": 65 + }, + { + "epoch": 0.01834111192991075, + "grad_norm": 62.35455050500593, + "learning_rate": 3.6649214659685864e-07, + "loss": 0.8121, + "step": 70 + }, + { + "epoch": 0.019651191353475806, + "grad_norm": 61.13921963104784, + "learning_rate": 3.926701570680628e-07, + "loss": 0.6837, + "step": 75 + }, + { + "epoch": 0.02096127077704086, + "grad_norm": 60.734863824940575, + "learning_rate": 4.18848167539267e-07, + "loss": 0.6569, + "step": 80 + }, + { + "epoch": 0.022271350200605913, + "grad_norm": 68.43319499092718, + "learning_rate": 4.450261780104712e-07, + "loss": 0.7585, + "step": 85 + }, + { + "epoch": 0.023581429624170966, + "grad_norm": 52.09969303256668, + "learning_rate": 4.712041884816754e-07, + "loss": 0.6591, + "step": 90 + }, + { + "epoch": 0.02489150904773602, + "grad_norm": 60.01362166751946, + "learning_rate": 4.973821989528796e-07, + "loss": 0.625, + "step": 95 + }, + { + "epoch": 0.026201588471301072, + "grad_norm": 58.796731028708756, + "learning_rate": 5.235602094240837e-07, + "loss": 0.687, + "step": 100 + }, + { + "epoch": 0.026201588471301072, + "eval_accuracy": 0.5504, + "eval_loss": 0.7788666486740112, + "eval_runtime": 138.8257, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 2.255, + "step": 100 + }, + { + "epoch": 0.027511667894866126, + "grad_norm": 45.6814258154, + "learning_rate": 5.497382198952879e-07, + "loss": 0.6697, + "step": 105 + }, + { + "epoch": 0.02882174731843118, + "grad_norm": 55.768935605768114, + "learning_rate": 5.759162303664922e-07, + "loss": 0.6187, + "step": 110 + }, + { + "epoch": 0.030131826741996232, + "grad_norm": 44.123709497680935, + "learning_rate": 6.020942408376963e-07, + "loss": 0.59, + "step": 115 + }, + { + "epoch": 0.031441906165561286, + "grad_norm": 40.077067899984414, + "learning_rate": 6.282722513089005e-07, + "loss": 0.6216, + "step": 120 + }, + { + "epoch": 0.03275198558912634, + "grad_norm": 39.216463798971496, + "learning_rate": 6.544502617801047e-07, + "loss": 0.6603, + "step": 125 + }, + { + "epoch": 0.03406206501269139, + "grad_norm": 34.95260150433843, + "learning_rate": 6.806282722513089e-07, + "loss": 0.5664, + "step": 130 + }, + { + "epoch": 0.03537214443625645, + "grad_norm": 37.834491450835614, + "learning_rate": 7.06806282722513e-07, + "loss": 0.5947, + "step": 135 + }, + { + "epoch": 0.0366822238598215, + "grad_norm": 32.63280938760141, + "learning_rate": 7.329842931937173e-07, + "loss": 0.6015, + "step": 140 + }, + { + "epoch": 0.037992303283386555, + "grad_norm": 37.4913705926391, + "learning_rate": 7.591623036649214e-07, + "loss": 0.5922, + "step": 145 + }, + { + "epoch": 0.03930238270695161, + "grad_norm": 26.355681906302205, + "learning_rate": 7.853403141361256e-07, + "loss": 0.5681, + "step": 150 + }, + { + "epoch": 0.04061246213051666, + "grad_norm": 31.880949398269596, + "learning_rate": 8.115183246073298e-07, + "loss": 0.5664, + "step": 155 + }, + { + "epoch": 0.04192254155408172, + "grad_norm": 20.65782299342999, + "learning_rate": 8.37696335078534e-07, + "loss": 0.5211, + "step": 160 + }, + { + "epoch": 0.04323262097764677, + "grad_norm": 22.258499509814012, + "learning_rate": 8.638743455497382e-07, + "loss": 0.5219, + "step": 165 + }, + { + "epoch": 0.044542700401211825, + "grad_norm": 21.846834289704923, + "learning_rate": 8.900523560209424e-07, + "loss": 0.5059, + "step": 170 + }, + { + "epoch": 0.045852779824776875, + "grad_norm": 23.18302368260191, + "learning_rate": 9.162303664921466e-07, + "loss": 0.518, + "step": 175 + }, + { + "epoch": 0.04716285924834193, + "grad_norm": 19.652604647477776, + "learning_rate": 9.424083769633508e-07, + "loss": 0.5271, + "step": 180 + }, + { + "epoch": 0.04847293867190698, + "grad_norm": 17.965969880689265, + "learning_rate": 9.68586387434555e-07, + "loss": 0.5313, + "step": 185 + }, + { + "epoch": 0.04978301809547204, + "grad_norm": 17.265465595838617, + "learning_rate": 9.947643979057591e-07, + "loss": 0.4635, + "step": 190 + }, + { + "epoch": 0.051093097519037095, + "grad_norm": 12.731564683260206, + "learning_rate": 1.0209424083769633e-06, + "loss": 0.4914, + "step": 195 + }, + { + "epoch": 0.052403176942602145, + "grad_norm": 14.240010539299435, + "learning_rate": 1.0471204188481674e-06, + "loss": 0.5163, + "step": 200 + }, + { + "epoch": 0.052403176942602145, + "eval_accuracy": 0.4488, + "eval_loss": 0.773366391658783, + "eval_runtime": 139.5762, + "eval_samples_per_second": 8.956, + "eval_steps_per_second": 2.243, + "step": 200 + }, + { + "epoch": 0.0537132563661672, + "grad_norm": 19.63370884990648, + "learning_rate": 1.0732984293193717e-06, + "loss": 0.5581, + "step": 205 + }, + { + "epoch": 0.05502333578973225, + "grad_norm": 10.618120041300497, + "learning_rate": 1.0994764397905759e-06, + "loss": 0.444, + "step": 210 + }, + { + "epoch": 0.05633341521329731, + "grad_norm": 17.44393406285321, + "learning_rate": 1.12565445026178e-06, + "loss": 0.5224, + "step": 215 + }, + { + "epoch": 0.05764349463686236, + "grad_norm": 12.189359941143175, + "learning_rate": 1.1518324607329843e-06, + "loss": 0.4787, + "step": 220 + }, + { + "epoch": 0.058953574060427415, + "grad_norm": 12.078094446605022, + "learning_rate": 1.1780104712041885e-06, + "loss": 0.5046, + "step": 225 + }, + { + "epoch": 0.060263653483992465, + "grad_norm": 12.745981182030365, + "learning_rate": 1.2041884816753926e-06, + "loss": 0.5234, + "step": 230 + }, + { + "epoch": 0.06157373290755752, + "grad_norm": 9.685521970587265, + "learning_rate": 1.2303664921465967e-06, + "loss": 0.4313, + "step": 235 + }, + { + "epoch": 0.06288381233112257, + "grad_norm": 14.736633585803254, + "learning_rate": 1.256544502617801e-06, + "loss": 0.4547, + "step": 240 + }, + { + "epoch": 0.06419389175468763, + "grad_norm": 8.794949376696293, + "learning_rate": 1.2827225130890052e-06, + "loss": 0.4405, + "step": 245 + }, + { + "epoch": 0.06550397117825268, + "grad_norm": 11.249829085031893, + "learning_rate": 1.3089005235602093e-06, + "loss": 0.4307, + "step": 250 + }, + { + "epoch": 0.06681405060181773, + "grad_norm": 13.390941109258982, + "learning_rate": 1.3350785340314135e-06, + "loss": 0.4998, + "step": 255 + }, + { + "epoch": 0.06812413002538278, + "grad_norm": 9.534912328167007, + "learning_rate": 1.3612565445026178e-06, + "loss": 0.472, + "step": 260 + }, + { + "epoch": 0.06943420944894785, + "grad_norm": 8.397391075769068, + "learning_rate": 1.387434554973822e-06, + "loss": 0.4433, + "step": 265 + }, + { + "epoch": 0.0707442888725129, + "grad_norm": 10.746633363582351, + "learning_rate": 1.413612565445026e-06, + "loss": 0.4567, + "step": 270 + }, + { + "epoch": 0.07205436829607795, + "grad_norm": 12.05720859118597, + "learning_rate": 1.4397905759162302e-06, + "loss": 0.4549, + "step": 275 + }, + { + "epoch": 0.073364447719643, + "grad_norm": 10.006378930278041, + "learning_rate": 1.4659685863874346e-06, + "loss": 0.4329, + "step": 280 + }, + { + "epoch": 0.07467452714320806, + "grad_norm": 9.597335107124772, + "learning_rate": 1.4921465968586387e-06, + "loss": 0.4084, + "step": 285 + }, + { + "epoch": 0.07598460656677311, + "grad_norm": 11.481222700352367, + "learning_rate": 1.5183246073298428e-06, + "loss": 0.4149, + "step": 290 + }, + { + "epoch": 0.07729468599033816, + "grad_norm": 19.43866947694484, + "learning_rate": 1.544502617801047e-06, + "loss": 0.4507, + "step": 295 + }, + { + "epoch": 0.07860476541390322, + "grad_norm": 12.044720730463665, + "learning_rate": 1.5706806282722513e-06, + "loss": 0.4566, + "step": 300 + }, + { + "epoch": 0.07860476541390322, + "eval_accuracy": 0.5296, + "eval_loss": 0.8311891555786133, + "eval_runtime": 139.1725, + "eval_samples_per_second": 8.982, + "eval_steps_per_second": 2.249, + "step": 300 + }, + { + "epoch": 0.07991484483746827, + "grad_norm": 11.917516090251366, + "learning_rate": 1.5968586387434554e-06, + "loss": 0.4058, + "step": 305 + }, + { + "epoch": 0.08122492426103332, + "grad_norm": 12.158737920925903, + "learning_rate": 1.6230366492146596e-06, + "loss": 0.3985, + "step": 310 + }, + { + "epoch": 0.08253500368459837, + "grad_norm": 11.207695168487689, + "learning_rate": 1.649214659685864e-06, + "loss": 0.3668, + "step": 315 + }, + { + "epoch": 0.08384508310816344, + "grad_norm": 14.299364939902476, + "learning_rate": 1.675392670157068e-06, + "loss": 0.4118, + "step": 320 + }, + { + "epoch": 0.08515516253172849, + "grad_norm": 8.796874162358842, + "learning_rate": 1.7015706806282722e-06, + "loss": 0.417, + "step": 325 + }, + { + "epoch": 0.08646524195529354, + "grad_norm": 6.423994778367414, + "learning_rate": 1.7277486910994763e-06, + "loss": 0.3693, + "step": 330 + }, + { + "epoch": 0.08777532137885859, + "grad_norm": 14.257916893772826, + "learning_rate": 1.7539267015706804e-06, + "loss": 0.4209, + "step": 335 + }, + { + "epoch": 0.08908540080242365, + "grad_norm": 10.247378725750938, + "learning_rate": 1.7801047120418848e-06, + "loss": 0.4086, + "step": 340 + }, + { + "epoch": 0.0903954802259887, + "grad_norm": 13.028352515928068, + "learning_rate": 1.806282722513089e-06, + "loss": 0.4366, + "step": 345 + }, + { + "epoch": 0.09170555964955375, + "grad_norm": 8.037401497631812, + "learning_rate": 1.8324607329842933e-06, + "loss": 0.3272, + "step": 350 + }, + { + "epoch": 0.09301563907311881, + "grad_norm": 6.9081571017698655, + "learning_rate": 1.8586387434554974e-06, + "loss": 0.3677, + "step": 355 + }, + { + "epoch": 0.09432571849668386, + "grad_norm": 8.726274241413915, + "learning_rate": 1.8848167539267015e-06, + "loss": 0.3582, + "step": 360 + }, + { + "epoch": 0.09563579792024891, + "grad_norm": 8.438818851906133, + "learning_rate": 1.9109947643979056e-06, + "loss": 0.3923, + "step": 365 + }, + { + "epoch": 0.09694587734381396, + "grad_norm": 9.975870802252798, + "learning_rate": 1.93717277486911e-06, + "loss": 0.3755, + "step": 370 + }, + { + "epoch": 0.09825595676737903, + "grad_norm": 11.07336790257551, + "learning_rate": 1.963350785340314e-06, + "loss": 0.3931, + "step": 375 + }, + { + "epoch": 0.09956603619094408, + "grad_norm": 6.87508978730872, + "learning_rate": 1.9895287958115183e-06, + "loss": 0.3723, + "step": 380 + }, + { + "epoch": 0.10087611561450913, + "grad_norm": 7.299612626576022, + "learning_rate": 1.999999155039932e-06, + "loss": 0.3936, + "step": 385 + }, + { + "epoch": 0.10218619503807419, + "grad_norm": 8.19359622835764, + "learning_rate": 1.999993991400246e-06, + "loss": 0.3347, + "step": 390 + }, + { + "epoch": 0.10349627446163924, + "grad_norm": 7.35341462576354, + "learning_rate": 1.9999841335673434e-06, + "loss": 0.3843, + "step": 395 + }, + { + "epoch": 0.10480635388520429, + "grad_norm": 6.711354211930067, + "learning_rate": 1.999969581587499e-06, + "loss": 0.3568, + "step": 400 + }, + { + "epoch": 0.10480635388520429, + "eval_accuracy": 0.6608, + "eval_loss": 0.739296019077301, + "eval_runtime": 142.7751, + "eval_samples_per_second": 8.755, + "eval_steps_per_second": 2.192, + "step": 400 + }, + { + "epoch": 0.10611643330876934, + "grad_norm": 10.024329644547777, + "learning_rate": 1.999950335529023e-06, + "loss": 0.377, + "step": 405 + }, + { + "epoch": 0.1074265127323344, + "grad_norm": 9.78542639003509, + "learning_rate": 1.999926395482261e-06, + "loss": 0.3109, + "step": 410 + }, + { + "epoch": 0.10873659215589945, + "grad_norm": 5.4448738851908445, + "learning_rate": 1.999897761559593e-06, + "loss": 0.3238, + "step": 415 + }, + { + "epoch": 0.1100466715794645, + "grad_norm": 5.403174975957244, + "learning_rate": 1.999864433895432e-06, + "loss": 0.317, + "step": 420 + }, + { + "epoch": 0.11135675100302955, + "grad_norm": 14.489233866558225, + "learning_rate": 1.9998264126462264e-06, + "loss": 0.3485, + "step": 425 + }, + { + "epoch": 0.11266683042659462, + "grad_norm": 7.983108029464349, + "learning_rate": 1.999783697990456e-06, + "loss": 0.3745, + "step": 430 + }, + { + "epoch": 0.11397690985015967, + "grad_norm": 6.210263065691716, + "learning_rate": 1.9997362901286328e-06, + "loss": 0.3134, + "step": 435 + }, + { + "epoch": 0.11528698927372472, + "grad_norm": 5.7095639740154445, + "learning_rate": 1.9996841892832997e-06, + "loss": 0.2956, + "step": 440 + }, + { + "epoch": 0.11659706869728978, + "grad_norm": 5.926962115905033, + "learning_rate": 1.9996273956990303e-06, + "loss": 0.3558, + "step": 445 + }, + { + "epoch": 0.11790714812085483, + "grad_norm": 7.303119154977969, + "learning_rate": 1.999565909642425e-06, + "loss": 0.3417, + "step": 450 + }, + { + "epoch": 0.11921722754441988, + "grad_norm": 10.835920993927127, + "learning_rate": 1.9994997314021146e-06, + "loss": 0.4127, + "step": 455 + }, + { + "epoch": 0.12052730696798493, + "grad_norm": 7.096230008529465, + "learning_rate": 1.999428861288753e-06, + "loss": 0.3652, + "step": 460 + }, + { + "epoch": 0.12183738639154999, + "grad_norm": 7.856938952483924, + "learning_rate": 1.999353299635021e-06, + "loss": 0.3319, + "step": 465 + }, + { + "epoch": 0.12314746581511504, + "grad_norm": 5.163875987837063, + "learning_rate": 1.9992730467956218e-06, + "loss": 0.3274, + "step": 470 + }, + { + "epoch": 0.12445754523868009, + "grad_norm": 5.096460744829759, + "learning_rate": 1.9991881031472787e-06, + "loss": 0.3369, + "step": 475 + }, + { + "epoch": 0.12576762466224514, + "grad_norm": 4.898833760747336, + "learning_rate": 1.9990984690887376e-06, + "loss": 0.3342, + "step": 480 + }, + { + "epoch": 0.1270777040858102, + "grad_norm": 8.546538011577958, + "learning_rate": 1.99900414504076e-06, + "loss": 0.3675, + "step": 485 + }, + { + "epoch": 0.12838778350937527, + "grad_norm": 9.545619233480027, + "learning_rate": 1.998905131446124e-06, + "loss": 0.2993, + "step": 490 + }, + { + "epoch": 0.1296978629329403, + "grad_norm": 6.0468014618988, + "learning_rate": 1.998801428769621e-06, + "loss": 0.3337, + "step": 495 + }, + { + "epoch": 0.13100794235650537, + "grad_norm": 6.525761511275149, + "learning_rate": 1.998693037498054e-06, + "loss": 0.3504, + "step": 500 + }, + { + "epoch": 0.13100794235650537, + "eval_accuracy": 0.6496, + "eval_loss": 0.7146463394165039, + "eval_runtime": 141.9904, + "eval_samples_per_second": 8.803, + "eval_steps_per_second": 2.204, + "step": 500 + }, + { + "epoch": 0.1323180217800704, + "grad_norm": 6.303773120304537, + "learning_rate": 1.9985799581402366e-06, + "loss": 0.3254, + "step": 505 + }, + { + "epoch": 0.13362810120363547, + "grad_norm": 7.995893571860963, + "learning_rate": 1.998462191226988e-06, + "loss": 0.3392, + "step": 510 + }, + { + "epoch": 0.13493818062720053, + "grad_norm": 9.171359972165128, + "learning_rate": 1.9983397373111318e-06, + "loss": 0.3223, + "step": 515 + }, + { + "epoch": 0.13624826005076557, + "grad_norm": 5.152252573879665, + "learning_rate": 1.9982125969674943e-06, + "loss": 0.3214, + "step": 520 + }, + { + "epoch": 0.13755833947433063, + "grad_norm": 6.810478020843169, + "learning_rate": 1.9980807707929e-06, + "loss": 0.3643, + "step": 525 + }, + { + "epoch": 0.1388684188978957, + "grad_norm": 6.555447149076346, + "learning_rate": 1.99794425940617e-06, + "loss": 0.3173, + "step": 530 + }, + { + "epoch": 0.14017849832146073, + "grad_norm": 7.460824741390232, + "learning_rate": 1.99780306344812e-06, + "loss": 0.362, + "step": 535 + }, + { + "epoch": 0.1414885777450258, + "grad_norm": 4.465161138093627, + "learning_rate": 1.997657183581554e-06, + "loss": 0.2876, + "step": 540 + }, + { + "epoch": 0.14279865716859086, + "grad_norm": 6.492929729490839, + "learning_rate": 1.997506620491265e-06, + "loss": 0.3412, + "step": 545 + }, + { + "epoch": 0.1441087365921559, + "grad_norm": 6.323199580280994, + "learning_rate": 1.9973513748840294e-06, + "loss": 0.2913, + "step": 550 + }, + { + "epoch": 0.14541881601572096, + "grad_norm": 6.860707779728365, + "learning_rate": 1.997191447488604e-06, + "loss": 0.2841, + "step": 555 + }, + { + "epoch": 0.146728895439286, + "grad_norm": 7.428003430414849, + "learning_rate": 1.9970268390557235e-06, + "loss": 0.3296, + "step": 560 + }, + { + "epoch": 0.14803897486285106, + "grad_norm": 9.809910396455075, + "learning_rate": 1.996857550358097e-06, + "loss": 0.3316, + "step": 565 + }, + { + "epoch": 0.14934905428641612, + "grad_norm": 6.120283708913698, + "learning_rate": 1.9966835821904022e-06, + "loss": 0.3227, + "step": 570 + }, + { + "epoch": 0.15065913370998116, + "grad_norm": 5.518190183534834, + "learning_rate": 1.9965049353692853e-06, + "loss": 0.3271, + "step": 575 + }, + { + "epoch": 0.15196921313354622, + "grad_norm": 4.956536091880624, + "learning_rate": 1.996321610733353e-06, + "loss": 0.3677, + "step": 580 + }, + { + "epoch": 0.15327929255711129, + "grad_norm": 6.782512343216961, + "learning_rate": 1.9961336091431724e-06, + "loss": 0.3538, + "step": 585 + }, + { + "epoch": 0.15458937198067632, + "grad_norm": 4.957135478968806, + "learning_rate": 1.995940931481264e-06, + "loss": 0.3716, + "step": 590 + }, + { + "epoch": 0.15589945140424138, + "grad_norm": 5.89621538406006, + "learning_rate": 1.9957435786521003e-06, + "loss": 0.3211, + "step": 595 + }, + { + "epoch": 0.15720953082780645, + "grad_norm": 3.4716470492850133, + "learning_rate": 1.9955415515820982e-06, + "loss": 0.3335, + "step": 600 + }, + { + "epoch": 0.15720953082780645, + "eval_accuracy": 0.74, + "eval_loss": 0.6648128628730774, + "eval_runtime": 143.2818, + "eval_samples_per_second": 8.724, + "eval_steps_per_second": 2.185, + "step": 600 + }, + { + "epoch": 0.15851961025137148, + "grad_norm": 4.420746512696032, + "learning_rate": 1.9953348512196184e-06, + "loss": 0.3074, + "step": 605 + }, + { + "epoch": 0.15982968967493655, + "grad_norm": 5.278777879896076, + "learning_rate": 1.9951234785349572e-06, + "loss": 0.3338, + "step": 610 + }, + { + "epoch": 0.16113976909850158, + "grad_norm": 7.753730093542852, + "learning_rate": 1.9949074345203457e-06, + "loss": 0.3409, + "step": 615 + }, + { + "epoch": 0.16244984852206665, + "grad_norm": 4.666850591356625, + "learning_rate": 1.9946867201899415e-06, + "loss": 0.3368, + "step": 620 + }, + { + "epoch": 0.1637599279456317, + "grad_norm": 3.574785990989966, + "learning_rate": 1.994461336579827e-06, + "loss": 0.2872, + "step": 625 + }, + { + "epoch": 0.16507000736919675, + "grad_norm": 6.370783353900673, + "learning_rate": 1.9942312847480032e-06, + "loss": 0.3223, + "step": 630 + }, + { + "epoch": 0.1663800867927618, + "grad_norm": 6.091101766679421, + "learning_rate": 1.993996565774384e-06, + "loss": 0.3247, + "step": 635 + }, + { + "epoch": 0.16769016621632687, + "grad_norm": 4.963319059049603, + "learning_rate": 1.9937571807607914e-06, + "loss": 0.3035, + "step": 640 + }, + { + "epoch": 0.1690002456398919, + "grad_norm": 5.763955927215438, + "learning_rate": 1.993513130830953e-06, + "loss": 0.3207, + "step": 645 + }, + { + "epoch": 0.17031032506345697, + "grad_norm": 5.873434630553368, + "learning_rate": 1.9932644171304922e-06, + "loss": 0.2886, + "step": 650 + }, + { + "epoch": 0.17162040448702204, + "grad_norm": 6.2646805543143165, + "learning_rate": 1.9930110408269265e-06, + "loss": 0.2844, + "step": 655 + }, + { + "epoch": 0.17293048391058707, + "grad_norm": 6.593113180342127, + "learning_rate": 1.992753003109661e-06, + "loss": 0.3156, + "step": 660 + }, + { + "epoch": 0.17424056333415214, + "grad_norm": 7.157645021880429, + "learning_rate": 1.9924903051899805e-06, + "loss": 0.2825, + "step": 665 + }, + { + "epoch": 0.17555064275771717, + "grad_norm": 8.060379496602742, + "learning_rate": 1.9922229483010486e-06, + "loss": 0.2938, + "step": 670 + }, + { + "epoch": 0.17686072218128224, + "grad_norm": 3.9961852237294413, + "learning_rate": 1.9919509336978966e-06, + "loss": 0.3503, + "step": 675 + }, + { + "epoch": 0.1781708016048473, + "grad_norm": 4.475800758356923, + "learning_rate": 1.9916742626574224e-06, + "loss": 0.3459, + "step": 680 + }, + { + "epoch": 0.17948088102841234, + "grad_norm": 4.593693404230203, + "learning_rate": 1.9913929364783804e-06, + "loss": 0.33, + "step": 685 + }, + { + "epoch": 0.1807909604519774, + "grad_norm": 5.413425332374862, + "learning_rate": 1.9911069564813783e-06, + "loss": 0.3051, + "step": 690 + }, + { + "epoch": 0.18210103987554246, + "grad_norm": 8.008275090324018, + "learning_rate": 1.9908163240088693e-06, + "loss": 0.3699, + "step": 695 + }, + { + "epoch": 0.1834111192991075, + "grad_norm": 5.528157261942457, + "learning_rate": 1.9905210404251465e-06, + "loss": 0.2891, + "step": 700 + }, + { + "epoch": 0.1834111192991075, + "eval_accuracy": 0.7104, + "eval_loss": 0.6656551957130432, + "eval_runtime": 142.4624, + "eval_samples_per_second": 8.774, + "eval_steps_per_second": 2.197, + "step": 700 + }, + { + "epoch": 0.18472119872267256, + "grad_norm": 3.6403780225909403, + "learning_rate": 1.9902211071163366e-06, + "loss": 0.287, + "step": 705 + }, + { + "epoch": 0.18603127814623763, + "grad_norm": 6.998780707143166, + "learning_rate": 1.989916525490393e-06, + "loss": 0.2794, + "step": 710 + }, + { + "epoch": 0.18734135756980266, + "grad_norm": 4.8809190411029055, + "learning_rate": 1.989607296977089e-06, + "loss": 0.3102, + "step": 715 + }, + { + "epoch": 0.18865143699336773, + "grad_norm": 4.433896161785547, + "learning_rate": 1.989293423028012e-06, + "loss": 0.3142, + "step": 720 + }, + { + "epoch": 0.18996151641693276, + "grad_norm": 4.968740406829342, + "learning_rate": 1.988974905116556e-06, + "loss": 0.2907, + "step": 725 + }, + { + "epoch": 0.19127159584049783, + "grad_norm": 5.0219581203222505, + "learning_rate": 1.988651744737914e-06, + "loss": 0.3004, + "step": 730 + }, + { + "epoch": 0.1925816752640629, + "grad_norm": 5.9858192432913215, + "learning_rate": 1.9883239434090727e-06, + "loss": 0.3099, + "step": 735 + }, + { + "epoch": 0.19389175468762793, + "grad_norm": 6.069766314309123, + "learning_rate": 1.9879915026688042e-06, + "loss": 0.3456, + "step": 740 + }, + { + "epoch": 0.195201834111193, + "grad_norm": 3.8932647987051365, + "learning_rate": 1.9876544240776593e-06, + "loss": 0.2827, + "step": 745 + }, + { + "epoch": 0.19651191353475805, + "grad_norm": 4.970870187009217, + "learning_rate": 1.987312709217959e-06, + "loss": 0.2862, + "step": 750 + }, + { + "epoch": 0.1978219929583231, + "grad_norm": 7.016260113036865, + "learning_rate": 1.9869663596937884e-06, + "loss": 0.2776, + "step": 755 + }, + { + "epoch": 0.19913207238188815, + "grad_norm": 9.66509560172156, + "learning_rate": 1.986615377130989e-06, + "loss": 0.2772, + "step": 760 + }, + { + "epoch": 0.20044215180545322, + "grad_norm": 5.77891746002653, + "learning_rate": 1.9862597631771508e-06, + "loss": 0.353, + "step": 765 + }, + { + "epoch": 0.20175223122901825, + "grad_norm": 6.520911408063365, + "learning_rate": 1.9858995195016044e-06, + "loss": 0.3101, + "step": 770 + }, + { + "epoch": 0.20306231065258332, + "grad_norm": 3.484594315471376, + "learning_rate": 1.9855346477954142e-06, + "loss": 0.2896, + "step": 775 + }, + { + "epoch": 0.20437239007614838, + "grad_norm": 4.684870944655888, + "learning_rate": 1.9851651497713672e-06, + "loss": 0.2596, + "step": 780 + }, + { + "epoch": 0.20568246949971342, + "grad_norm": 4.808149471025438, + "learning_rate": 1.9847910271639697e-06, + "loss": 0.3015, + "step": 785 + }, + { + "epoch": 0.20699254892327848, + "grad_norm": 4.156391998918141, + "learning_rate": 1.984412281729436e-06, + "loss": 0.2871, + "step": 790 + }, + { + "epoch": 0.20830262834684352, + "grad_norm": 8.382532262606357, + "learning_rate": 1.9840289152456814e-06, + "loss": 0.375, + "step": 795 + }, + { + "epoch": 0.20961270777040858, + "grad_norm": 4.571411739513223, + "learning_rate": 1.9836409295123127e-06, + "loss": 0.3006, + "step": 800 + }, + { + "epoch": 0.20961270777040858, + "eval_accuracy": 0.6704, + "eval_loss": 0.7644935250282288, + "eval_runtime": 138.0522, + "eval_samples_per_second": 9.055, + "eval_steps_per_second": 2.267, + "step": 800 + }, + { + "epoch": 0.21092278719397364, + "grad_norm": 2.825880146573424, + "learning_rate": 1.983248326350621e-06, + "loss": 0.2792, + "step": 805 + }, + { + "epoch": 0.21223286661753868, + "grad_norm": 4.072431366106783, + "learning_rate": 1.982851107603572e-06, + "loss": 0.256, + "step": 810 + }, + { + "epoch": 0.21354294604110374, + "grad_norm": 8.315895632884684, + "learning_rate": 1.982449275135799e-06, + "loss": 0.288, + "step": 815 + }, + { + "epoch": 0.2148530254646688, + "grad_norm": 6.617341511261604, + "learning_rate": 1.982042830833592e-06, + "loss": 0.2574, + "step": 820 + }, + { + "epoch": 0.21616310488823384, + "grad_norm": 7.4121422793295055, + "learning_rate": 1.981631776604892e-06, + "loss": 0.3751, + "step": 825 + }, + { + "epoch": 0.2174731843117989, + "grad_norm": 5.462630509848653, + "learning_rate": 1.9812161143792764e-06, + "loss": 0.3347, + "step": 830 + }, + { + "epoch": 0.21878326373536397, + "grad_norm": 4.397947143261118, + "learning_rate": 1.9807958461079574e-06, + "loss": 0.318, + "step": 835 + }, + { + "epoch": 0.220093343158929, + "grad_norm": 3.733736827246703, + "learning_rate": 1.980370973763767e-06, + "loss": 0.2653, + "step": 840 + }, + { + "epoch": 0.22140342258249407, + "grad_norm": 4.754151304483215, + "learning_rate": 1.9799414993411495e-06, + "loss": 0.2822, + "step": 845 + }, + { + "epoch": 0.2227135020060591, + "grad_norm": 5.574460765166766, + "learning_rate": 1.979507424856153e-06, + "loss": 0.336, + "step": 850 + }, + { + "epoch": 0.22402358142962417, + "grad_norm": 4.505363713646319, + "learning_rate": 1.97906875234642e-06, + "loss": 0.2928, + "step": 855 + }, + { + "epoch": 0.22533366085318923, + "grad_norm": 6.222786448829383, + "learning_rate": 1.9786254838711757e-06, + "loss": 0.2989, + "step": 860 + }, + { + "epoch": 0.22664374027675427, + "grad_norm": 6.1340912560899525, + "learning_rate": 1.9781776215112204e-06, + "loss": 0.2904, + "step": 865 + }, + { + "epoch": 0.22795381970031933, + "grad_norm": 4.747377998160214, + "learning_rate": 1.9777251673689198e-06, + "loss": 0.2786, + "step": 870 + }, + { + "epoch": 0.2292638991238844, + "grad_norm": 7.702780817923823, + "learning_rate": 1.9772681235681933e-06, + "loss": 0.3207, + "step": 875 + }, + { + "epoch": 0.23057397854744943, + "grad_norm": 3.7484591701091077, + "learning_rate": 1.976806492254506e-06, + "loss": 0.272, + "step": 880 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 6.027425431373605, + "learning_rate": 1.9763402755948574e-06, + "loss": 0.2878, + "step": 885 + }, + { + "epoch": 0.23319413739457956, + "grad_norm": 4.420243905346968, + "learning_rate": 1.975869475777772e-06, + "loss": 0.3112, + "step": 890 + }, + { + "epoch": 0.2345042168181446, + "grad_norm": 3.2086505540675474, + "learning_rate": 1.9753940950132874e-06, + "loss": 0.3328, + "step": 895 + }, + { + "epoch": 0.23581429624170966, + "grad_norm": 4.92351892053512, + "learning_rate": 1.9749141355329473e-06, + "loss": 0.3039, + "step": 900 + }, + { + "epoch": 0.23581429624170966, + "eval_accuracy": 0.7176, + "eval_loss": 0.6235886812210083, + "eval_runtime": 134.5821, + "eval_samples_per_second": 9.288, + "eval_steps_per_second": 2.326, + "step": 900 + }, + { + "epoch": 0.2371243756652747, + "grad_norm": 4.595098138514267, + "learning_rate": 1.9744295995897874e-06, + "loss": 0.3384, + "step": 905 + }, + { + "epoch": 0.23843445508883976, + "grad_norm": 5.875594511033142, + "learning_rate": 1.9739404894583262e-06, + "loss": 0.2493, + "step": 910 + }, + { + "epoch": 0.23974453451240482, + "grad_norm": 4.943964557160407, + "learning_rate": 1.9734468074345555e-06, + "loss": 0.3264, + "step": 915 + }, + { + "epoch": 0.24105461393596986, + "grad_norm": 3.4784751132842167, + "learning_rate": 1.9729485558359286e-06, + "loss": 0.2736, + "step": 920 + }, + { + "epoch": 0.24236469335953492, + "grad_norm": 5.689006473597506, + "learning_rate": 1.9724457370013474e-06, + "loss": 0.2991, + "step": 925 + }, + { + "epoch": 0.24367477278309999, + "grad_norm": 4.95697152099517, + "learning_rate": 1.971938353291156e-06, + "loss": 0.3023, + "step": 930 + }, + { + "epoch": 0.24498485220666502, + "grad_norm": 3.698150841899642, + "learning_rate": 1.9714264070871254e-06, + "loss": 0.3104, + "step": 935 + }, + { + "epoch": 0.24629493163023009, + "grad_norm": 3.485873023817148, + "learning_rate": 1.970909900792444e-06, + "loss": 0.296, + "step": 940 + }, + { + "epoch": 0.24760501105379515, + "grad_norm": 4.428297690192133, + "learning_rate": 1.9703888368317084e-06, + "loss": 0.349, + "step": 945 + }, + { + "epoch": 0.24891509047736018, + "grad_norm": 3.1990684592166447, + "learning_rate": 1.969863217650906e-06, + "loss": 0.2494, + "step": 950 + }, + { + "epoch": 0.25022516990092525, + "grad_norm": 5.662300170468204, + "learning_rate": 1.9693330457174113e-06, + "loss": 0.3193, + "step": 955 + }, + { + "epoch": 0.2515352493244903, + "grad_norm": 3.389632162628184, + "learning_rate": 1.968798323519968e-06, + "loss": 0.3378, + "step": 960 + }, + { + "epoch": 0.2528453287480554, + "grad_norm": 3.776539689852539, + "learning_rate": 1.9682590535686804e-06, + "loss": 0.2909, + "step": 965 + }, + { + "epoch": 0.2541554081716204, + "grad_norm": 2.739785839636218, + "learning_rate": 1.9677152383950014e-06, + "loss": 0.2877, + "step": 970 + }, + { + "epoch": 0.25546548759518545, + "grad_norm": 4.226352508831814, + "learning_rate": 1.9671668805517197e-06, + "loss": 0.2917, + "step": 975 + }, + { + "epoch": 0.25677556701875054, + "grad_norm": 4.092459875987079, + "learning_rate": 1.9666139826129482e-06, + "loss": 0.3101, + "step": 980 + }, + { + "epoch": 0.2580856464423156, + "grad_norm": 3.022031693799492, + "learning_rate": 1.9660565471741133e-06, + "loss": 0.2451, + "step": 985 + }, + { + "epoch": 0.2593957258658806, + "grad_norm": 5.445606320908549, + "learning_rate": 1.965494576851939e-06, + "loss": 0.2803, + "step": 990 + }, + { + "epoch": 0.26070580528944565, + "grad_norm": 6.789604538228405, + "learning_rate": 1.9649280742844383e-06, + "loss": 0.3155, + "step": 995 + }, + { + "epoch": 0.26201588471301074, + "grad_norm": 9.717312792399483, + "learning_rate": 1.9643570421309013e-06, + "loss": 0.354, + "step": 1000 + }, + { + "epoch": 0.26201588471301074, + "eval_accuracy": 0.7272, + "eval_loss": 0.641910970211029, + "eval_runtime": 135.3149, + "eval_samples_per_second": 9.238, + "eval_steps_per_second": 2.313, + "step": 1000 + }, + { + "epoch": 0.2633259641365758, + "grad_norm": 3.570212334750135, + "learning_rate": 1.9637814830718784e-06, + "loss": 0.2197, + "step": 1005 + }, + { + "epoch": 0.2646360435601408, + "grad_norm": 5.497952454503209, + "learning_rate": 1.9632013998091708e-06, + "loss": 0.2843, + "step": 1010 + }, + { + "epoch": 0.2659461229837059, + "grad_norm": 4.3351899402575595, + "learning_rate": 1.962616795065819e-06, + "loss": 0.3116, + "step": 1015 + }, + { + "epoch": 0.26725620240727094, + "grad_norm": 5.059296767345353, + "learning_rate": 1.962027671586086e-06, + "loss": 0.2732, + "step": 1020 + }, + { + "epoch": 0.268566281830836, + "grad_norm": 3.636972491245334, + "learning_rate": 1.961434032135448e-06, + "loss": 0.3015, + "step": 1025 + }, + { + "epoch": 0.26987636125440106, + "grad_norm": 2.886855538209043, + "learning_rate": 1.9608358795005805e-06, + "loss": 0.271, + "step": 1030 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 4.518933069509563, + "learning_rate": 1.960233216489344e-06, + "loss": 0.2875, + "step": 1035 + }, + { + "epoch": 0.27249652010153114, + "grad_norm": 6.776126720418537, + "learning_rate": 1.959626045930773e-06, + "loss": 0.3297, + "step": 1040 + }, + { + "epoch": 0.27380659952509623, + "grad_norm": 3.3329344336767175, + "learning_rate": 1.9590143706750595e-06, + "loss": 0.3023, + "step": 1045 + }, + { + "epoch": 0.27511667894866126, + "grad_norm": 8.048171208434075, + "learning_rate": 1.958398193593543e-06, + "loss": 0.334, + "step": 1050 + }, + { + "epoch": 0.2764267583722263, + "grad_norm": 3.78319855061, + "learning_rate": 1.9577775175786944e-06, + "loss": 0.2919, + "step": 1055 + }, + { + "epoch": 0.2777368377957914, + "grad_norm": 3.3427967602303164, + "learning_rate": 1.957152345544106e-06, + "loss": 0.3142, + "step": 1060 + }, + { + "epoch": 0.2790469172193564, + "grad_norm": 5.104886446268955, + "learning_rate": 1.9565226804244723e-06, + "loss": 0.3025, + "step": 1065 + }, + { + "epoch": 0.28035699664292146, + "grad_norm": 2.0521842311663114, + "learning_rate": 1.9558885251755814e-06, + "loss": 0.2591, + "step": 1070 + }, + { + "epoch": 0.28166707606648655, + "grad_norm": 2.77191138245992, + "learning_rate": 1.955249882774298e-06, + "loss": 0.3224, + "step": 1075 + }, + { + "epoch": 0.2829771554900516, + "grad_norm": 5.0369322317805185, + "learning_rate": 1.954606756218552e-06, + "loss": 0.3104, + "step": 1080 + }, + { + "epoch": 0.2842872349136166, + "grad_norm": 7.9829960694246624, + "learning_rate": 1.9539591485273207e-06, + "loss": 0.2774, + "step": 1085 + }, + { + "epoch": 0.2855973143371817, + "grad_norm": 6.1487449662425835, + "learning_rate": 1.953307062740619e-06, + "loss": 0.3271, + "step": 1090 + }, + { + "epoch": 0.28690739376074675, + "grad_norm": 4.902123463001405, + "learning_rate": 1.952650501919481e-06, + "loss": 0.3087, + "step": 1095 + }, + { + "epoch": 0.2882174731843118, + "grad_norm": 5.005642536094271, + "learning_rate": 1.9519894691459488e-06, + "loss": 0.3698, + "step": 1100 + }, + { + "epoch": 0.2882174731843118, + "eval_accuracy": 0.7104, + "eval_loss": 0.7176594734191895, + "eval_runtime": 136.7009, + "eval_samples_per_second": 9.144, + "eval_steps_per_second": 2.29, + "step": 1100 + }, + { + "epoch": 0.2895275526078768, + "grad_norm": 3.1207339769636837, + "learning_rate": 1.951323967523057e-06, + "loss": 0.3232, + "step": 1105 + }, + { + "epoch": 0.2908376320314419, + "grad_norm": 3.1159736941377147, + "learning_rate": 1.9506540001748172e-06, + "loss": 0.2797, + "step": 1110 + }, + { + "epoch": 0.29214771145500695, + "grad_norm": 4.293077428446999, + "learning_rate": 1.9499795702462047e-06, + "loss": 0.3155, + "step": 1115 + }, + { + "epoch": 0.293457790878572, + "grad_norm": 3.8028097270147794, + "learning_rate": 1.949300680903143e-06, + "loss": 0.21, + "step": 1120 + }, + { + "epoch": 0.2947678703021371, + "grad_norm": 4.84671496698013, + "learning_rate": 1.948617335332489e-06, + "loss": 0.2633, + "step": 1125 + }, + { + "epoch": 0.2960779497257021, + "grad_norm": 3.960211317187865, + "learning_rate": 1.947929536742018e-06, + "loss": 0.2954, + "step": 1130 + }, + { + "epoch": 0.29738802914926715, + "grad_norm": 6.0168036176339985, + "learning_rate": 1.947237288360408e-06, + "loss": 0.284, + "step": 1135 + }, + { + "epoch": 0.29869810857283224, + "grad_norm": 3.518422321473526, + "learning_rate": 1.946540593437228e-06, + "loss": 0.2759, + "step": 1140 + }, + { + "epoch": 0.3000081879963973, + "grad_norm": 5.514253446140628, + "learning_rate": 1.945839455242917e-06, + "loss": 0.2304, + "step": 1145 + }, + { + "epoch": 0.3013182674199623, + "grad_norm": 8.396945191804443, + "learning_rate": 1.945133877068773e-06, + "loss": 0.3492, + "step": 1150 + }, + { + "epoch": 0.3026283468435274, + "grad_norm": 2.9344408033676497, + "learning_rate": 1.9444238622269366e-06, + "loss": 0.2529, + "step": 1155 + }, + { + "epoch": 0.30393842626709244, + "grad_norm": 4.341884742564996, + "learning_rate": 1.9437094140503745e-06, + "loss": 0.2763, + "step": 1160 + }, + { + "epoch": 0.3052485056906575, + "grad_norm": 4.144835095588702, + "learning_rate": 1.9429905358928646e-06, + "loss": 0.2992, + "step": 1165 + }, + { + "epoch": 0.30655858511422257, + "grad_norm": 4.571896652829111, + "learning_rate": 1.9422672311289797e-06, + "loss": 0.2094, + "step": 1170 + }, + { + "epoch": 0.3078686645377876, + "grad_norm": 6.487182515831676, + "learning_rate": 1.9415395031540734e-06, + "loss": 0.3184, + "step": 1175 + }, + { + "epoch": 0.30917874396135264, + "grad_norm": 6.2890432701053784, + "learning_rate": 1.9408073553842614e-06, + "loss": 0.2885, + "step": 1180 + }, + { + "epoch": 0.31048882338491773, + "grad_norm": 5.261502639917455, + "learning_rate": 1.9400707912564078e-06, + "loss": 0.2425, + "step": 1185 + }, + { + "epoch": 0.31179890280848277, + "grad_norm": 5.4319235826756715, + "learning_rate": 1.939329814228107e-06, + "loss": 0.3138, + "step": 1190 + }, + { + "epoch": 0.3131089822320478, + "grad_norm": 4.569463003103509, + "learning_rate": 1.93858442777767e-06, + "loss": 0.2905, + "step": 1195 + }, + { + "epoch": 0.3144190616556129, + "grad_norm": 6.51841356160416, + "learning_rate": 1.9378346354041057e-06, + "loss": 0.2544, + "step": 1200 + }, + { + "epoch": 0.3144190616556129, + "eval_accuracy": 0.7032, + "eval_loss": 0.7195152640342712, + "eval_runtime": 139.9321, + "eval_samples_per_second": 8.933, + "eval_steps_per_second": 2.237, + "step": 1200 + }, + { + "epoch": 0.31572914107917793, + "grad_norm": 6.07329817723254, + "learning_rate": 1.9370804406271053e-06, + "loss": 0.3082, + "step": 1205 + }, + { + "epoch": 0.31703922050274297, + "grad_norm": 5.5739656052051565, + "learning_rate": 1.936321846987026e-06, + "loss": 0.2982, + "step": 1210 + }, + { + "epoch": 0.318349299926308, + "grad_norm": 3.19437710777185, + "learning_rate": 1.9355588580448743e-06, + "loss": 0.2404, + "step": 1215 + }, + { + "epoch": 0.3196593793498731, + "grad_norm": 4.020281104033066, + "learning_rate": 1.9347914773822897e-06, + "loss": 0.3113, + "step": 1220 + }, + { + "epoch": 0.32096945877343813, + "grad_norm": 4.494719920151199, + "learning_rate": 1.9340197086015267e-06, + "loss": 0.3129, + "step": 1225 + }, + { + "epoch": 0.32227953819700317, + "grad_norm": 4.154261834382855, + "learning_rate": 1.9332435553254386e-06, + "loss": 0.3315, + "step": 1230 + }, + { + "epoch": 0.32358961762056826, + "grad_norm": 4.748202443565547, + "learning_rate": 1.932463021197461e-06, + "loss": 0.2484, + "step": 1235 + }, + { + "epoch": 0.3248996970441333, + "grad_norm": 3.4246878831109404, + "learning_rate": 1.9316781098815938e-06, + "loss": 0.2892, + "step": 1240 + }, + { + "epoch": 0.32620977646769833, + "grad_norm": 2.680374700010348, + "learning_rate": 1.930888825062385e-06, + "loss": 0.2731, + "step": 1245 + }, + { + "epoch": 0.3275198558912634, + "grad_norm": 5.622466682481238, + "learning_rate": 1.9300951704449113e-06, + "loss": 0.3281, + "step": 1250 + }, + { + "epoch": 0.32882993531482846, + "grad_norm": 4.573189599074694, + "learning_rate": 1.929297149754764e-06, + "loss": 0.3044, + "step": 1255 + }, + { + "epoch": 0.3301400147383935, + "grad_norm": 3.977565315266715, + "learning_rate": 1.928494766738029e-06, + "loss": 0.3347, + "step": 1260 + }, + { + "epoch": 0.3314500941619586, + "grad_norm": 3.267696615745348, + "learning_rate": 1.927688025161269e-06, + "loss": 0.273, + "step": 1265 + }, + { + "epoch": 0.3327601735855236, + "grad_norm": 3.7685478337164535, + "learning_rate": 1.9268769288115083e-06, + "loss": 0.308, + "step": 1270 + }, + { + "epoch": 0.33407025300908866, + "grad_norm": 4.733168908675778, + "learning_rate": 1.9260614814962127e-06, + "loss": 0.2864, + "step": 1275 + }, + { + "epoch": 0.33538033243265375, + "grad_norm": 3.7829911630990756, + "learning_rate": 1.9252416870432723e-06, + "loss": 0.2763, + "step": 1280 + }, + { + "epoch": 0.3366904118562188, + "grad_norm": 4.817098610717063, + "learning_rate": 1.9244175493009836e-06, + "loss": 0.2661, + "step": 1285 + }, + { + "epoch": 0.3380004912797838, + "grad_norm": 5.155494294997122, + "learning_rate": 1.9235890721380323e-06, + "loss": 0.3272, + "step": 1290 + }, + { + "epoch": 0.3393105707033489, + "grad_norm": 3.9443526122441295, + "learning_rate": 1.9227562594434733e-06, + "loss": 0.3294, + "step": 1295 + }, + { + "epoch": 0.34062065012691395, + "grad_norm": 3.268646227982553, + "learning_rate": 1.9219191151267133e-06, + "loss": 0.2571, + "step": 1300 + }, + { + "epoch": 0.34062065012691395, + "eval_accuracy": 0.712, + "eval_loss": 0.7290279269218445, + "eval_runtime": 139.9623, + "eval_samples_per_second": 8.931, + "eval_steps_per_second": 2.236, + "step": 1300 + }, + { + "epoch": 0.341930729550479, + "grad_norm": 4.156705015549692, + "learning_rate": 1.9210776431174937e-06, + "loss": 0.296, + "step": 1305 + }, + { + "epoch": 0.3432408089740441, + "grad_norm": 4.1937292159776405, + "learning_rate": 1.9202318473658702e-06, + "loss": 0.2799, + "step": 1310 + }, + { + "epoch": 0.3445508883976091, + "grad_norm": 3.5322347963356866, + "learning_rate": 1.9193817318421952e-06, + "loss": 0.2803, + "step": 1315 + }, + { + "epoch": 0.34586096782117415, + "grad_norm": 5.317835964163557, + "learning_rate": 1.9185273005371e-06, + "loss": 0.2849, + "step": 1320 + }, + { + "epoch": 0.34717104724473924, + "grad_norm": 5.169820633932056, + "learning_rate": 1.9176685574614733e-06, + "loss": 0.2987, + "step": 1325 + }, + { + "epoch": 0.3484811266683043, + "grad_norm": 4.982709983606647, + "learning_rate": 1.9168055066464457e-06, + "loss": 0.2716, + "step": 1330 + }, + { + "epoch": 0.3497912060918693, + "grad_norm": 4.866013018973415, + "learning_rate": 1.9159381521433684e-06, + "loss": 0.2766, + "step": 1335 + }, + { + "epoch": 0.35110128551543435, + "grad_norm": 4.025011433913149, + "learning_rate": 1.9150664980237964e-06, + "loss": 0.2584, + "step": 1340 + }, + { + "epoch": 0.35241136493899944, + "grad_norm": 3.8599220124227545, + "learning_rate": 1.9141905483794664e-06, + "loss": 0.3204, + "step": 1345 + }, + { + "epoch": 0.3537214443625645, + "grad_norm": 3.79972737879995, + "learning_rate": 1.91331030732228e-06, + "loss": 0.2836, + "step": 1350 + }, + { + "epoch": 0.3550315237861295, + "grad_norm": 2.919813006103404, + "learning_rate": 1.9124257789842843e-06, + "loss": 0.2587, + "step": 1355 + }, + { + "epoch": 0.3563416032096946, + "grad_norm": 5.170605350757861, + "learning_rate": 1.9115369675176504e-06, + "loss": 0.3065, + "step": 1360 + }, + { + "epoch": 0.35765168263325964, + "grad_norm": 6.1588988469192065, + "learning_rate": 1.910643877094656e-06, + "loss": 0.3447, + "step": 1365 + }, + { + "epoch": 0.3589617620568247, + "grad_norm": 2.6672804726236303, + "learning_rate": 1.9097465119076665e-06, + "loss": 0.3036, + "step": 1370 + }, + { + "epoch": 0.36027184148038977, + "grad_norm": 3.4333435314335983, + "learning_rate": 1.908844876169112e-06, + "loss": 0.2682, + "step": 1375 + }, + { + "epoch": 0.3615819209039548, + "grad_norm": 2.6540494040134153, + "learning_rate": 1.9079389741114696e-06, + "loss": 0.2592, + "step": 1380 + }, + { + "epoch": 0.36289200032751984, + "grad_norm": 4.249055358619522, + "learning_rate": 1.9070288099872452e-06, + "loss": 0.2605, + "step": 1385 + }, + { + "epoch": 0.36420207975108493, + "grad_norm": 4.836139691290449, + "learning_rate": 1.9061143880689503e-06, + "loss": 0.2977, + "step": 1390 + }, + { + "epoch": 0.36551215917464996, + "grad_norm": 4.705615420915993, + "learning_rate": 1.905195712649084e-06, + "loss": 0.3444, + "step": 1395 + }, + { + "epoch": 0.366822238598215, + "grad_norm": 3.2537510054626515, + "learning_rate": 1.9042727880401122e-06, + "loss": 0.3558, + "step": 1400 + }, + { + "epoch": 0.366822238598215, + "eval_accuracy": 0.676, + "eval_loss": 0.7812010049819946, + "eval_runtime": 137.3509, + "eval_samples_per_second": 9.101, + "eval_steps_per_second": 2.279, + "step": 1400 + }, + { + "epoch": 0.3681323180217801, + "grad_norm": 2.7663311029982856, + "learning_rate": 1.9033456185744469e-06, + "loss": 0.2985, + "step": 1405 + }, + { + "epoch": 0.3694423974453451, + "grad_norm": 2.6580405529803808, + "learning_rate": 1.9024142086044277e-06, + "loss": 0.2834, + "step": 1410 + }, + { + "epoch": 0.37075247686891016, + "grad_norm": 3.6849674325031407, + "learning_rate": 1.9014785625022985e-06, + "loss": 0.2779, + "step": 1415 + }, + { + "epoch": 0.37206255629247525, + "grad_norm": 4.020792652442759, + "learning_rate": 1.9005386846601893e-06, + "loss": 0.2472, + "step": 1420 + }, + { + "epoch": 0.3733726357160403, + "grad_norm": 4.526317952946907, + "learning_rate": 1.8995945794900953e-06, + "loss": 0.2786, + "step": 1425 + }, + { + "epoch": 0.3746827151396053, + "grad_norm": 4.123722032882601, + "learning_rate": 1.8986462514238547e-06, + "loss": 0.2833, + "step": 1430 + }, + { + "epoch": 0.3759927945631704, + "grad_norm": 6.744679094753499, + "learning_rate": 1.8976937049131298e-06, + "loss": 0.3072, + "step": 1435 + }, + { + "epoch": 0.37730287398673545, + "grad_norm": 4.370200879451724, + "learning_rate": 1.8967369444293847e-06, + "loss": 0.25, + "step": 1440 + }, + { + "epoch": 0.3786129534103005, + "grad_norm": 2.6567231964667424, + "learning_rate": 1.8957759744638651e-06, + "loss": 0.2461, + "step": 1445 + }, + { + "epoch": 0.3799230328338655, + "grad_norm": 4.993924185098615, + "learning_rate": 1.8948107995275761e-06, + "loss": 0.2457, + "step": 1450 + }, + { + "epoch": 0.3812331122574306, + "grad_norm": 5.890536303510147, + "learning_rate": 1.8938414241512637e-06, + "loss": 0.3337, + "step": 1455 + }, + { + "epoch": 0.38254319168099565, + "grad_norm": 6.086658941241191, + "learning_rate": 1.8928678528853895e-06, + "loss": 0.261, + "step": 1460 + }, + { + "epoch": 0.3838532711045607, + "grad_norm": 2.5029858692820444, + "learning_rate": 1.8918900903001136e-06, + "loss": 0.2623, + "step": 1465 + }, + { + "epoch": 0.3851633505281258, + "grad_norm": 3.1670655034154347, + "learning_rate": 1.8909081409852692e-06, + "loss": 0.3239, + "step": 1470 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 6.92258040722732, + "learning_rate": 1.8899220095503442e-06, + "loss": 0.3251, + "step": 1475 + }, + { + "epoch": 0.38778350937525585, + "grad_norm": 2.459950138600639, + "learning_rate": 1.888931700624458e-06, + "loss": 0.2865, + "step": 1480 + }, + { + "epoch": 0.38909358879882094, + "grad_norm": 4.566006943218545, + "learning_rate": 1.8879372188563396e-06, + "loss": 0.2919, + "step": 1485 + }, + { + "epoch": 0.390403668222386, + "grad_norm": 6.1169971997341515, + "learning_rate": 1.8869385689143069e-06, + "loss": 0.3248, + "step": 1490 + }, + { + "epoch": 0.391713747645951, + "grad_norm": 4.321962895847691, + "learning_rate": 1.885935755486244e-06, + "loss": 0.2497, + "step": 1495 + }, + { + "epoch": 0.3930238270695161, + "grad_norm": 3.786195016300289, + "learning_rate": 1.8849287832795785e-06, + "loss": 0.2842, + "step": 1500 + }, + { + "epoch": 0.3930238270695161, + "eval_accuracy": 0.6952, + "eval_loss": 0.7388889789581299, + "eval_runtime": 140.4298, + "eval_samples_per_second": 8.901, + "eval_steps_per_second": 2.229, + "step": 1500 + }, + { + "epoch": 0.39433390649308114, + "grad_norm": 4.144022239569388, + "learning_rate": 1.8839176570212619e-06, + "loss": 0.2776, + "step": 1505 + }, + { + "epoch": 0.3956439859166462, + "grad_norm": 3.254154282020282, + "learning_rate": 1.882902381457744e-06, + "loss": 0.3046, + "step": 1510 + }, + { + "epoch": 0.39695406534021127, + "grad_norm": 3.7972516111759465, + "learning_rate": 1.8818829613549532e-06, + "loss": 0.3571, + "step": 1515 + }, + { + "epoch": 0.3982641447637763, + "grad_norm": 2.7098946167159217, + "learning_rate": 1.8808594014982736e-06, + "loss": 0.3086, + "step": 1520 + }, + { + "epoch": 0.39957422418734134, + "grad_norm": 2.395987116328653, + "learning_rate": 1.879831706692521e-06, + "loss": 0.2955, + "step": 1525 + }, + { + "epoch": 0.40088430361090643, + "grad_norm": 5.567909535649407, + "learning_rate": 1.8787998817619233e-06, + "loss": 0.3045, + "step": 1530 + }, + { + "epoch": 0.40219438303447147, + "grad_norm": 4.680059669737792, + "learning_rate": 1.8777639315500945e-06, + "loss": 0.2648, + "step": 1535 + }, + { + "epoch": 0.4035044624580365, + "grad_norm": 8.387574343664538, + "learning_rate": 1.876723860920015e-06, + "loss": 0.3123, + "step": 1540 + }, + { + "epoch": 0.4048145418816016, + "grad_norm": 2.023022886159982, + "learning_rate": 1.8756796747540057e-06, + "loss": 0.2561, + "step": 1545 + }, + { + "epoch": 0.40612462130516663, + "grad_norm": 4.634489921732028, + "learning_rate": 1.8746313779537087e-06, + "loss": 0.3115, + "step": 1550 + }, + { + "epoch": 0.40743470072873167, + "grad_norm": 3.401314650673904, + "learning_rate": 1.8735789754400603e-06, + "loss": 0.2493, + "step": 1555 + }, + { + "epoch": 0.40874478015229676, + "grad_norm": 2.586199787021852, + "learning_rate": 1.8725224721532715e-06, + "loss": 0.2521, + "step": 1560 + }, + { + "epoch": 0.4100548595758618, + "grad_norm": 4.393631624962878, + "learning_rate": 1.8714618730528024e-06, + "loss": 0.2817, + "step": 1565 + }, + { + "epoch": 0.41136493899942683, + "grad_norm": 5.57797489979425, + "learning_rate": 1.8703971831173405e-06, + "loss": 0.2937, + "step": 1570 + }, + { + "epoch": 0.41267501842299187, + "grad_norm": 3.4687053444228235, + "learning_rate": 1.8693284073447755e-06, + "loss": 0.3344, + "step": 1575 + }, + { + "epoch": 0.41398509784655696, + "grad_norm": 4.141659335286871, + "learning_rate": 1.868255550752178e-06, + "loss": 0.2546, + "step": 1580 + }, + { + "epoch": 0.415295177270122, + "grad_norm": 7.531913864092293, + "learning_rate": 1.8671786183757741e-06, + "loss": 0.2992, + "step": 1585 + }, + { + "epoch": 0.41660525669368703, + "grad_norm": 5.209397304260616, + "learning_rate": 1.866097615270923e-06, + "loss": 0.2978, + "step": 1590 + }, + { + "epoch": 0.4179153361172521, + "grad_norm": 4.9952265985686966, + "learning_rate": 1.865012546512092e-06, + "loss": 0.2386, + "step": 1595 + }, + { + "epoch": 0.41922541554081716, + "grad_norm": 4.620451674592193, + "learning_rate": 1.863923417192835e-06, + "loss": 0.3012, + "step": 1600 + }, + { + "epoch": 0.41922541554081716, + "eval_accuracy": 0.7088, + "eval_loss": 0.7305626273155212, + "eval_runtime": 137.487, + "eval_samples_per_second": 9.092, + "eval_steps_per_second": 2.277, + "step": 1600 + }, + { + "epoch": 0.4205354949643822, + "grad_norm": 3.8216787722513335, + "learning_rate": 1.8628302324257664e-06, + "loss": 0.2886, + "step": 1605 + }, + { + "epoch": 0.4218455743879473, + "grad_norm": 4.302516444293675, + "learning_rate": 1.8617329973425364e-06, + "loss": 0.2986, + "step": 1610 + }, + { + "epoch": 0.4231556538115123, + "grad_norm": 2.5052399973675823, + "learning_rate": 1.86063171709381e-06, + "loss": 0.2977, + "step": 1615 + }, + { + "epoch": 0.42446573323507736, + "grad_norm": 4.561793077013278, + "learning_rate": 1.8595263968492407e-06, + "loss": 0.3231, + "step": 1620 + }, + { + "epoch": 0.42577581265864245, + "grad_norm": 7.8865966163916426, + "learning_rate": 1.8584170417974465e-06, + "loss": 0.3202, + "step": 1625 + }, + { + "epoch": 0.4270858920822075, + "grad_norm": 3.674689043482507, + "learning_rate": 1.857303657145985e-06, + "loss": 0.2683, + "step": 1630 + }, + { + "epoch": 0.4283959715057725, + "grad_norm": 2.8123441864934, + "learning_rate": 1.8561862481213313e-06, + "loss": 0.2893, + "step": 1635 + }, + { + "epoch": 0.4297060509293376, + "grad_norm": 2.5957429423912854, + "learning_rate": 1.85506481996885e-06, + "loss": 0.3001, + "step": 1640 + }, + { + "epoch": 0.43101613035290265, + "grad_norm": 6.272609371298048, + "learning_rate": 1.8539393779527735e-06, + "loss": 0.2944, + "step": 1645 + }, + { + "epoch": 0.4323262097764677, + "grad_norm": 5.300171596731147, + "learning_rate": 1.8528099273561754e-06, + "loss": 0.2443, + "step": 1650 + }, + { + "epoch": 0.4336362892000328, + "grad_norm": 4.309301759126504, + "learning_rate": 1.8516764734809475e-06, + "loss": 0.2504, + "step": 1655 + }, + { + "epoch": 0.4349463686235978, + "grad_norm": 2.716951228905198, + "learning_rate": 1.8505390216477732e-06, + "loss": 0.2625, + "step": 1660 + }, + { + "epoch": 0.43625644804716285, + "grad_norm": 4.606417589142611, + "learning_rate": 1.8493975771961026e-06, + "loss": 0.2715, + "step": 1665 + }, + { + "epoch": 0.43756652747072794, + "grad_norm": 3.7628490993032444, + "learning_rate": 1.8482521454841296e-06, + "loss": 0.3187, + "step": 1670 + }, + { + "epoch": 0.438876606894293, + "grad_norm": 3.6806362340314878, + "learning_rate": 1.8471027318887632e-06, + "loss": 0.2446, + "step": 1675 + }, + { + "epoch": 0.440186686317858, + "grad_norm": 2.8358474618960554, + "learning_rate": 1.8459493418056064e-06, + "loss": 0.2803, + "step": 1680 + }, + { + "epoch": 0.44149676574142305, + "grad_norm": 3.559305323246588, + "learning_rate": 1.8447919806489272e-06, + "loss": 0.3376, + "step": 1685 + }, + { + "epoch": 0.44280684516498814, + "grad_norm": 3.18546710084024, + "learning_rate": 1.8436306538516348e-06, + "loss": 0.2526, + "step": 1690 + }, + { + "epoch": 0.4441169245885532, + "grad_norm": 2.2770988367831317, + "learning_rate": 1.8424653668652548e-06, + "loss": 0.2878, + "step": 1695 + }, + { + "epoch": 0.4454270040121182, + "grad_norm": 2.6311103553113186, + "learning_rate": 1.8412961251599021e-06, + "loss": 0.323, + "step": 1700 + }, + { + "epoch": 0.4454270040121182, + "eval_accuracy": 0.7104, + "eval_loss": 0.7182445526123047, + "eval_runtime": 139.4262, + "eval_samples_per_second": 8.965, + "eval_steps_per_second": 2.245, + "step": 1700 + }, + { + "epoch": 0.4467370834356833, + "grad_norm": 2.7960747447704275, + "learning_rate": 1.8401229342242564e-06, + "loss": 0.3345, + "step": 1705 + }, + { + "epoch": 0.44804716285924834, + "grad_norm": 2.303028253758041, + "learning_rate": 1.8389457995655354e-06, + "loss": 0.2837, + "step": 1710 + }, + { + "epoch": 0.4493572422828134, + "grad_norm": 3.6261923466611763, + "learning_rate": 1.8377647267094699e-06, + "loss": 0.2656, + "step": 1715 + }, + { + "epoch": 0.45066732170637847, + "grad_norm": 4.89861955448886, + "learning_rate": 1.8365797212002777e-06, + "loss": 0.276, + "step": 1720 + }, + { + "epoch": 0.4519774011299435, + "grad_norm": 6.264789410236653, + "learning_rate": 1.8353907886006369e-06, + "loss": 0.3056, + "step": 1725 + }, + { + "epoch": 0.45328748055350854, + "grad_norm": 2.51930771111435, + "learning_rate": 1.8341979344916601e-06, + "loss": 0.2885, + "step": 1730 + }, + { + "epoch": 0.45459755997707363, + "grad_norm": 6.288406487126684, + "learning_rate": 1.833001164472869e-06, + "loss": 0.3229, + "step": 1735 + }, + { + "epoch": 0.45590763940063866, + "grad_norm": 5.444204531738827, + "learning_rate": 1.8318004841621666e-06, + "loss": 0.2589, + "step": 1740 + }, + { + "epoch": 0.4572177188242037, + "grad_norm": 6.474274016450882, + "learning_rate": 1.8305958991958126e-06, + "loss": 0.2912, + "step": 1745 + }, + { + "epoch": 0.4585277982477688, + "grad_norm": 6.449122812307309, + "learning_rate": 1.8293874152283952e-06, + "loss": 0.2992, + "step": 1750 + }, + { + "epoch": 0.45983787767133383, + "grad_norm": 4.641695980313187, + "learning_rate": 1.8281750379328061e-06, + "loss": 0.3278, + "step": 1755 + }, + { + "epoch": 0.46114795709489886, + "grad_norm": 2.681087051955473, + "learning_rate": 1.8269587730002125e-06, + "loss": 0.255, + "step": 1760 + }, + { + "epoch": 0.46245803651846396, + "grad_norm": 6.151626591739763, + "learning_rate": 1.8257386261400316e-06, + "loss": 0.2494, + "step": 1765 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 4.505233549633642, + "learning_rate": 1.8245146030799025e-06, + "loss": 0.3442, + "step": 1770 + }, + { + "epoch": 0.465078195365594, + "grad_norm": 4.205788305228986, + "learning_rate": 1.8232867095656608e-06, + "loss": 0.3093, + "step": 1775 + }, + { + "epoch": 0.4663882747891591, + "grad_norm": 3.5089694683317174, + "learning_rate": 1.8220549513613104e-06, + "loss": 0.2846, + "step": 1780 + }, + { + "epoch": 0.46769835421272415, + "grad_norm": 4.514848486626711, + "learning_rate": 1.820819334248997e-06, + "loss": 0.3689, + "step": 1785 + }, + { + "epoch": 0.4690084336362892, + "grad_norm": 2.320338638195368, + "learning_rate": 1.8195798640289807e-06, + "loss": 0.2559, + "step": 1790 + }, + { + "epoch": 0.4703185130598542, + "grad_norm": 4.819290066996436, + "learning_rate": 1.8183365465196099e-06, + "loss": 0.2729, + "step": 1795 + }, + { + "epoch": 0.4716285924834193, + "grad_norm": 2.8951610503894734, + "learning_rate": 1.8170893875572916e-06, + "loss": 0.2502, + "step": 1800 + }, + { + "epoch": 0.4716285924834193, + "eval_accuracy": 0.7248, + "eval_loss": 0.6544848680496216, + "eval_runtime": 140.79, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 2.223, + "step": 1800 + }, + { + "epoch": 0.47293867190698435, + "grad_norm": 6.1281331169132045, + "learning_rate": 1.8158383929964665e-06, + "loss": 0.2792, + "step": 1805 + }, + { + "epoch": 0.4742487513305494, + "grad_norm": 4.051837237000961, + "learning_rate": 1.8145835687095797e-06, + "loss": 0.3106, + "step": 1810 + }, + { + "epoch": 0.4755588307541145, + "grad_norm": 5.166640184431374, + "learning_rate": 1.8133249205870547e-06, + "loss": 0.3153, + "step": 1815 + }, + { + "epoch": 0.4768689101776795, + "grad_norm": 4.026299173963558, + "learning_rate": 1.8120624545372643e-06, + "loss": 0.2343, + "step": 1820 + }, + { + "epoch": 0.47817898960124455, + "grad_norm": 4.295951177165347, + "learning_rate": 1.8107961764865033e-06, + "loss": 0.2883, + "step": 1825 + }, + { + "epoch": 0.47948906902480964, + "grad_norm": 3.4331157371118945, + "learning_rate": 1.8095260923789617e-06, + "loss": 0.2696, + "step": 1830 + }, + { + "epoch": 0.4807991484483747, + "grad_norm": 4.813504074638746, + "learning_rate": 1.8082522081766953e-06, + "loss": 0.3209, + "step": 1835 + }, + { + "epoch": 0.4821092278719397, + "grad_norm": 3.589632561094004, + "learning_rate": 1.8069745298595992e-06, + "loss": 0.2516, + "step": 1840 + }, + { + "epoch": 0.4834193072955048, + "grad_norm": 3.7263852570627143, + "learning_rate": 1.805693063425377e-06, + "loss": 0.3106, + "step": 1845 + }, + { + "epoch": 0.48472938671906984, + "grad_norm": 4.05564679184159, + "learning_rate": 1.8044078148895174e-06, + "loss": 0.2901, + "step": 1850 + }, + { + "epoch": 0.4860394661426349, + "grad_norm": 3.53180167912472, + "learning_rate": 1.8031187902852607e-06, + "loss": 0.2981, + "step": 1855 + }, + { + "epoch": 0.48734954556619997, + "grad_norm": 3.70131644140251, + "learning_rate": 1.801825995663574e-06, + "loss": 0.266, + "step": 1860 + }, + { + "epoch": 0.488659624989765, + "grad_norm": 3.9187161597018214, + "learning_rate": 1.8005294370931217e-06, + "loss": 0.2921, + "step": 1865 + }, + { + "epoch": 0.48996970441333004, + "grad_norm": 2.365181839837511, + "learning_rate": 1.7992291206602366e-06, + "loss": 0.292, + "step": 1870 + }, + { + "epoch": 0.49127978383689513, + "grad_norm": 3.6772154023031014, + "learning_rate": 1.797925052468892e-06, + "loss": 0.2926, + "step": 1875 + }, + { + "epoch": 0.49258986326046017, + "grad_norm": 3.169897885563791, + "learning_rate": 1.7966172386406728e-06, + "loss": 0.3069, + "step": 1880 + }, + { + "epoch": 0.4938999426840252, + "grad_norm": 3.623768040249494, + "learning_rate": 1.7953056853147466e-06, + "loss": 0.2728, + "step": 1885 + }, + { + "epoch": 0.4952100221075903, + "grad_norm": 3.8930495139893884, + "learning_rate": 1.7939903986478354e-06, + "loss": 0.2497, + "step": 1890 + }, + { + "epoch": 0.49652010153115533, + "grad_norm": 3.054365703792985, + "learning_rate": 1.7926713848141856e-06, + "loss": 0.2798, + "step": 1895 + }, + { + "epoch": 0.49783018095472037, + "grad_norm": 5.479009666538431, + "learning_rate": 1.7913486500055402e-06, + "loss": 0.3357, + "step": 1900 + }, + { + "epoch": 0.49783018095472037, + "eval_accuracy": 0.7184, + "eval_loss": 0.6975212097167969, + "eval_runtime": 134.9313, + "eval_samples_per_second": 9.264, + "eval_steps_per_second": 2.32, + "step": 1900 + }, + { + "epoch": 0.49914026037828546, + "grad_norm": 2.433308128925003, + "learning_rate": 1.7900222004311098e-06, + "loss": 0.28, + "step": 1905 + }, + { + "epoch": 0.5004503398018505, + "grad_norm": 3.0724802951664896, + "learning_rate": 1.788692042317542e-06, + "loss": 0.2741, + "step": 1910 + }, + { + "epoch": 0.5017604192254156, + "grad_norm": 2.9281942193789563, + "learning_rate": 1.7873581819088937e-06, + "loss": 0.2622, + "step": 1915 + }, + { + "epoch": 0.5030704986489806, + "grad_norm": 3.2439911646866415, + "learning_rate": 1.786020625466601e-06, + "loss": 0.2706, + "step": 1920 + }, + { + "epoch": 0.5043805780725457, + "grad_norm": 3.4446296154345175, + "learning_rate": 1.7846793792694497e-06, + "loss": 0.2596, + "step": 1925 + }, + { + "epoch": 0.5056906574961108, + "grad_norm": 3.7556256418902905, + "learning_rate": 1.7833344496135467e-06, + "loss": 0.3073, + "step": 1930 + }, + { + "epoch": 0.5070007369196757, + "grad_norm": 4.400597629681425, + "learning_rate": 1.7819858428122893e-06, + "loss": 0.2764, + "step": 1935 + }, + { + "epoch": 0.5083108163432408, + "grad_norm": 3.7380324234060143, + "learning_rate": 1.7806335651963372e-06, + "loss": 0.2906, + "step": 1940 + }, + { + "epoch": 0.5096208957668059, + "grad_norm": 4.614102737000705, + "learning_rate": 1.7792776231135802e-06, + "loss": 0.2898, + "step": 1945 + }, + { + "epoch": 0.5109309751903709, + "grad_norm": 2.7297343017049287, + "learning_rate": 1.7779180229291105e-06, + "loss": 0.23, + "step": 1950 + }, + { + "epoch": 0.512241054613936, + "grad_norm": 5.463732251912146, + "learning_rate": 1.7765547710251935e-06, + "loss": 0.2813, + "step": 1955 + }, + { + "epoch": 0.5135511340375011, + "grad_norm": 2.478093229460874, + "learning_rate": 1.7751878738012346e-06, + "loss": 0.2119, + "step": 1960 + }, + { + "epoch": 0.5148612134610661, + "grad_norm": 5.4823008219173595, + "learning_rate": 1.7738173376737522e-06, + "loss": 0.2642, + "step": 1965 + }, + { + "epoch": 0.5161712928846311, + "grad_norm": 3.9944643614596638, + "learning_rate": 1.7724431690763462e-06, + "loss": 0.2575, + "step": 1970 + }, + { + "epoch": 0.5174813723081961, + "grad_norm": 5.526636384041946, + "learning_rate": 1.7710653744596687e-06, + "loss": 0.3462, + "step": 1975 + }, + { + "epoch": 0.5187914517317612, + "grad_norm": 5.965066611467832, + "learning_rate": 1.7696839602913925e-06, + "loss": 0.3024, + "step": 1980 + }, + { + "epoch": 0.5201015311553263, + "grad_norm": 2.9808391808623247, + "learning_rate": 1.7682989330561813e-06, + "loss": 0.2729, + "step": 1985 + }, + { + "epoch": 0.5214116105788913, + "grad_norm": 3.6106553367793746, + "learning_rate": 1.7669102992556601e-06, + "loss": 0.2461, + "step": 1990 + }, + { + "epoch": 0.5227216900024564, + "grad_norm": 3.2772498647327732, + "learning_rate": 1.7655180654083832e-06, + "loss": 0.2842, + "step": 1995 + }, + { + "epoch": 0.5240317694260215, + "grad_norm": 6.885168575642456, + "learning_rate": 1.7641222380498044e-06, + "loss": 0.3379, + "step": 2000 + }, + { + "epoch": 0.5240317694260215, + "eval_accuracy": 0.7288, + "eval_loss": 0.673081636428833, + "eval_runtime": 136.835, + "eval_samples_per_second": 9.135, + "eval_steps_per_second": 2.287, + "step": 2000 + }, + { + "epoch": 0.5253418488495865, + "grad_norm": 3.6157723487154128, + "learning_rate": 1.7627228237322466e-06, + "loss": 0.2985, + "step": 2005 + }, + { + "epoch": 0.5266519282731515, + "grad_norm": 2.6275278120467105, + "learning_rate": 1.7613198290248706e-06, + "loss": 0.2281, + "step": 2010 + }, + { + "epoch": 0.5279620076967166, + "grad_norm": 3.3363602312332175, + "learning_rate": 1.7599132605136436e-06, + "loss": 0.3043, + "step": 2015 + }, + { + "epoch": 0.5292720871202816, + "grad_norm": 2.4277559410443175, + "learning_rate": 1.7585031248013106e-06, + "loss": 0.202, + "step": 2020 + }, + { + "epoch": 0.5305821665438467, + "grad_norm": 5.072153681054817, + "learning_rate": 1.7570894285073599e-06, + "loss": 0.2483, + "step": 2025 + }, + { + "epoch": 0.5318922459674118, + "grad_norm": 5.204973135313382, + "learning_rate": 1.7556721782679956e-06, + "loss": 0.3329, + "step": 2030 + }, + { + "epoch": 0.5332023253909768, + "grad_norm": 2.806879239003485, + "learning_rate": 1.7542513807361037e-06, + "loss": 0.2548, + "step": 2035 + }, + { + "epoch": 0.5345124048145419, + "grad_norm": 3.0228433116007327, + "learning_rate": 1.7528270425812228e-06, + "loss": 0.2651, + "step": 2040 + }, + { + "epoch": 0.535822484238107, + "grad_norm": 2.6869261489897736, + "learning_rate": 1.7513991704895112e-06, + "loss": 0.2844, + "step": 2045 + }, + { + "epoch": 0.537132563661672, + "grad_norm": 4.5723366454832215, + "learning_rate": 1.7499677711637171e-06, + "loss": 0.3071, + "step": 2050 + }, + { + "epoch": 0.538442643085237, + "grad_norm": 4.179299164940856, + "learning_rate": 1.7485328513231453e-06, + "loss": 0.2774, + "step": 2055 + }, + { + "epoch": 0.5397527225088021, + "grad_norm": 3.356708836608762, + "learning_rate": 1.7470944177036277e-06, + "loss": 0.2927, + "step": 2060 + }, + { + "epoch": 0.5410628019323671, + "grad_norm": 4.017539637774881, + "learning_rate": 1.74565247705749e-06, + "loss": 0.3008, + "step": 2065 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 3.974226727917273, + "learning_rate": 1.744207036153521e-06, + "loss": 0.2742, + "step": 2070 + }, + { + "epoch": 0.5436829607794973, + "grad_norm": 3.160288278087426, + "learning_rate": 1.7427581017769404e-06, + "loss": 0.3134, + "step": 2075 + }, + { + "epoch": 0.5449930402030623, + "grad_norm": 4.995568511985773, + "learning_rate": 1.741305680729367e-06, + "loss": 0.2927, + "step": 2080 + }, + { + "epoch": 0.5463031196266274, + "grad_norm": 2.5038111120195192, + "learning_rate": 1.7398497798287863e-06, + "loss": 0.2442, + "step": 2085 + }, + { + "epoch": 0.5476131990501925, + "grad_norm": 3.4536557863053035, + "learning_rate": 1.7383904059095202e-06, + "loss": 0.2592, + "step": 2090 + }, + { + "epoch": 0.5489232784737574, + "grad_norm": 4.410374261613443, + "learning_rate": 1.7369275658221926e-06, + "loss": 0.3117, + "step": 2095 + }, + { + "epoch": 0.5502333578973225, + "grad_norm": 2.538812009761459, + "learning_rate": 1.735461266433699e-06, + "loss": 0.2717, + "step": 2100 + }, + { + "epoch": 0.5502333578973225, + "eval_accuracy": 0.7424, + "eval_loss": 0.671869158744812, + "eval_runtime": 135.9633, + "eval_samples_per_second": 9.194, + "eval_steps_per_second": 2.302, + "step": 2100 + }, + { + "epoch": 0.5515434373208876, + "grad_norm": 3.627287673105934, + "learning_rate": 1.7339915146271732e-06, + "loss": 0.269, + "step": 2105 + }, + { + "epoch": 0.5528535167444526, + "grad_norm": 2.889042058590513, + "learning_rate": 1.7325183173019556e-06, + "loss": 0.2357, + "step": 2110 + }, + { + "epoch": 0.5541635961680177, + "grad_norm": 3.5905021176878775, + "learning_rate": 1.731041681373561e-06, + "loss": 0.2349, + "step": 2115 + }, + { + "epoch": 0.5554736755915828, + "grad_norm": 4.874519688988458, + "learning_rate": 1.729561613773645e-06, + "loss": 0.2886, + "step": 2120 + }, + { + "epoch": 0.5567837550151478, + "grad_norm": 3.281500493007926, + "learning_rate": 1.7280781214499727e-06, + "loss": 0.282, + "step": 2125 + }, + { + "epoch": 0.5580938344387129, + "grad_norm": 1.90376998294021, + "learning_rate": 1.7265912113663857e-06, + "loss": 0.2952, + "step": 2130 + }, + { + "epoch": 0.559403913862278, + "grad_norm": 4.157189646380671, + "learning_rate": 1.7251008905027692e-06, + "loss": 0.2913, + "step": 2135 + }, + { + "epoch": 0.5607139932858429, + "grad_norm": 7.357071844148765, + "learning_rate": 1.7236071658550191e-06, + "loss": 0.3016, + "step": 2140 + }, + { + "epoch": 0.562024072709408, + "grad_norm": 3.3548044431409214, + "learning_rate": 1.7221100444350099e-06, + "loss": 0.2526, + "step": 2145 + }, + { + "epoch": 0.5633341521329731, + "grad_norm": 2.070755143349665, + "learning_rate": 1.7206095332705608e-06, + "loss": 0.2859, + "step": 2150 + }, + { + "epoch": 0.5646442315565381, + "grad_norm": 2.9975997234107123, + "learning_rate": 1.7191056394054035e-06, + "loss": 0.2739, + "step": 2155 + }, + { + "epoch": 0.5659543109801032, + "grad_norm": 4.385338795716017, + "learning_rate": 1.7175983698991488e-06, + "loss": 0.312, + "step": 2160 + }, + { + "epoch": 0.5672643904036683, + "grad_norm": 3.112159603233555, + "learning_rate": 1.7160877318272537e-06, + "loss": 0.272, + "step": 2165 + }, + { + "epoch": 0.5685744698272333, + "grad_norm": 2.353537092467099, + "learning_rate": 1.7145737322809876e-06, + "loss": 0.2534, + "step": 2170 + }, + { + "epoch": 0.5698845492507983, + "grad_norm": 2.6947118285924385, + "learning_rate": 1.7130563783674e-06, + "loss": 0.2702, + "step": 2175 + }, + { + "epoch": 0.5711946286743634, + "grad_norm": 4.573113056178734, + "learning_rate": 1.7115356772092855e-06, + "loss": 0.3059, + "step": 2180 + }, + { + "epoch": 0.5725047080979284, + "grad_norm": 2.465542000370307, + "learning_rate": 1.7100116359451523e-06, + "loss": 0.2602, + "step": 2185 + }, + { + "epoch": 0.5738147875214935, + "grad_norm": 3.340565744796501, + "learning_rate": 1.7084842617291874e-06, + "loss": 0.2824, + "step": 2190 + }, + { + "epoch": 0.5751248669450586, + "grad_norm": 2.292406930546328, + "learning_rate": 1.706953561731224e-06, + "loss": 0.2696, + "step": 2195 + }, + { + "epoch": 0.5764349463686236, + "grad_norm": 3.3346832292412967, + "learning_rate": 1.705419543136707e-06, + "loss": 0.317, + "step": 2200 + }, + { + "epoch": 0.5764349463686236, + "eval_accuracy": 0.724, + "eval_loss": 0.6991069912910461, + "eval_runtime": 140.893, + "eval_samples_per_second": 8.872, + "eval_steps_per_second": 2.222, + "step": 2200 + }, + { + "epoch": 0.5777450257921887, + "grad_norm": 2.3310945353240435, + "learning_rate": 1.7038822131466583e-06, + "loss": 0.2504, + "step": 2205 + }, + { + "epoch": 0.5790551052157537, + "grad_norm": 2.3747884427134713, + "learning_rate": 1.7023415789776463e-06, + "loss": 0.298, + "step": 2210 + }, + { + "epoch": 0.5803651846393187, + "grad_norm": 3.133493758446318, + "learning_rate": 1.7007976478617484e-06, + "loss": 0.2376, + "step": 2215 + }, + { + "epoch": 0.5816752640628838, + "grad_norm": 4.12633676209407, + "learning_rate": 1.6992504270465193e-06, + "loss": 0.2944, + "step": 2220 + }, + { + "epoch": 0.5829853434864488, + "grad_norm": 3.9130822922585144, + "learning_rate": 1.697699923794956e-06, + "loss": 0.2993, + "step": 2225 + }, + { + "epoch": 0.5842954229100139, + "grad_norm": 2.7907750454465203, + "learning_rate": 1.696146145385464e-06, + "loss": 0.2868, + "step": 2230 + }, + { + "epoch": 0.585605502333579, + "grad_norm": 3.8858008383873397, + "learning_rate": 1.6945890991118236e-06, + "loss": 0.3234, + "step": 2235 + }, + { + "epoch": 0.586915581757144, + "grad_norm": 2.4869248403820308, + "learning_rate": 1.6930287922831546e-06, + "loss": 0.2584, + "step": 2240 + }, + { + "epoch": 0.5882256611807091, + "grad_norm": 2.9309269678483525, + "learning_rate": 1.6914652322238824e-06, + "loss": 0.2303, + "step": 2245 + }, + { + "epoch": 0.5895357406042742, + "grad_norm": 4.8721568277124625, + "learning_rate": 1.6898984262737046e-06, + "loss": 0.2216, + "step": 2250 + }, + { + "epoch": 0.5908458200278391, + "grad_norm": 4.919818554906796, + "learning_rate": 1.6883283817875546e-06, + "loss": 0.2742, + "step": 2255 + }, + { + "epoch": 0.5921558994514042, + "grad_norm": 7.35406987379939, + "learning_rate": 1.6867551061355696e-06, + "loss": 0.2984, + "step": 2260 + }, + { + "epoch": 0.5934659788749693, + "grad_norm": 2.5259386632956864, + "learning_rate": 1.6851786067030535e-06, + "loss": 0.2001, + "step": 2265 + }, + { + "epoch": 0.5947760582985343, + "grad_norm": 4.776608491229436, + "learning_rate": 1.6835988908904437e-06, + "loss": 0.3169, + "step": 2270 + }, + { + "epoch": 0.5960861377220994, + "grad_norm": 2.473422082161941, + "learning_rate": 1.6820159661132763e-06, + "loss": 0.2355, + "step": 2275 + }, + { + "epoch": 0.5973962171456645, + "grad_norm": 3.0778943971784165, + "learning_rate": 1.6804298398021501e-06, + "loss": 0.2308, + "step": 2280 + }, + { + "epoch": 0.5987062965692295, + "grad_norm": 3.5878133464375646, + "learning_rate": 1.6788405194026937e-06, + "loss": 0.2586, + "step": 2285 + }, + { + "epoch": 0.6000163759927946, + "grad_norm": 3.9888619741299194, + "learning_rate": 1.6772480123755288e-06, + "loss": 0.3039, + "step": 2290 + }, + { + "epoch": 0.6013264554163597, + "grad_norm": 4.413706147317179, + "learning_rate": 1.6756523261962361e-06, + "loss": 0.3061, + "step": 2295 + }, + { + "epoch": 0.6026365348399246, + "grad_norm": 6.997366587082385, + "learning_rate": 1.6740534683553197e-06, + "loss": 0.2696, + "step": 2300 + }, + { + "epoch": 0.6026365348399246, + "eval_accuracy": 0.7272, + "eval_loss": 0.7597007751464844, + "eval_runtime": 141.0016, + "eval_samples_per_second": 8.865, + "eval_steps_per_second": 2.22, + "step": 2300 + }, + { + "epoch": 0.6039466142634897, + "grad_norm": 5.753280459621215, + "learning_rate": 1.6724514463581727e-06, + "loss": 0.2935, + "step": 2305 + }, + { + "epoch": 0.6052566936870548, + "grad_norm": 3.6663692285540996, + "learning_rate": 1.6708462677250405e-06, + "loss": 0.2493, + "step": 2310 + }, + { + "epoch": 0.6065667731106198, + "grad_norm": 4.557277515775418, + "learning_rate": 1.6692379399909876e-06, + "loss": 0.3299, + "step": 2315 + }, + { + "epoch": 0.6078768525341849, + "grad_norm": 2.960623480418135, + "learning_rate": 1.6676264707058599e-06, + "loss": 0.3056, + "step": 2320 + }, + { + "epoch": 0.60918693195775, + "grad_norm": 5.423095170753771, + "learning_rate": 1.6660118674342515e-06, + "loss": 0.341, + "step": 2325 + }, + { + "epoch": 0.610497011381315, + "grad_norm": 2.6332110051555113, + "learning_rate": 1.6643941377554675e-06, + "loss": 0.2743, + "step": 2330 + }, + { + "epoch": 0.61180709080488, + "grad_norm": 2.2505359636330207, + "learning_rate": 1.6627732892634893e-06, + "loss": 0.2578, + "step": 2335 + }, + { + "epoch": 0.6131171702284451, + "grad_norm": 4.154284637955047, + "learning_rate": 1.6611493295669386e-06, + "loss": 0.3286, + "step": 2340 + }, + { + "epoch": 0.6144272496520101, + "grad_norm": 4.577727996526487, + "learning_rate": 1.6595222662890418e-06, + "loss": 0.2868, + "step": 2345 + }, + { + "epoch": 0.6157373290755752, + "grad_norm": 4.653567093852986, + "learning_rate": 1.657892107067594e-06, + "loss": 0.2551, + "step": 2350 + }, + { + "epoch": 0.6170474084991403, + "grad_norm": 4.19440150981268, + "learning_rate": 1.6562588595549235e-06, + "loss": 0.2847, + "step": 2355 + }, + { + "epoch": 0.6183574879227053, + "grad_norm": 4.490108240777686, + "learning_rate": 1.654622531417856e-06, + "loss": 0.319, + "step": 2360 + }, + { + "epoch": 0.6196675673462704, + "grad_norm": 3.896691949712352, + "learning_rate": 1.6529831303376787e-06, + "loss": 0.2833, + "step": 2365 + }, + { + "epoch": 0.6209776467698355, + "grad_norm": 4.158481682638508, + "learning_rate": 1.651340664010102e-06, + "loss": 0.2759, + "step": 2370 + }, + { + "epoch": 0.6222877261934004, + "grad_norm": 2.4673087844419337, + "learning_rate": 1.6496951401452272e-06, + "loss": 0.2068, + "step": 2375 + }, + { + "epoch": 0.6235978056169655, + "grad_norm": 4.873523893306045, + "learning_rate": 1.6480465664675078e-06, + "loss": 0.2822, + "step": 2380 + }, + { + "epoch": 0.6249078850405306, + "grad_norm": 4.513011014916301, + "learning_rate": 1.6463949507157131e-06, + "loss": 0.311, + "step": 2385 + }, + { + "epoch": 0.6262179644640956, + "grad_norm": 4.41040904711801, + "learning_rate": 1.644740300642894e-06, + "loss": 0.2894, + "step": 2390 + }, + { + "epoch": 0.6275280438876607, + "grad_norm": 7.808507472292027, + "learning_rate": 1.6430826240163436e-06, + "loss": 0.3345, + "step": 2395 + }, + { + "epoch": 0.6288381233112258, + "grad_norm": 2.615243925080016, + "learning_rate": 1.6414219286175635e-06, + "loss": 0.2465, + "step": 2400 + }, + { + "epoch": 0.6288381233112258, + "eval_accuracy": 0.7408, + "eval_loss": 0.7380235195159912, + "eval_runtime": 136.2377, + "eval_samples_per_second": 9.175, + "eval_steps_per_second": 2.297, + "step": 2400 + }, + { + "epoch": 0.6301482027347908, + "grad_norm": 3.567322311755078, + "learning_rate": 1.639758222242225e-06, + "loss": 0.2349, + "step": 2405 + }, + { + "epoch": 0.6314582821583559, + "grad_norm": 5.503722999681643, + "learning_rate": 1.638091512700135e-06, + "loss": 0.2486, + "step": 2410 + }, + { + "epoch": 0.632768361581921, + "grad_norm": 5.060199812880774, + "learning_rate": 1.6364218078151963e-06, + "loss": 0.3254, + "step": 2415 + }, + { + "epoch": 0.6340784410054859, + "grad_norm": 2.967797600289104, + "learning_rate": 1.6347491154253738e-06, + "loss": 0.3049, + "step": 2420 + }, + { + "epoch": 0.635388520429051, + "grad_norm": 3.6123691523694297, + "learning_rate": 1.6330734433826562e-06, + "loss": 0.3079, + "step": 2425 + }, + { + "epoch": 0.636698599852616, + "grad_norm": 2.520240254040695, + "learning_rate": 1.6313947995530187e-06, + "loss": 0.2677, + "step": 2430 + }, + { + "epoch": 0.6380086792761811, + "grad_norm": 3.700192416490241, + "learning_rate": 1.6297131918163874e-06, + "loss": 0.2393, + "step": 2435 + }, + { + "epoch": 0.6393187586997462, + "grad_norm": 3.4585291941554797, + "learning_rate": 1.6280286280666011e-06, + "loss": 0.253, + "step": 2440 + }, + { + "epoch": 0.6406288381233112, + "grad_norm": 5.9407205540242884, + "learning_rate": 1.6263411162113752e-06, + "loss": 0.2991, + "step": 2445 + }, + { + "epoch": 0.6419389175468763, + "grad_norm": 3.7257997296487546, + "learning_rate": 1.624650664172264e-06, + "loss": 0.3, + "step": 2450 + }, + { + "epoch": 0.6432489969704414, + "grad_norm": 7.091164226358698, + "learning_rate": 1.6229572798846233e-06, + "loss": 0.2964, + "step": 2455 + }, + { + "epoch": 0.6445590763940063, + "grad_norm": 3.945065293498854, + "learning_rate": 1.6212609712975746e-06, + "loss": 0.3003, + "step": 2460 + }, + { + "epoch": 0.6458691558175714, + "grad_norm": 4.547862400421276, + "learning_rate": 1.6195617463739657e-06, + "loss": 0.312, + "step": 2465 + }, + { + "epoch": 0.6471792352411365, + "grad_norm": 3.792620720497921, + "learning_rate": 1.6178596130903343e-06, + "loss": 0.2689, + "step": 2470 + }, + { + "epoch": 0.6484893146647015, + "grad_norm": 4.549047060259805, + "learning_rate": 1.6161545794368712e-06, + "loss": 0.3019, + "step": 2475 + }, + { + "epoch": 0.6497993940882666, + "grad_norm": 2.3008964624889114, + "learning_rate": 1.614446653417382e-06, + "loss": 0.2427, + "step": 2480 + }, + { + "epoch": 0.6511094735118317, + "grad_norm": 3.448390528740768, + "learning_rate": 1.6127358430492496e-06, + "loss": 0.2733, + "step": 2485 + }, + { + "epoch": 0.6524195529353967, + "grad_norm": 3.115685764931888, + "learning_rate": 1.6110221563633966e-06, + "loss": 0.2813, + "step": 2490 + }, + { + "epoch": 0.6537296323589618, + "grad_norm": 2.7637030640180056, + "learning_rate": 1.6093056014042476e-06, + "loss": 0.316, + "step": 2495 + }, + { + "epoch": 0.6550397117825268, + "grad_norm": 2.8028721777171763, + "learning_rate": 1.6075861862296918e-06, + "loss": 0.2465, + "step": 2500 + }, + { + "epoch": 0.6550397117825268, + "eval_accuracy": 0.7504, + "eval_loss": 0.7594350576400757, + "eval_runtime": 136.5188, + "eval_samples_per_second": 9.156, + "eval_steps_per_second": 2.293, + "step": 2500 + }, + { + "epoch": 0.6563497912060918, + "grad_norm": 2.201406591183069, + "learning_rate": 1.6058639189110448e-06, + "loss": 0.2579, + "step": 2505 + }, + { + "epoch": 0.6576598706296569, + "grad_norm": 2.743123705211622, + "learning_rate": 1.6041388075330104e-06, + "loss": 0.2671, + "step": 2510 + }, + { + "epoch": 0.658969950053222, + "grad_norm": 3.35926627410109, + "learning_rate": 1.6024108601936441e-06, + "loss": 0.2722, + "step": 2515 + }, + { + "epoch": 0.660280029476787, + "grad_norm": 4.177142712614172, + "learning_rate": 1.600680085004313e-06, + "loss": 0.255, + "step": 2520 + }, + { + "epoch": 0.6615901089003521, + "grad_norm": 5.607535570859494, + "learning_rate": 1.5989464900896584e-06, + "loss": 0.2808, + "step": 2525 + }, + { + "epoch": 0.6629001883239172, + "grad_norm": 4.111155214094161, + "learning_rate": 1.5972100835875596e-06, + "loss": 0.2749, + "step": 2530 + }, + { + "epoch": 0.6642102677474822, + "grad_norm": 3.852938068841201, + "learning_rate": 1.5954708736490927e-06, + "loss": 0.374, + "step": 2535 + }, + { + "epoch": 0.6655203471710472, + "grad_norm": 4.998114409517782, + "learning_rate": 1.5937288684384948e-06, + "loss": 0.2988, + "step": 2540 + }, + { + "epoch": 0.6668304265946123, + "grad_norm": 3.2234837676131036, + "learning_rate": 1.5919840761331233e-06, + "loss": 0.2926, + "step": 2545 + }, + { + "epoch": 0.6681405060181773, + "grad_norm": 2.6128145021675135, + "learning_rate": 1.59023650492342e-06, + "loss": 0.2685, + "step": 2550 + }, + { + "epoch": 0.6694505854417424, + "grad_norm": 2.746049149593303, + "learning_rate": 1.588486163012871e-06, + "loss": 0.276, + "step": 2555 + }, + { + "epoch": 0.6707606648653075, + "grad_norm": 12.230075077506218, + "learning_rate": 1.5867330586179692e-06, + "loss": 0.3356, + "step": 2560 + }, + { + "epoch": 0.6720707442888725, + "grad_norm": 3.609150884738277, + "learning_rate": 1.5849771999681744e-06, + "loss": 0.2876, + "step": 2565 + }, + { + "epoch": 0.6733808237124376, + "grad_norm": 3.696663482780853, + "learning_rate": 1.583218595305876e-06, + "loss": 0.2801, + "step": 2570 + }, + { + "epoch": 0.6746909031360027, + "grad_norm": 2.4314233621566674, + "learning_rate": 1.5814572528863537e-06, + "loss": 0.246, + "step": 2575 + }, + { + "epoch": 0.6760009825595676, + "grad_norm": 3.410781191475223, + "learning_rate": 1.5796931809777387e-06, + "loss": 0.2854, + "step": 2580 + }, + { + "epoch": 0.6773110619831327, + "grad_norm": 7.965165743041762, + "learning_rate": 1.5779263878609752e-06, + "loss": 0.3286, + "step": 2585 + }, + { + "epoch": 0.6786211414066978, + "grad_norm": 2.8880233729881333, + "learning_rate": 1.5761568818297814e-06, + "loss": 0.3273, + "step": 2590 + }, + { + "epoch": 0.6799312208302628, + "grad_norm": 4.279782991468597, + "learning_rate": 1.5743846711906103e-06, + "loss": 0.2907, + "step": 2595 + }, + { + "epoch": 0.6812413002538279, + "grad_norm": 4.080828216313647, + "learning_rate": 1.5726097642626112e-06, + "loss": 0.3034, + "step": 2600 + }, + { + "epoch": 0.6812413002538279, + "eval_accuracy": 0.7576, + "eval_loss": 0.6795002818107605, + "eval_runtime": 138.009, + "eval_samples_per_second": 9.057, + "eval_steps_per_second": 2.268, + "step": 2600 + }, + { + "epoch": 0.682551379677393, + "grad_norm": 3.704947001007602, + "learning_rate": 1.5708321693775901e-06, + "loss": 0.2779, + "step": 2605 + }, + { + "epoch": 0.683861459100958, + "grad_norm": 3.7633025557802045, + "learning_rate": 1.569051894879971e-06, + "loss": 0.2513, + "step": 2610 + }, + { + "epoch": 0.6851715385245231, + "grad_norm": 4.533908569894921, + "learning_rate": 1.5672689491267565e-06, + "loss": 0.2519, + "step": 2615 + }, + { + "epoch": 0.6864816179480882, + "grad_norm": 7.825654578480919, + "learning_rate": 1.5654833404874889e-06, + "loss": 0.3064, + "step": 2620 + }, + { + "epoch": 0.6877916973716531, + "grad_norm": 6.701186216511913, + "learning_rate": 1.5636950773442107e-06, + "loss": 0.2888, + "step": 2625 + }, + { + "epoch": 0.6891017767952182, + "grad_norm": 5.08064257662279, + "learning_rate": 1.5619041680914244e-06, + "loss": 0.2841, + "step": 2630 + }, + { + "epoch": 0.6904118562187833, + "grad_norm": 3.3431125663033403, + "learning_rate": 1.560110621136055e-06, + "loss": 0.33, + "step": 2635 + }, + { + "epoch": 0.6917219356423483, + "grad_norm": 3.348150218520968, + "learning_rate": 1.5583144448974092e-06, + "loss": 0.2425, + "step": 2640 + }, + { + "epoch": 0.6930320150659134, + "grad_norm": 2.6434575784544485, + "learning_rate": 1.556515647807136e-06, + "loss": 0.2892, + "step": 2645 + }, + { + "epoch": 0.6943420944894785, + "grad_norm": 3.4776108132319514, + "learning_rate": 1.5547142383091868e-06, + "loss": 0.2468, + "step": 2650 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 4.831568463736758, + "learning_rate": 1.5529102248597772e-06, + "loss": 0.2789, + "step": 2655 + }, + { + "epoch": 0.6969622533366086, + "grad_norm": 2.093319788857653, + "learning_rate": 1.5511036159273452e-06, + "loss": 0.287, + "step": 2660 + }, + { + "epoch": 0.6982723327601735, + "grad_norm": 3.5718053321237213, + "learning_rate": 1.5492944199925133e-06, + "loss": 0.2576, + "step": 2665 + }, + { + "epoch": 0.6995824121837386, + "grad_norm": 2.6423916949191173, + "learning_rate": 1.5474826455480486e-06, + "loss": 0.3232, + "step": 2670 + }, + { + "epoch": 0.7008924916073037, + "grad_norm": 2.4967928837848996, + "learning_rate": 1.5456683010988203e-06, + "loss": 0.2656, + "step": 2675 + }, + { + "epoch": 0.7022025710308687, + "grad_norm": 2.280284760261213, + "learning_rate": 1.5438513951617637e-06, + "loss": 0.223, + "step": 2680 + }, + { + "epoch": 0.7035126504544338, + "grad_norm": 3.9018364135960497, + "learning_rate": 1.5420319362658373e-06, + "loss": 0.2352, + "step": 2685 + }, + { + "epoch": 0.7048227298779989, + "grad_norm": 4.281704624864632, + "learning_rate": 1.5402099329519845e-06, + "loss": 0.2683, + "step": 2690 + }, + { + "epoch": 0.7061328093015639, + "grad_norm": 6.450930230248805, + "learning_rate": 1.5383853937730916e-06, + "loss": 0.2804, + "step": 2695 + }, + { + "epoch": 0.707442888725129, + "grad_norm": 3.6301375705851835, + "learning_rate": 1.53655832729395e-06, + "loss": 0.256, + "step": 2700 + }, + { + "epoch": 0.707442888725129, + "eval_accuracy": 0.7624, + "eval_loss": 0.7787925004959106, + "eval_runtime": 138.9559, + "eval_samples_per_second": 8.996, + "eval_steps_per_second": 2.253, + "step": 2700 + }, + { + "epoch": 0.708752968148694, + "grad_norm": 4.1331580241606725, + "learning_rate": 1.534728742091214e-06, + "loss": 0.3178, + "step": 2705 + }, + { + "epoch": 0.710063047572259, + "grad_norm": 4.1609025912552005, + "learning_rate": 1.532896646753362e-06, + "loss": 0.2764, + "step": 2710 + }, + { + "epoch": 0.7113731269958241, + "grad_norm": 2.4782882085210884, + "learning_rate": 1.5310620498806548e-06, + "loss": 0.2497, + "step": 2715 + }, + { + "epoch": 0.7126832064193892, + "grad_norm": 4.503219440050312, + "learning_rate": 1.5292249600850966e-06, + "loss": 0.2618, + "step": 2720 + }, + { + "epoch": 0.7139932858429542, + "grad_norm": 4.86090545111869, + "learning_rate": 1.5273853859903935e-06, + "loss": 0.2522, + "step": 2725 + }, + { + "epoch": 0.7153033652665193, + "grad_norm": 4.018354852882808, + "learning_rate": 1.525543336231914e-06, + "loss": 0.3052, + "step": 2730 + }, + { + "epoch": 0.7166134446900844, + "grad_norm": 4.797568374404226, + "learning_rate": 1.5236988194566469e-06, + "loss": 0.3183, + "step": 2735 + }, + { + "epoch": 0.7179235241136493, + "grad_norm": 4.8386270061207055, + "learning_rate": 1.5218518443231628e-06, + "loss": 0.2763, + "step": 2740 + }, + { + "epoch": 0.7192336035372144, + "grad_norm": 4.215400128326543, + "learning_rate": 1.5200024195015719e-06, + "loss": 0.2661, + "step": 2745 + }, + { + "epoch": 0.7205436829607795, + "grad_norm": 4.56588429028685, + "learning_rate": 1.5181505536734835e-06, + "loss": 0.283, + "step": 2750 + }, + { + "epoch": 0.7218537623843445, + "grad_norm": 6.619608414847504, + "learning_rate": 1.5162962555319664e-06, + "loss": 0.271, + "step": 2755 + }, + { + "epoch": 0.7231638418079096, + "grad_norm": 2.4274939604447385, + "learning_rate": 1.5144395337815063e-06, + "loss": 0.313, + "step": 2760 + }, + { + "epoch": 0.7244739212314747, + "grad_norm": 5.626984953335138, + "learning_rate": 1.5125803971379665e-06, + "loss": 0.2866, + "step": 2765 + }, + { + "epoch": 0.7257840006550397, + "grad_norm": 4.285823933441923, + "learning_rate": 1.5107188543285454e-06, + "loss": 0.2603, + "step": 2770 + }, + { + "epoch": 0.7270940800786048, + "grad_norm": 4.38863656110863, + "learning_rate": 1.5088549140917381e-06, + "loss": 0.3184, + "step": 2775 + }, + { + "epoch": 0.7284041595021699, + "grad_norm": 2.9328782019117465, + "learning_rate": 1.506988585177292e-06, + "loss": 0.2389, + "step": 2780 + }, + { + "epoch": 0.7297142389257348, + "grad_norm": 3.483606480673357, + "learning_rate": 1.505119876346168e-06, + "loss": 0.276, + "step": 2785 + }, + { + "epoch": 0.7310243183492999, + "grad_norm": 4.504190498010961, + "learning_rate": 1.5032487963705003e-06, + "loss": 0.1977, + "step": 2790 + }, + { + "epoch": 0.732334397772865, + "grad_norm": 4.184926339697806, + "learning_rate": 1.5013753540335517e-06, + "loss": 0.2972, + "step": 2795 + }, + { + "epoch": 0.73364447719643, + "grad_norm": 3.8006093754774195, + "learning_rate": 1.499499558129676e-06, + "loss": 0.2776, + "step": 2800 + }, + { + "epoch": 0.73364447719643, + "eval_accuracy": 0.7504, + "eval_loss": 0.7540197372436523, + "eval_runtime": 142.6507, + "eval_samples_per_second": 8.763, + "eval_steps_per_second": 2.194, + "step": 2800 + }, + { + "epoch": 0.7349545566199951, + "grad_norm": 3.614639963171112, + "learning_rate": 1.497621417464274e-06, + "loss": 0.2199, + "step": 2805 + }, + { + "epoch": 0.7362646360435602, + "grad_norm": 3.4753724939982367, + "learning_rate": 1.4957409408537535e-06, + "loss": 0.2842, + "step": 2810 + }, + { + "epoch": 0.7375747154671252, + "grad_norm": 2.829347092202445, + "learning_rate": 1.493858137125489e-06, + "loss": 0.2054, + "step": 2815 + }, + { + "epoch": 0.7388847948906903, + "grad_norm": 4.607528640210262, + "learning_rate": 1.4919730151177773e-06, + "loss": 0.2488, + "step": 2820 + }, + { + "epoch": 0.7401948743142553, + "grad_norm": 4.424154310853472, + "learning_rate": 1.4900855836797995e-06, + "loss": 0.3079, + "step": 2825 + }, + { + "epoch": 0.7415049537378203, + "grad_norm": 2.8286263481877434, + "learning_rate": 1.4881958516715757e-06, + "loss": 0.267, + "step": 2830 + }, + { + "epoch": 0.7428150331613854, + "grad_norm": 4.694968243877861, + "learning_rate": 1.4863038279639268e-06, + "loss": 0.2903, + "step": 2835 + }, + { + "epoch": 0.7441251125849505, + "grad_norm": 3.672549062096689, + "learning_rate": 1.4844095214384309e-06, + "loss": 0.2583, + "step": 2840 + }, + { + "epoch": 0.7454351920085155, + "grad_norm": 3.4410420172535887, + "learning_rate": 1.4825129409873822e-06, + "loss": 0.3213, + "step": 2845 + }, + { + "epoch": 0.7467452714320806, + "grad_norm": 3.4101727347068382, + "learning_rate": 1.4806140955137495e-06, + "loss": 0.2537, + "step": 2850 + }, + { + "epoch": 0.7480553508556457, + "grad_norm": 4.519383622184218, + "learning_rate": 1.4787129939311337e-06, + "loss": 0.2929, + "step": 2855 + }, + { + "epoch": 0.7493654302792107, + "grad_norm": 3.4774712459804404, + "learning_rate": 1.4768096451637272e-06, + "loss": 0.2682, + "step": 2860 + }, + { + "epoch": 0.7506755097027757, + "grad_norm": 2.6479749188555575, + "learning_rate": 1.4749040581462694e-06, + "loss": 0.2519, + "step": 2865 + }, + { + "epoch": 0.7519855891263408, + "grad_norm": 4.16913561566471, + "learning_rate": 1.4729962418240086e-06, + "loss": 0.2619, + "step": 2870 + }, + { + "epoch": 0.7532956685499058, + "grad_norm": 2.0678666370348324, + "learning_rate": 1.471086205152657e-06, + "loss": 0.319, + "step": 2875 + }, + { + "epoch": 0.7546057479734709, + "grad_norm": 2.976384517477917, + "learning_rate": 1.469173957098349e-06, + "loss": 0.3259, + "step": 2880 + }, + { + "epoch": 0.755915827397036, + "grad_norm": 3.658100772623381, + "learning_rate": 1.4672595066376015e-06, + "loss": 0.2506, + "step": 2885 + }, + { + "epoch": 0.757225906820601, + "grad_norm": 6.101815938265203, + "learning_rate": 1.4653428627572674e-06, + "loss": 0.2655, + "step": 2890 + }, + { + "epoch": 0.7585359862441661, + "grad_norm": 2.8143348607782337, + "learning_rate": 1.4634240344544988e-06, + "loss": 0.2684, + "step": 2895 + }, + { + "epoch": 0.759846065667731, + "grad_norm": 2.105144871048026, + "learning_rate": 1.4615030307366998e-06, + "loss": 0.2804, + "step": 2900 + }, + { + "epoch": 0.759846065667731, + "eval_accuracy": 0.748, + "eval_loss": 0.7601897716522217, + "eval_runtime": 139.2011, + "eval_samples_per_second": 8.98, + "eval_steps_per_second": 2.249, + "step": 2900 + }, + { + "epoch": 0.7611561450912961, + "grad_norm": 3.998393869040855, + "learning_rate": 1.459579860621488e-06, + "loss": 0.2674, + "step": 2905 + }, + { + "epoch": 0.7624662245148612, + "grad_norm": 2.2268642108185053, + "learning_rate": 1.4576545331366488e-06, + "loss": 0.2702, + "step": 2910 + }, + { + "epoch": 0.7637763039384262, + "grad_norm": 6.218849876814229, + "learning_rate": 1.4557270573200962e-06, + "loss": 0.2864, + "step": 2915 + }, + { + "epoch": 0.7650863833619913, + "grad_norm": 4.323897051065836, + "learning_rate": 1.4537974422198285e-06, + "loss": 0.2636, + "step": 2920 + }, + { + "epoch": 0.7663964627855564, + "grad_norm": 5.750558481033307, + "learning_rate": 1.451865696893886e-06, + "loss": 0.2319, + "step": 2925 + }, + { + "epoch": 0.7677065422091214, + "grad_norm": 6.231068587651604, + "learning_rate": 1.4499318304103097e-06, + "loss": 0.2912, + "step": 2930 + }, + { + "epoch": 0.7690166216326865, + "grad_norm": 3.8621492036274545, + "learning_rate": 1.447995851847096e-06, + "loss": 0.2594, + "step": 2935 + }, + { + "epoch": 0.7703267010562516, + "grad_norm": 4.8680458967049285, + "learning_rate": 1.4460577702921577e-06, + "loss": 0.2787, + "step": 2940 + }, + { + "epoch": 0.7716367804798165, + "grad_norm": 2.277649540761437, + "learning_rate": 1.4441175948432784e-06, + "loss": 0.2722, + "step": 2945 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 3.335412545156817, + "learning_rate": 1.4421753346080714e-06, + "loss": 0.2614, + "step": 2950 + }, + { + "epoch": 0.7742569393269467, + "grad_norm": 3.194957371530983, + "learning_rate": 1.4402309987039365e-06, + "loss": 0.3021, + "step": 2955 + }, + { + "epoch": 0.7755670187505117, + "grad_norm": 4.092383074593162, + "learning_rate": 1.4382845962580165e-06, + "loss": 0.2532, + "step": 2960 + }, + { + "epoch": 0.7768770981740768, + "grad_norm": 2.9160857878852595, + "learning_rate": 1.436336136407156e-06, + "loss": 0.3102, + "step": 2965 + }, + { + "epoch": 0.7781871775976419, + "grad_norm": 2.6117926566384377, + "learning_rate": 1.4343856282978565e-06, + "loss": 0.2532, + "step": 2970 + }, + { + "epoch": 0.7794972570212069, + "grad_norm": 2.514471217178964, + "learning_rate": 1.4324330810862354e-06, + "loss": 0.2709, + "step": 2975 + }, + { + "epoch": 0.780807336444772, + "grad_norm": 2.391957064033006, + "learning_rate": 1.430478503937981e-06, + "loss": 0.2655, + "step": 2980 + }, + { + "epoch": 0.782117415868337, + "grad_norm": 8.037406108008932, + "learning_rate": 1.4285219060283119e-06, + "loss": 0.3229, + "step": 2985 + }, + { + "epoch": 0.783427495291902, + "grad_norm": 2.024818465241261, + "learning_rate": 1.4265632965419311e-06, + "loss": 0.2476, + "step": 2990 + }, + { + "epoch": 0.7847375747154671, + "grad_norm": 4.525940151913456, + "learning_rate": 1.4246026846729864e-06, + "loss": 0.2801, + "step": 2995 + }, + { + "epoch": 0.7860476541390322, + "grad_norm": 5.616203546892296, + "learning_rate": 1.422640079625023e-06, + "loss": 0.2893, + "step": 3000 + }, + { + "epoch": 0.7860476541390322, + "eval_accuracy": 0.7448, + "eval_loss": 0.8274851441383362, + "eval_runtime": 139.0761, + "eval_samples_per_second": 8.988, + "eval_steps_per_second": 2.251, + "step": 3000 + }, + { + "epoch": 0.7873577335625972, + "grad_norm": 3.641490270919579, + "learning_rate": 1.420675490610944e-06, + "loss": 0.2927, + "step": 3005 + }, + { + "epoch": 0.7886678129861623, + "grad_norm": 2.8550119558130884, + "learning_rate": 1.418708926852965e-06, + "loss": 0.2525, + "step": 3010 + }, + { + "epoch": 0.7899778924097274, + "grad_norm": 3.081716905966947, + "learning_rate": 1.4167403975825726e-06, + "loss": 0.2494, + "step": 3015 + }, + { + "epoch": 0.7912879718332924, + "grad_norm": 4.676494817613609, + "learning_rate": 1.4147699120404775e-06, + "loss": 0.2858, + "step": 3020 + }, + { + "epoch": 0.7925980512568574, + "grad_norm": 1.9991869906134794, + "learning_rate": 1.4127974794765764e-06, + "loss": 0.2937, + "step": 3025 + }, + { + "epoch": 0.7939081306804225, + "grad_norm": 2.6233135206447398, + "learning_rate": 1.410823109149904e-06, + "loss": 0.2932, + "step": 3030 + }, + { + "epoch": 0.7952182101039875, + "grad_norm": 2.2451673120874176, + "learning_rate": 1.408846810328592e-06, + "loss": 0.2594, + "step": 3035 + }, + { + "epoch": 0.7965282895275526, + "grad_norm": 2.7009560292084336, + "learning_rate": 1.4068685922898244e-06, + "loss": 0.3115, + "step": 3040 + }, + { + "epoch": 0.7978383689511177, + "grad_norm": 2.433826115411649, + "learning_rate": 1.4048884643197947e-06, + "loss": 0.268, + "step": 3045 + }, + { + "epoch": 0.7991484483746827, + "grad_norm": 3.5050872305744663, + "learning_rate": 1.4029064357136626e-06, + "loss": 0.266, + "step": 3050 + }, + { + "epoch": 0.8004585277982478, + "grad_norm": 3.1713770891943462, + "learning_rate": 1.4009225157755085e-06, + "loss": 0.2807, + "step": 3055 + }, + { + "epoch": 0.8017686072218129, + "grad_norm": 3.3467606419772697, + "learning_rate": 1.3989367138182924e-06, + "loss": 0.2641, + "step": 3060 + }, + { + "epoch": 0.8030786866453778, + "grad_norm": 3.1069787059795946, + "learning_rate": 1.396949039163808e-06, + "loss": 0.277, + "step": 3065 + }, + { + "epoch": 0.8043887660689429, + "grad_norm": 4.763920028748174, + "learning_rate": 1.3949595011426407e-06, + "loss": 0.2625, + "step": 3070 + }, + { + "epoch": 0.805698845492508, + "grad_norm": 2.8182907657903606, + "learning_rate": 1.392968109094122e-06, + "loss": 0.2487, + "step": 3075 + }, + { + "epoch": 0.807008924916073, + "grad_norm": 2.72524792612828, + "learning_rate": 1.3909748723662871e-06, + "loss": 0.2513, + "step": 3080 + }, + { + "epoch": 0.8083190043396381, + "grad_norm": 3.3492249191737677, + "learning_rate": 1.3889798003158312e-06, + "loss": 0.2844, + "step": 3085 + }, + { + "epoch": 0.8096290837632032, + "grad_norm": 4.142567741582466, + "learning_rate": 1.3869829023080636e-06, + "loss": 0.2978, + "step": 3090 + }, + { + "epoch": 0.8109391631867682, + "grad_norm": 4.435078753474168, + "learning_rate": 1.384984187716866e-06, + "loss": 0.217, + "step": 3095 + }, + { + "epoch": 0.8122492426103333, + "grad_norm": 6.294316726095601, + "learning_rate": 1.3829836659246473e-06, + "loss": 0.3141, + "step": 3100 + }, + { + "epoch": 0.8122492426103333, + "eval_accuracy": 0.7392, + "eval_loss": 0.7475783824920654, + "eval_runtime": 139.032, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 2.251, + "step": 3100 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 2.2935780929180707, + "learning_rate": 1.3809813463222995e-06, + "loss": 0.2432, + "step": 3105 + }, + { + "epoch": 0.8148694014574633, + "grad_norm": 3.0405694359199367, + "learning_rate": 1.3789772383091542e-06, + "loss": 0.234, + "step": 3110 + }, + { + "epoch": 0.8161794808810284, + "grad_norm": 2.6036641756583974, + "learning_rate": 1.3769713512929384e-06, + "loss": 0.2513, + "step": 3115 + }, + { + "epoch": 0.8174895603045935, + "grad_norm": 3.0854001671426166, + "learning_rate": 1.37496369468973e-06, + "loss": 0.3248, + "step": 3120 + }, + { + "epoch": 0.8187996397281585, + "grad_norm": 2.7741805327089253, + "learning_rate": 1.3729542779239133e-06, + "loss": 0.2183, + "step": 3125 + }, + { + "epoch": 0.8201097191517236, + "grad_norm": 3.154383132924369, + "learning_rate": 1.370943110428136e-06, + "loss": 0.2318, + "step": 3130 + }, + { + "epoch": 0.8214197985752886, + "grad_norm": 2.728987342059349, + "learning_rate": 1.3689302016432628e-06, + "loss": 0.2505, + "step": 3135 + }, + { + "epoch": 0.8227298779988537, + "grad_norm": 4.260828009892338, + "learning_rate": 1.3669155610183336e-06, + "loss": 0.2859, + "step": 3140 + }, + { + "epoch": 0.8240399574224188, + "grad_norm": 4.698119734994208, + "learning_rate": 1.364899198010518e-06, + "loss": 0.3126, + "step": 3145 + }, + { + "epoch": 0.8253500368459837, + "grad_norm": 2.9266861851773336, + "learning_rate": 1.3628811220850703e-06, + "loss": 0.2524, + "step": 3150 + }, + { + "epoch": 0.8266601162695488, + "grad_norm": 3.3660376635311744, + "learning_rate": 1.3608613427152854e-06, + "loss": 0.26, + "step": 3155 + }, + { + "epoch": 0.8279701956931139, + "grad_norm": 2.7452509590309027, + "learning_rate": 1.358839869382455e-06, + "loss": 0.2787, + "step": 3160 + }, + { + "epoch": 0.8292802751166789, + "grad_norm": 2.2871077739708348, + "learning_rate": 1.356816711575823e-06, + "loss": 0.2774, + "step": 3165 + }, + { + "epoch": 0.830590354540244, + "grad_norm": 2.353498321089704, + "learning_rate": 1.3547918787925392e-06, + "loss": 0.1922, + "step": 3170 + }, + { + "epoch": 0.8319004339638091, + "grad_norm": 2.8043846357680895, + "learning_rate": 1.352765380537618e-06, + "loss": 0.2457, + "step": 3175 + }, + { + "epoch": 0.8332105133873741, + "grad_norm": 6.287288851930004, + "learning_rate": 1.3507372263238901e-06, + "loss": 0.2882, + "step": 3180 + }, + { + "epoch": 0.8345205928109392, + "grad_norm": 5.293879458072892, + "learning_rate": 1.3487074256719608e-06, + "loss": 0.2908, + "step": 3185 + }, + { + "epoch": 0.8358306722345042, + "grad_norm": 3.70662303230532, + "learning_rate": 1.3466759881101637e-06, + "loss": 0.2343, + "step": 3190 + }, + { + "epoch": 0.8371407516580692, + "grad_norm": 5.617247281731303, + "learning_rate": 1.344642923174517e-06, + "loss": 0.3469, + "step": 3195 + }, + { + "epoch": 0.8384508310816343, + "grad_norm": 5.831422377330226, + "learning_rate": 1.3426082404086772e-06, + "loss": 0.3464, + "step": 3200 + }, + { + "epoch": 0.8384508310816343, + "eval_accuracy": 0.7464, + "eval_loss": 0.6823216080665588, + "eval_runtime": 137.8591, + "eval_samples_per_second": 9.067, + "eval_steps_per_second": 2.27, + "step": 3200 + }, + { + "epoch": 0.8397609105051994, + "grad_norm": 5.229782066538766, + "learning_rate": 1.3405719493638959e-06, + "loss": 0.2926, + "step": 3205 + }, + { + "epoch": 0.8410709899287644, + "grad_norm": 1.8257926894676517, + "learning_rate": 1.3385340595989738e-06, + "loss": 0.2532, + "step": 3210 + }, + { + "epoch": 0.8423810693523295, + "grad_norm": 2.1256870704370434, + "learning_rate": 1.3364945806802173e-06, + "loss": 0.2456, + "step": 3215 + }, + { + "epoch": 0.8436911487758946, + "grad_norm": 2.7507619238311065, + "learning_rate": 1.3344535221813915e-06, + "loss": 0.2556, + "step": 3220 + }, + { + "epoch": 0.8450012281994596, + "grad_norm": 3.313724807442175, + "learning_rate": 1.3324108936836775e-06, + "loss": 0.2604, + "step": 3225 + }, + { + "epoch": 0.8463113076230246, + "grad_norm": 3.2583479898589385, + "learning_rate": 1.330366704775625e-06, + "loss": 0.2566, + "step": 3230 + }, + { + "epoch": 0.8476213870465897, + "grad_norm": 4.463854161721075, + "learning_rate": 1.3283209650531098e-06, + "loss": 0.3077, + "step": 3235 + }, + { + "epoch": 0.8489314664701547, + "grad_norm": 2.7758922633618868, + "learning_rate": 1.326273684119287e-06, + "loss": 0.2555, + "step": 3240 + }, + { + "epoch": 0.8502415458937198, + "grad_norm": 6.067311625889626, + "learning_rate": 1.3242248715845468e-06, + "loss": 0.3606, + "step": 3245 + }, + { + "epoch": 0.8515516253172849, + "grad_norm": 2.7176945371959658, + "learning_rate": 1.3221745370664689e-06, + "loss": 0.2035, + "step": 3250 + }, + { + "epoch": 0.8528617047408499, + "grad_norm": 5.014963951139648, + "learning_rate": 1.3201226901897773e-06, + "loss": 0.3122, + "step": 3255 + }, + { + "epoch": 0.854171784164415, + "grad_norm": 3.1409689213203262, + "learning_rate": 1.318069340586296e-06, + "loss": 0.2756, + "step": 3260 + }, + { + "epoch": 0.8554818635879801, + "grad_norm": 2.6726747362164613, + "learning_rate": 1.316014497894902e-06, + "loss": 0.2037, + "step": 3265 + }, + { + "epoch": 0.856791943011545, + "grad_norm": 4.397327066527509, + "learning_rate": 1.3139581717614822e-06, + "loss": 0.2166, + "step": 3270 + }, + { + "epoch": 0.8581020224351101, + "grad_norm": 5.515919631566852, + "learning_rate": 1.311900371838887e-06, + "loss": 0.3015, + "step": 3275 + }, + { + "epoch": 0.8594121018586752, + "grad_norm": 2.8756311830991206, + "learning_rate": 1.3098411077868846e-06, + "loss": 0.2597, + "step": 3280 + }, + { + "epoch": 0.8607221812822402, + "grad_norm": 5.970001237473167, + "learning_rate": 1.3077803892721166e-06, + "loss": 0.2328, + "step": 3285 + }, + { + "epoch": 0.8620322607058053, + "grad_norm": 5.921050843170067, + "learning_rate": 1.3057182259680517e-06, + "loss": 0.235, + "step": 3290 + }, + { + "epoch": 0.8633423401293704, + "grad_norm": 3.740850134478578, + "learning_rate": 1.3036546275549416e-06, + "loss": 0.2827, + "step": 3295 + }, + { + "epoch": 0.8646524195529354, + "grad_norm": 3.5537486719044873, + "learning_rate": 1.3015896037197737e-06, + "loss": 0.2382, + "step": 3300 + }, + { + "epoch": 0.8646524195529354, + "eval_accuracy": 0.7336, + "eval_loss": 0.791848361492157, + "eval_runtime": 137.8386, + "eval_samples_per_second": 9.069, + "eval_steps_per_second": 2.271, + "step": 3300 + }, + { + "epoch": 0.8659624989765005, + "grad_norm": 3.7119905199979706, + "learning_rate": 1.2995231641562276e-06, + "loss": 0.255, + "step": 3305 + }, + { + "epoch": 0.8672725784000656, + "grad_norm": 3.383107638588926, + "learning_rate": 1.2974553185646275e-06, + "loss": 0.2459, + "step": 3310 + }, + { + "epoch": 0.8685826578236305, + "grad_norm": 3.48789577540892, + "learning_rate": 1.295386076651899e-06, + "loss": 0.2969, + "step": 3315 + }, + { + "epoch": 0.8698927372471956, + "grad_norm": 3.9640319112857556, + "learning_rate": 1.2933154481315219e-06, + "loss": 0.2857, + "step": 3320 + }, + { + "epoch": 0.8712028166707607, + "grad_norm": 3.1582073938250077, + "learning_rate": 1.2912434427234841e-06, + "loss": 0.254, + "step": 3325 + }, + { + "epoch": 0.8725128960943257, + "grad_norm": 2.276234259584371, + "learning_rate": 1.289170070154239e-06, + "loss": 0.2445, + "step": 3330 + }, + { + "epoch": 0.8738229755178908, + "grad_norm": 5.00541090993625, + "learning_rate": 1.2870953401566555e-06, + "loss": 0.2843, + "step": 3335 + }, + { + "epoch": 0.8751330549414559, + "grad_norm": 3.159470503361849, + "learning_rate": 1.285019262469976e-06, + "loss": 0.2521, + "step": 3340 + }, + { + "epoch": 0.8764431343650209, + "grad_norm": 5.2597276914314435, + "learning_rate": 1.282941846839769e-06, + "loss": 0.2499, + "step": 3345 + }, + { + "epoch": 0.877753213788586, + "grad_norm": 3.595455023325043, + "learning_rate": 1.2808631030178834e-06, + "loss": 0.2818, + "step": 3350 + }, + { + "epoch": 0.8790632932121509, + "grad_norm": 3.5145264435052934, + "learning_rate": 1.278783040762403e-06, + "loss": 0.3035, + "step": 3355 + }, + { + "epoch": 0.880373372635716, + "grad_norm": 1.9251613140804913, + "learning_rate": 1.2767016698376002e-06, + "loss": 0.2244, + "step": 3360 + }, + { + "epoch": 0.8816834520592811, + "grad_norm": 4.237713316911567, + "learning_rate": 1.2746190000138915e-06, + "loss": 0.2627, + "step": 3365 + }, + { + "epoch": 0.8829935314828461, + "grad_norm": 2.6561528116215474, + "learning_rate": 1.27253504106779e-06, + "loss": 0.273, + "step": 3370 + }, + { + "epoch": 0.8843036109064112, + "grad_norm": 3.109317673581231, + "learning_rate": 1.2704498027818603e-06, + "loss": 0.2651, + "step": 3375 + }, + { + "epoch": 0.8856136903299763, + "grad_norm": 2.0153319806341403, + "learning_rate": 1.2683632949446726e-06, + "loss": 0.2476, + "step": 3380 + }, + { + "epoch": 0.8869237697535413, + "grad_norm": 5.576097183261757, + "learning_rate": 1.266275527350757e-06, + "loss": 0.235, + "step": 3385 + }, + { + "epoch": 0.8882338491771063, + "grad_norm": 3.7365883269489784, + "learning_rate": 1.2641865098005564e-06, + "loss": 0.2446, + "step": 3390 + }, + { + "epoch": 0.8895439286006714, + "grad_norm": 2.5559798536789153, + "learning_rate": 1.2620962521003824e-06, + "loss": 0.2616, + "step": 3395 + }, + { + "epoch": 0.8908540080242364, + "grad_norm": 3.7982545775172545, + "learning_rate": 1.260004764062367e-06, + "loss": 0.3298, + "step": 3400 + }, + { + "epoch": 0.8908540080242364, + "eval_accuracy": 0.7328, + "eval_loss": 0.8060081601142883, + "eval_runtime": 137.0834, + "eval_samples_per_second": 9.119, + "eval_steps_per_second": 2.283, + "step": 3400 + }, + { + "epoch": 0.8921640874478015, + "grad_norm": 3.7011223316651165, + "learning_rate": 1.2579120555044183e-06, + "loss": 0.2734, + "step": 3405 + }, + { + "epoch": 0.8934741668713666, + "grad_norm": 3.1941707471283496, + "learning_rate": 1.2558181362501733e-06, + "loss": 0.2535, + "step": 3410 + }, + { + "epoch": 0.8947842462949316, + "grad_norm": 2.791095240654008, + "learning_rate": 1.2537230161289536e-06, + "loss": 0.264, + "step": 3415 + }, + { + "epoch": 0.8960943257184967, + "grad_norm": 3.6185337902204244, + "learning_rate": 1.2516267049757156e-06, + "loss": 0.2472, + "step": 3420 + }, + { + "epoch": 0.8974044051420618, + "grad_norm": 3.85855921456429, + "learning_rate": 1.249529212631009e-06, + "loss": 0.3052, + "step": 3425 + }, + { + "epoch": 0.8987144845656267, + "grad_norm": 3.6343729382527146, + "learning_rate": 1.247430548940927e-06, + "loss": 0.2441, + "step": 3430 + }, + { + "epoch": 0.9000245639891918, + "grad_norm": 3.2162815125864164, + "learning_rate": 1.2453307237570617e-06, + "loss": 0.2659, + "step": 3435 + }, + { + "epoch": 0.9013346434127569, + "grad_norm": 3.1960651267976896, + "learning_rate": 1.2432297469364569e-06, + "loss": 0.2555, + "step": 3440 + }, + { + "epoch": 0.9026447228363219, + "grad_norm": 3.523205821552062, + "learning_rate": 1.2411276283415638e-06, + "loss": 0.2867, + "step": 3445 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 3.455735398408258, + "learning_rate": 1.2390243778401927e-06, + "loss": 0.2998, + "step": 3450 + }, + { + "epoch": 0.9052648816834521, + "grad_norm": 2.6520324497661822, + "learning_rate": 1.2369200053054663e-06, + "loss": 0.2581, + "step": 3455 + }, + { + "epoch": 0.9065749611070171, + "grad_norm": 2.2579348470775655, + "learning_rate": 1.2348145206157758e-06, + "loss": 0.2196, + "step": 3460 + }, + { + "epoch": 0.9078850405305822, + "grad_norm": 6.681015179121082, + "learning_rate": 1.232707933654732e-06, + "loss": 0.3075, + "step": 3465 + }, + { + "epoch": 0.9091951199541473, + "grad_norm": 4.24745491945866, + "learning_rate": 1.2306002543111215e-06, + "loss": 0.2822, + "step": 3470 + }, + { + "epoch": 0.9105051993777122, + "grad_norm": 2.769756448881865, + "learning_rate": 1.2284914924788568e-06, + "loss": 0.2628, + "step": 3475 + }, + { + "epoch": 0.9118152788012773, + "grad_norm": 2.5369009573995407, + "learning_rate": 1.2263816580569333e-06, + "loss": 0.2338, + "step": 3480 + }, + { + "epoch": 0.9131253582248424, + "grad_norm": 3.165436055957326, + "learning_rate": 1.224270760949381e-06, + "loss": 0.3067, + "step": 3485 + }, + { + "epoch": 0.9144354376484074, + "grad_norm": 3.351223367009085, + "learning_rate": 1.2221588110652183e-06, + "loss": 0.3004, + "step": 3490 + }, + { + "epoch": 0.9157455170719725, + "grad_norm": 3.3328215960308305, + "learning_rate": 1.220045818318406e-06, + "loss": 0.2857, + "step": 3495 + }, + { + "epoch": 0.9170555964955376, + "grad_norm": 5.5246139033024635, + "learning_rate": 1.2179317926277987e-06, + "loss": 0.2715, + "step": 3500 + }, + { + "epoch": 0.9170555964955376, + "eval_accuracy": 0.7488, + "eval_loss": 0.7673412561416626, + "eval_runtime": 137.1036, + "eval_samples_per_second": 9.117, + "eval_steps_per_second": 2.283, + "step": 3500 + }, + { + "epoch": 0.9183656759191026, + "grad_norm": 2.8554155032311366, + "learning_rate": 1.2158167439171026e-06, + "loss": 0.2767, + "step": 3505 + }, + { + "epoch": 0.9196757553426677, + "grad_norm": 3.134856252459499, + "learning_rate": 1.2137006821148234e-06, + "loss": 0.296, + "step": 3510 + }, + { + "epoch": 0.9209858347662327, + "grad_norm": 3.993316808863827, + "learning_rate": 1.2115836171542243e-06, + "loss": 0.3058, + "step": 3515 + }, + { + "epoch": 0.9222959141897977, + "grad_norm": 3.939068373566409, + "learning_rate": 1.2094655589732773e-06, + "loss": 0.2605, + "step": 3520 + }, + { + "epoch": 0.9236059936133628, + "grad_norm": 2.3253660721101825, + "learning_rate": 1.2073465175146159e-06, + "loss": 0.2342, + "step": 3525 + }, + { + "epoch": 0.9249160730369279, + "grad_norm": 4.9098246531853675, + "learning_rate": 1.2052265027254904e-06, + "loss": 0.2824, + "step": 3530 + }, + { + "epoch": 0.9262261524604929, + "grad_norm": 4.153863783851212, + "learning_rate": 1.203105524557719e-06, + "loss": 0.2884, + "step": 3535 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 2.8597337529967786, + "learning_rate": 1.2009835929676435e-06, + "loss": 0.2527, + "step": 3540 + }, + { + "epoch": 0.9288463113076231, + "grad_norm": 3.7214300730819603, + "learning_rate": 1.19886071791608e-06, + "loss": 0.2642, + "step": 3545 + }, + { + "epoch": 0.930156390731188, + "grad_norm": 2.2141165902052826, + "learning_rate": 1.196736909368275e-06, + "loss": 0.1855, + "step": 3550 + }, + { + "epoch": 0.9314664701547531, + "grad_norm": 4.015370663796761, + "learning_rate": 1.1946121772938554e-06, + "loss": 0.2747, + "step": 3555 + }, + { + "epoch": 0.9327765495783182, + "grad_norm": 2.7879854190831366, + "learning_rate": 1.1924865316667839e-06, + "loss": 0.2768, + "step": 3560 + }, + { + "epoch": 0.9340866290018832, + "grad_norm": 10.049756094566998, + "learning_rate": 1.190359982465312e-06, + "loss": 0.2436, + "step": 3565 + }, + { + "epoch": 0.9353967084254483, + "grad_norm": 3.644259827300173, + "learning_rate": 1.1882325396719323e-06, + "loss": 0.2508, + "step": 3570 + }, + { + "epoch": 0.9367067878490134, + "grad_norm": 3.612418668345567, + "learning_rate": 1.1861042132733328e-06, + "loss": 0.269, + "step": 3575 + }, + { + "epoch": 0.9380168672725784, + "grad_norm": 3.7040505997852735, + "learning_rate": 1.1839750132603486e-06, + "loss": 0.2481, + "step": 3580 + }, + { + "epoch": 0.9393269466961435, + "grad_norm": 4.288075933572893, + "learning_rate": 1.1818449496279159e-06, + "loss": 0.2708, + "step": 3585 + }, + { + "epoch": 0.9406370261197085, + "grad_norm": 2.940632362279229, + "learning_rate": 1.1797140323750249e-06, + "loss": 0.2669, + "step": 3590 + }, + { + "epoch": 0.9419471055432735, + "grad_norm": 3.7558809010488394, + "learning_rate": 1.1775822715046736e-06, + "loss": 0.2544, + "step": 3595 + }, + { + "epoch": 0.9432571849668386, + "grad_norm": 3.414999704185975, + "learning_rate": 1.175449677023819e-06, + "loss": 0.2229, + "step": 3600 + }, + { + "epoch": 0.9432571849668386, + "eval_accuracy": 0.7704, + "eval_loss": 0.7259599566459656, + "eval_runtime": 137.2184, + "eval_samples_per_second": 9.11, + "eval_steps_per_second": 2.281, + "step": 3600 + }, + { + "epoch": 0.9445672643904036, + "grad_norm": 3.8559190297316244, + "learning_rate": 1.173316258943332e-06, + "loss": 0.2573, + "step": 3605 + }, + { + "epoch": 0.9458773438139687, + "grad_norm": 3.862447153928952, + "learning_rate": 1.1711820272779497e-06, + "loss": 0.2706, + "step": 3610 + }, + { + "epoch": 0.9471874232375338, + "grad_norm": 3.11275213461627, + "learning_rate": 1.1690469920462276e-06, + "loss": 0.242, + "step": 3615 + }, + { + "epoch": 0.9484975026610988, + "grad_norm": 2.7086439962254265, + "learning_rate": 1.166911163270494e-06, + "loss": 0.2639, + "step": 3620 + }, + { + "epoch": 0.9498075820846639, + "grad_norm": 2.7066495290470205, + "learning_rate": 1.1647745509768025e-06, + "loss": 0.2526, + "step": 3625 + }, + { + "epoch": 0.951117661508229, + "grad_norm": 3.9839085079101735, + "learning_rate": 1.1626371651948836e-06, + "loss": 0.2045, + "step": 3630 + }, + { + "epoch": 0.9524277409317939, + "grad_norm": 4.411550498275121, + "learning_rate": 1.1604990159580998e-06, + "loss": 0.2613, + "step": 3635 + }, + { + "epoch": 0.953737820355359, + "grad_norm": 3.830942956470282, + "learning_rate": 1.1583601133033973e-06, + "loss": 0.3089, + "step": 3640 + }, + { + "epoch": 0.9550478997789241, + "grad_norm": 4.664311559495326, + "learning_rate": 1.1562204672712583e-06, + "loss": 0.2669, + "step": 3645 + }, + { + "epoch": 0.9563579792024891, + "grad_norm": 2.3992079061884515, + "learning_rate": 1.1540800879056554e-06, + "loss": 0.2524, + "step": 3650 + }, + { + "epoch": 0.9576680586260542, + "grad_norm": 2.8182995787216627, + "learning_rate": 1.1519389852540032e-06, + "loss": 0.2641, + "step": 3655 + }, + { + "epoch": 0.9589781380496193, + "grad_norm": 4.0781942860428675, + "learning_rate": 1.1497971693671113e-06, + "loss": 0.2646, + "step": 3660 + }, + { + "epoch": 0.9602882174731843, + "grad_norm": 4.213894313755887, + "learning_rate": 1.147654650299138e-06, + "loss": 0.239, + "step": 3665 + }, + { + "epoch": 0.9615982968967494, + "grad_norm": 2.8090197504712737, + "learning_rate": 1.1455114381075423e-06, + "loss": 0.2587, + "step": 3670 + }, + { + "epoch": 0.9629083763203145, + "grad_norm": 3.858779958604277, + "learning_rate": 1.1433675428530366e-06, + "loss": 0.2865, + "step": 3675 + }, + { + "epoch": 0.9642184557438794, + "grad_norm": 3.865527824121575, + "learning_rate": 1.14122297459954e-06, + "loss": 0.2328, + "step": 3680 + }, + { + "epoch": 0.9655285351674445, + "grad_norm": 1.6264730559645344, + "learning_rate": 1.1390777434141306e-06, + "loss": 0.2631, + "step": 3685 + }, + { + "epoch": 0.9668386145910096, + "grad_norm": 3.1179917555516834, + "learning_rate": 1.1369318593669988e-06, + "loss": 0.2577, + "step": 3690 + }, + { + "epoch": 0.9681486940145746, + "grad_norm": 4.097463115035321, + "learning_rate": 1.1347853325313993e-06, + "loss": 0.2727, + "step": 3695 + }, + { + "epoch": 0.9694587734381397, + "grad_norm": 2.5547154186995145, + "learning_rate": 1.1326381729836045e-06, + "loss": 0.225, + "step": 3700 + }, + { + "epoch": 0.9694587734381397, + "eval_accuracy": 0.7512, + "eval_loss": 0.6980738043785095, + "eval_runtime": 138.6178, + "eval_samples_per_second": 9.018, + "eval_steps_per_second": 2.258, + "step": 3700 + }, + { + "epoch": 0.9707688528617048, + "grad_norm": 2.383269216589878, + "learning_rate": 1.1304903908028569e-06, + "loss": 0.2568, + "step": 3705 + }, + { + "epoch": 0.9720789322852698, + "grad_norm": 4.510012606255057, + "learning_rate": 1.1283419960713212e-06, + "loss": 0.3083, + "step": 3710 + }, + { + "epoch": 0.9733890117088349, + "grad_norm": 2.280997080205734, + "learning_rate": 1.126192998874038e-06, + "loss": 0.2523, + "step": 3715 + }, + { + "epoch": 0.9746990911323999, + "grad_norm": 6.594544711402591, + "learning_rate": 1.1240434092988764e-06, + "loss": 0.2407, + "step": 3720 + }, + { + "epoch": 0.9760091705559649, + "grad_norm": 4.135213461207541, + "learning_rate": 1.1218932374364855e-06, + "loss": 0.2893, + "step": 3725 + }, + { + "epoch": 0.97731924997953, + "grad_norm": 2.861868089040975, + "learning_rate": 1.1197424933802485e-06, + "loss": 0.2204, + "step": 3730 + }, + { + "epoch": 0.9786293294030951, + "grad_norm": 5.266728439797464, + "learning_rate": 1.1175911872262332e-06, + "loss": 0.3179, + "step": 3735 + }, + { + "epoch": 0.9799394088266601, + "grad_norm": 3.5669822038480405, + "learning_rate": 1.1154393290731483e-06, + "loss": 0.2392, + "step": 3740 + }, + { + "epoch": 0.9812494882502252, + "grad_norm": 3.360906639915107, + "learning_rate": 1.1132869290222917e-06, + "loss": 0.2802, + "step": 3745 + }, + { + "epoch": 0.9825595676737903, + "grad_norm": 4.200005976718456, + "learning_rate": 1.111133997177506e-06, + "loss": 0.3154, + "step": 3750 + }, + { + "epoch": 0.9838696470973552, + "grad_norm": 2.262745756657975, + "learning_rate": 1.1089805436451303e-06, + "loss": 0.2222, + "step": 3755 + }, + { + "epoch": 0.9851797265209203, + "grad_norm": 2.6653368398425723, + "learning_rate": 1.1068265785339518e-06, + "loss": 0.2718, + "step": 3760 + }, + { + "epoch": 0.9864898059444854, + "grad_norm": 4.296893472198315, + "learning_rate": 1.1046721119551598e-06, + "loss": 0.3262, + "step": 3765 + }, + { + "epoch": 0.9877998853680504, + "grad_norm": 4.189321257109803, + "learning_rate": 1.1025171540222977e-06, + "loss": 0.2656, + "step": 3770 + }, + { + "epoch": 0.9891099647916155, + "grad_norm": 4.22478244416407, + "learning_rate": 1.1003617148512149e-06, + "loss": 0.2863, + "step": 3775 + }, + { + "epoch": 0.9904200442151806, + "grad_norm": 3.0639361862726995, + "learning_rate": 1.0982058045600205e-06, + "loss": 0.2578, + "step": 3780 + }, + { + "epoch": 0.9917301236387456, + "grad_norm": 3.242022108834711, + "learning_rate": 1.0960494332690342e-06, + "loss": 0.2316, + "step": 3785 + }, + { + "epoch": 0.9930402030623107, + "grad_norm": 3.563673193533161, + "learning_rate": 1.093892611100741e-06, + "loss": 0.2838, + "step": 3790 + }, + { + "epoch": 0.9943502824858758, + "grad_norm": 4.493703507987354, + "learning_rate": 1.0917353481797412e-06, + "loss": 0.2579, + "step": 3795 + }, + { + "epoch": 0.9956603619094407, + "grad_norm": 6.8325710898527054, + "learning_rate": 1.089577654632705e-06, + "loss": 0.2317, + "step": 3800 + }, + { + "epoch": 0.9956603619094407, + "eval_accuracy": 0.7432, + "eval_loss": 0.6873839497566223, + "eval_runtime": 147.4303, + "eval_samples_per_second": 8.479, + "eval_steps_per_second": 2.123, + "step": 3800 + }, + { + "epoch": 0.9969704413330058, + "grad_norm": 3.8200894747277454, + "learning_rate": 1.0874195405883231e-06, + "loss": 0.2404, + "step": 3805 + }, + { + "epoch": 0.9982805207565709, + "grad_norm": 2.60177554342581, + "learning_rate": 1.085261016177261e-06, + "loss": 0.2528, + "step": 3810 + }, + { + "epoch": 0.9995906001801359, + "grad_norm": 2.373688391522338, + "learning_rate": 1.0831020915321109e-06, + "loss": 0.2214, + "step": 3815 + }, + { + "epoch": 1.000900679603701, + "grad_norm": 2.2734563557794907, + "learning_rate": 1.080942776787342e-06, + "loss": 0.1921, + "step": 3820 + }, + { + "epoch": 1.002210759027266, + "grad_norm": 2.3069259940839606, + "learning_rate": 1.0787830820792566e-06, + "loss": 0.2056, + "step": 3825 + }, + { + "epoch": 1.0035208384508312, + "grad_norm": 6.478254156842445, + "learning_rate": 1.0766230175459394e-06, + "loss": 0.1716, + "step": 3830 + }, + { + "epoch": 1.0048309178743962, + "grad_norm": 2.849832491085133, + "learning_rate": 1.0744625933272118e-06, + "loss": 0.1632, + "step": 3835 + }, + { + "epoch": 1.0061409972979611, + "grad_norm": 3.009110354194684, + "learning_rate": 1.0723018195645835e-06, + "loss": 0.1915, + "step": 3840 + }, + { + "epoch": 1.0074510767215263, + "grad_norm": 7.183583057327153, + "learning_rate": 1.070140706401205e-06, + "loss": 0.1776, + "step": 3845 + }, + { + "epoch": 1.0087611561450913, + "grad_norm": 7.645123151827477, + "learning_rate": 1.0679792639818199e-06, + "loss": 0.2206, + "step": 3850 + }, + { + "epoch": 1.0100712355686563, + "grad_norm": 2.0069646269660875, + "learning_rate": 1.0658175024527175e-06, + "loss": 0.1073, + "step": 3855 + }, + { + "epoch": 1.0113813149922215, + "grad_norm": 2.0486815490375325, + "learning_rate": 1.0636554319616853e-06, + "loss": 0.1817, + "step": 3860 + }, + { + "epoch": 1.0126913944157865, + "grad_norm": 7.2346007294382435, + "learning_rate": 1.0614930626579603e-06, + "loss": 0.2206, + "step": 3865 + }, + { + "epoch": 1.0140014738393515, + "grad_norm": 4.437819204972086, + "learning_rate": 1.0593304046921838e-06, + "loss": 0.1944, + "step": 3870 + }, + { + "epoch": 1.0153115532629167, + "grad_norm": 4.917690349323733, + "learning_rate": 1.0571674682163504e-06, + "loss": 0.1716, + "step": 3875 + }, + { + "epoch": 1.0166216326864816, + "grad_norm": 4.465444787397066, + "learning_rate": 1.0550042633837629e-06, + "loss": 0.1873, + "step": 3880 + }, + { + "epoch": 1.0179317121100466, + "grad_norm": 5.034120197481139, + "learning_rate": 1.052840800348984e-06, + "loss": 0.1971, + "step": 3885 + }, + { + "epoch": 1.0192417915336118, + "grad_norm": 4.650074157777241, + "learning_rate": 1.050677089267788e-06, + "loss": 0.1936, + "step": 3890 + }, + { + "epoch": 1.0205518709571768, + "grad_norm": 2.482941440321424, + "learning_rate": 1.0485131402971142e-06, + "loss": 0.1653, + "step": 3895 + }, + { + "epoch": 1.0218619503807418, + "grad_norm": 2.852065540223939, + "learning_rate": 1.0463489635950179e-06, + "loss": 0.1846, + "step": 3900 + }, + { + "epoch": 1.0218619503807418, + "eval_accuracy": 0.7488, + "eval_loss": 0.8795240521430969, + "eval_runtime": 142.5175, + "eval_samples_per_second": 8.771, + "eval_steps_per_second": 2.196, + "step": 3900 + }, + { + "epoch": 1.023172029804307, + "grad_norm": 1.8616921778346158, + "learning_rate": 1.0441845693206237e-06, + "loss": 0.1646, + "step": 3905 + }, + { + "epoch": 1.024482109227872, + "grad_norm": 2.432644973681949, + "learning_rate": 1.0420199676340777e-06, + "loss": 0.1653, + "step": 3910 + }, + { + "epoch": 1.025792188651437, + "grad_norm": 6.272988090712415, + "learning_rate": 1.0398551686964993e-06, + "loss": 0.181, + "step": 3915 + }, + { + "epoch": 1.0271022680750022, + "grad_norm": 5.836433358300422, + "learning_rate": 1.0376901826699347e-06, + "loss": 0.225, + "step": 3920 + }, + { + "epoch": 1.0284123474985671, + "grad_norm": 4.8783777162924205, + "learning_rate": 1.0355250197173066e-06, + "loss": 0.193, + "step": 3925 + }, + { + "epoch": 1.0297224269221321, + "grad_norm": 3.7860774323233537, + "learning_rate": 1.0333596900023702e-06, + "loss": 0.1351, + "step": 3930 + }, + { + "epoch": 1.031032506345697, + "grad_norm": 3.3891904081042243, + "learning_rate": 1.0311942036896623e-06, + "loss": 0.1365, + "step": 3935 + }, + { + "epoch": 1.0323425857692623, + "grad_norm": 4.255144512646531, + "learning_rate": 1.0290285709444556e-06, + "loss": 0.1947, + "step": 3940 + }, + { + "epoch": 1.0336526651928273, + "grad_norm": 3.551773880971244, + "learning_rate": 1.0268628019327088e-06, + "loss": 0.1691, + "step": 3945 + }, + { + "epoch": 1.0349627446163923, + "grad_norm": 3.21813090542831, + "learning_rate": 1.0246969068210217e-06, + "loss": 0.1839, + "step": 3950 + }, + { + "epoch": 1.0362728240399575, + "grad_norm": 2.166114985115052, + "learning_rate": 1.022530895776586e-06, + "loss": 0.1386, + "step": 3955 + }, + { + "epoch": 1.0375829034635224, + "grad_norm": 3.7664288882285573, + "learning_rate": 1.0203647789671364e-06, + "loss": 0.1829, + "step": 3960 + }, + { + "epoch": 1.0388929828870874, + "grad_norm": 4.568624206840217, + "learning_rate": 1.0181985665609051e-06, + "loss": 0.1606, + "step": 3965 + }, + { + "epoch": 1.0402030623106526, + "grad_norm": 4.12795033279393, + "learning_rate": 1.0160322687265728e-06, + "loss": 0.2144, + "step": 3970 + }, + { + "epoch": 1.0415131417342176, + "grad_norm": 6.066279389724659, + "learning_rate": 1.013865895633221e-06, + "loss": 0.153, + "step": 3975 + }, + { + "epoch": 1.0428232211577826, + "grad_norm": 5.99335728744553, + "learning_rate": 1.0116994574502853e-06, + "loss": 0.1776, + "step": 3980 + }, + { + "epoch": 1.0441333005813478, + "grad_norm": 3.323982778252669, + "learning_rate": 1.0095329643475056e-06, + "loss": 0.1258, + "step": 3985 + }, + { + "epoch": 1.0454433800049128, + "grad_norm": 4.0821616859221415, + "learning_rate": 1.0073664264948803e-06, + "loss": 0.141, + "step": 3990 + }, + { + "epoch": 1.0467534594284778, + "grad_norm": 4.303578582045898, + "learning_rate": 1.005199854062618e-06, + "loss": 0.1888, + "step": 3995 + }, + { + "epoch": 1.048063538852043, + "grad_norm": 6.9723723361771865, + "learning_rate": 1.0030332572210896e-06, + "loss": 0.1624, + "step": 4000 + }, + { + "epoch": 1.048063538852043, + "eval_accuracy": 0.748, + "eval_loss": 1.1003224849700928, + "eval_runtime": 138.6989, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 2.257, + "step": 4000 + }, + { + "epoch": 1.049373618275608, + "grad_norm": 4.415041744009451, + "learning_rate": 1.00086664614078e-06, + "loss": 0.167, + "step": 4005 + }, + { + "epoch": 1.050683697699173, + "grad_norm": 1.7135670096102869, + "learning_rate": 9.987000309922417e-07, + "loss": 0.1711, + "step": 4010 + }, + { + "epoch": 1.0519937771227381, + "grad_norm": 7.478625018041275, + "learning_rate": 9.965334219460455e-07, + "loss": 0.1731, + "step": 4015 + }, + { + "epoch": 1.053303856546303, + "grad_norm": 2.7314624626438206, + "learning_rate": 9.943668291727344e-07, + "loss": 0.1859, + "step": 4020 + }, + { + "epoch": 1.054613935969868, + "grad_norm": 5.770554889506936, + "learning_rate": 9.922002628427742e-07, + "loss": 0.1597, + "step": 4025 + }, + { + "epoch": 1.0559240153934333, + "grad_norm": 3.3610794660047834, + "learning_rate": 9.900337331265077e-07, + "loss": 0.187, + "step": 4030 + }, + { + "epoch": 1.0572340948169983, + "grad_norm": 5.42834738035381, + "learning_rate": 9.878672501941045e-07, + "loss": 0.1698, + "step": 4035 + }, + { + "epoch": 1.0585441742405632, + "grad_norm": 2.9976736613034665, + "learning_rate": 9.857008242155152e-07, + "loss": 0.1254, + "step": 4040 + }, + { + "epoch": 1.0598542536641284, + "grad_norm": 4.911495425759969, + "learning_rate": 9.83534465360423e-07, + "loss": 0.136, + "step": 4045 + }, + { + "epoch": 1.0611643330876934, + "grad_norm": 3.02741302534027, + "learning_rate": 9.813681837981966e-07, + "loss": 0.1938, + "step": 4050 + }, + { + "epoch": 1.0624744125112584, + "grad_norm": 11.241222743138435, + "learning_rate": 9.792019896978412e-07, + "loss": 0.1745, + "step": 4055 + }, + { + "epoch": 1.0637844919348236, + "grad_norm": 3.613601159548242, + "learning_rate": 9.77035893227951e-07, + "loss": 0.1792, + "step": 4060 + }, + { + "epoch": 1.0650945713583886, + "grad_norm": 2.560108792032753, + "learning_rate": 9.748699045566625e-07, + "loss": 0.173, + "step": 4065 + }, + { + "epoch": 1.0664046507819536, + "grad_norm": 4.2940503388019495, + "learning_rate": 9.727040338516066e-07, + "loss": 0.1496, + "step": 4070 + }, + { + "epoch": 1.0677147302055188, + "grad_norm": 4.675724374175031, + "learning_rate": 9.705382912798596e-07, + "loss": 0.2138, + "step": 4075 + }, + { + "epoch": 1.0690248096290837, + "grad_norm": 5.628200856430463, + "learning_rate": 9.683726870078971e-07, + "loss": 0.2194, + "step": 4080 + }, + { + "epoch": 1.0703348890526487, + "grad_norm": 5.2076046235205125, + "learning_rate": 9.662072312015445e-07, + "loss": 0.2401, + "step": 4085 + }, + { + "epoch": 1.071644968476214, + "grad_norm": 1.9418784608600421, + "learning_rate": 9.640419340259311e-07, + "loss": 0.1514, + "step": 4090 + }, + { + "epoch": 1.072955047899779, + "grad_norm": 7.120141596864124, + "learning_rate": 9.618768056454415e-07, + "loss": 0.157, + "step": 4095 + }, + { + "epoch": 1.074265127323344, + "grad_norm": 3.705216890309473, + "learning_rate": 9.597118562236679e-07, + "loss": 0.1456, + "step": 4100 + }, + { + "epoch": 1.074265127323344, + "eval_accuracy": 0.7608, + "eval_loss": 0.9698547124862671, + "eval_runtime": 139.0187, + "eval_samples_per_second": 8.992, + "eval_steps_per_second": 2.251, + "step": 4100 + }, + { + "epoch": 1.075575206746909, + "grad_norm": 3.085456133975227, + "learning_rate": 9.575470959233612e-07, + "loss": 0.1856, + "step": 4105 + }, + { + "epoch": 1.076885286170474, + "grad_norm": 1.823222530695256, + "learning_rate": 9.553825349063864e-07, + "loss": 0.1667, + "step": 4110 + }, + { + "epoch": 1.078195365594039, + "grad_norm": 3.7383958453668096, + "learning_rate": 9.532181833336721e-07, + "loss": 0.1391, + "step": 4115 + }, + { + "epoch": 1.0795054450176043, + "grad_norm": 3.8448773992307514, + "learning_rate": 9.510540513651637e-07, + "loss": 0.1542, + "step": 4120 + }, + { + "epoch": 1.0808155244411692, + "grad_norm": 4.78668688388694, + "learning_rate": 9.488901491597761e-07, + "loss": 0.1696, + "step": 4125 + }, + { + "epoch": 1.0821256038647342, + "grad_norm": 3.2055677447348923, + "learning_rate": 9.46726486875345e-07, + "loss": 0.2188, + "step": 4130 + }, + { + "epoch": 1.0834356832882994, + "grad_norm": 5.891308978767123, + "learning_rate": 9.445630746685806e-07, + "loss": 0.1885, + "step": 4135 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 4.8421132337125545, + "learning_rate": 9.423999226950185e-07, + "loss": 0.1609, + "step": 4140 + }, + { + "epoch": 1.0860558421354294, + "grad_norm": 5.9154276009344136, + "learning_rate": 9.402370411089732e-07, + "loss": 0.1527, + "step": 4145 + }, + { + "epoch": 1.0873659215589946, + "grad_norm": 7.1723306345970865, + "learning_rate": 9.380744400634903e-07, + "loss": 0.1594, + "step": 4150 + }, + { + "epoch": 1.0886760009825596, + "grad_norm": 3.6603352773642106, + "learning_rate": 9.35912129710297e-07, + "loss": 0.1706, + "step": 4155 + }, + { + "epoch": 1.0899860804061245, + "grad_norm": 1.8190671058733112, + "learning_rate": 9.337501201997573e-07, + "loss": 0.1687, + "step": 4160 + }, + { + "epoch": 1.0912961598296897, + "grad_norm": 6.103828451150774, + "learning_rate": 9.315884216808226e-07, + "loss": 0.1543, + "step": 4165 + }, + { + "epoch": 1.0926062392532547, + "grad_norm": 3.51452822849322, + "learning_rate": 9.294270443009847e-07, + "loss": 0.168, + "step": 4170 + }, + { + "epoch": 1.0939163186768197, + "grad_norm": 3.9256660164579267, + "learning_rate": 9.27265998206227e-07, + "loss": 0.1447, + "step": 4175 + }, + { + "epoch": 1.095226398100385, + "grad_norm": 6.647259650468985, + "learning_rate": 9.251052935409783e-07, + "loss": 0.219, + "step": 4180 + }, + { + "epoch": 1.09653647752395, + "grad_norm": 2.8205270117655767, + "learning_rate": 9.229449404480653e-07, + "loss": 0.1496, + "step": 4185 + }, + { + "epoch": 1.0978465569475149, + "grad_norm": 6.128991520594209, + "learning_rate": 9.207849490686636e-07, + "loss": 0.2047, + "step": 4190 + }, + { + "epoch": 1.09915663637108, + "grad_norm": 4.140511690244823, + "learning_rate": 9.186253295422514e-07, + "loss": 0.2245, + "step": 4195 + }, + { + "epoch": 1.100466715794645, + "grad_norm": 2.0987997572217303, + "learning_rate": 9.1646609200656e-07, + "loss": 0.1966, + "step": 4200 + }, + { + "epoch": 1.100466715794645, + "eval_accuracy": 0.7504, + "eval_loss": 0.9615470767021179, + "eval_runtime": 145.8533, + "eval_samples_per_second": 8.57, + "eval_steps_per_second": 2.146, + "step": 4200 + }, + { + "epoch": 1.10177679521821, + "grad_norm": 6.7544463505960195, + "learning_rate": 9.14307246597529e-07, + "loss": 0.1695, + "step": 4205 + }, + { + "epoch": 1.1030868746417752, + "grad_norm": 3.881058148850053, + "learning_rate": 9.121488034492568e-07, + "loss": 0.1736, + "step": 4210 + }, + { + "epoch": 1.1043969540653402, + "grad_norm": 5.101818081783207, + "learning_rate": 9.099907726939533e-07, + "loss": 0.2124, + "step": 4215 + }, + { + "epoch": 1.1057070334889052, + "grad_norm": 4.13924770593388, + "learning_rate": 9.078331644618934e-07, + "loss": 0.149, + "step": 4220 + }, + { + "epoch": 1.1070171129124704, + "grad_norm": 4.56496678907466, + "learning_rate": 9.056759888813668e-07, + "loss": 0.1696, + "step": 4225 + }, + { + "epoch": 1.1083271923360354, + "grad_norm": 6.020881682990206, + "learning_rate": 9.035192560786338e-07, + "loss": 0.2085, + "step": 4230 + }, + { + "epoch": 1.1096372717596004, + "grad_norm": 3.020245719833497, + "learning_rate": 9.013629761778757e-07, + "loss": 0.1503, + "step": 4235 + }, + { + "epoch": 1.1109473511831656, + "grad_norm": 5.745920685976253, + "learning_rate": 8.99207159301148e-07, + "loss": 0.1883, + "step": 4240 + }, + { + "epoch": 1.1122574306067305, + "grad_norm": 6.338038152438336, + "learning_rate": 8.970518155683324e-07, + "loss": 0.1612, + "step": 4245 + }, + { + "epoch": 1.1135675100302955, + "grad_norm": 4.337625038324552, + "learning_rate": 8.948969550970894e-07, + "loss": 0.1276, + "step": 4250 + }, + { + "epoch": 1.1148775894538607, + "grad_norm": 5.983331324725646, + "learning_rate": 8.927425880028113e-07, + "loss": 0.1572, + "step": 4255 + }, + { + "epoch": 1.1161876688774257, + "grad_norm": 5.471826595967007, + "learning_rate": 8.905887243985743e-07, + "loss": 0.1733, + "step": 4260 + }, + { + "epoch": 1.1174977483009907, + "grad_norm": 9.369736563628495, + "learning_rate": 8.884353743950915e-07, + "loss": 0.1768, + "step": 4265 + }, + { + "epoch": 1.118807827724556, + "grad_norm": 4.302075004357904, + "learning_rate": 8.862825481006637e-07, + "loss": 0.1676, + "step": 4270 + }, + { + "epoch": 1.1201179071481209, + "grad_norm": 3.995934439864761, + "learning_rate": 8.841302556211348e-07, + "loss": 0.1556, + "step": 4275 + }, + { + "epoch": 1.1214279865716859, + "grad_norm": 5.71645655203888, + "learning_rate": 8.81978507059842e-07, + "loss": 0.173, + "step": 4280 + }, + { + "epoch": 1.122738065995251, + "grad_norm": 3.863778158057957, + "learning_rate": 8.798273125175697e-07, + "loss": 0.1905, + "step": 4285 + }, + { + "epoch": 1.124048145418816, + "grad_norm": 8.652156181110675, + "learning_rate": 8.776766820925016e-07, + "loss": 0.2137, + "step": 4290 + }, + { + "epoch": 1.125358224842381, + "grad_norm": 4.502859011012491, + "learning_rate": 8.755266258801725e-07, + "loss": 0.1615, + "step": 4295 + }, + { + "epoch": 1.1266683042659462, + "grad_norm": 6.956098402947135, + "learning_rate": 8.73377153973423e-07, + "loss": 0.203, + "step": 4300 + }, + { + "epoch": 1.1266683042659462, + "eval_accuracy": 0.7576, + "eval_loss": 1.0583382844924927, + "eval_runtime": 149.7241, + "eval_samples_per_second": 8.349, + "eval_steps_per_second": 2.091, + "step": 4300 + }, + { + "epoch": 1.1279783836895112, + "grad_norm": 4.881012007634449, + "learning_rate": 8.712282764623495e-07, + "loss": 0.1625, + "step": 4305 + }, + { + "epoch": 1.1292884631130762, + "grad_norm": 2.1915201124191523, + "learning_rate": 8.690800034342593e-07, + "loss": 0.1598, + "step": 4310 + }, + { + "epoch": 1.1305985425366414, + "grad_norm": 6.988605240677998, + "learning_rate": 8.669323449736223e-07, + "loss": 0.1763, + "step": 4315 + }, + { + "epoch": 1.1319086219602064, + "grad_norm": 8.099073728074591, + "learning_rate": 8.647853111620213e-07, + "loss": 0.2026, + "step": 4320 + }, + { + "epoch": 1.1332187013837713, + "grad_norm": 6.4285970895379, + "learning_rate": 8.626389120781096e-07, + "loss": 0.1622, + "step": 4325 + }, + { + "epoch": 1.1345287808073365, + "grad_norm": 2.9631902369392145, + "learning_rate": 8.604931577975591e-07, + "loss": 0.1983, + "step": 4330 + }, + { + "epoch": 1.1358388602309015, + "grad_norm": 1.8505065261163445, + "learning_rate": 8.583480583930162e-07, + "loss": 0.1276, + "step": 4335 + }, + { + "epoch": 1.1371489396544665, + "grad_norm": 3.4127484907896175, + "learning_rate": 8.562036239340519e-07, + "loss": 0.1559, + "step": 4340 + }, + { + "epoch": 1.1384590190780317, + "grad_norm": 4.817799074804773, + "learning_rate": 8.540598644871166e-07, + "loss": 0.2032, + "step": 4345 + }, + { + "epoch": 1.1397690985015967, + "grad_norm": 3.9208306819766907, + "learning_rate": 8.519167901154915e-07, + "loss": 0.1249, + "step": 4350 + }, + { + "epoch": 1.1410791779251617, + "grad_norm": 3.258083851351543, + "learning_rate": 8.497744108792429e-07, + "loss": 0.167, + "step": 4355 + }, + { + "epoch": 1.1423892573487269, + "grad_norm": 5.712624458894274, + "learning_rate": 8.476327368351731e-07, + "loss": 0.1821, + "step": 4360 + }, + { + "epoch": 1.1436993367722919, + "grad_norm": 5.622711654282518, + "learning_rate": 8.454917780367738e-07, + "loss": 0.1426, + "step": 4365 + }, + { + "epoch": 1.1450094161958568, + "grad_norm": 2.9714920706791603, + "learning_rate": 8.433515445341798e-07, + "loss": 0.1508, + "step": 4370 + }, + { + "epoch": 1.146319495619422, + "grad_norm": 8.145950090881742, + "learning_rate": 8.412120463741213e-07, + "loss": 0.1911, + "step": 4375 + }, + { + "epoch": 1.147629575042987, + "grad_norm": 7.816310979213919, + "learning_rate": 8.390732935998762e-07, + "loss": 0.1972, + "step": 4380 + }, + { + "epoch": 1.148939654466552, + "grad_norm": 10.997788887744328, + "learning_rate": 8.369352962512241e-07, + "loss": 0.2195, + "step": 4385 + }, + { + "epoch": 1.1502497338901172, + "grad_norm": 2.7493033712936668, + "learning_rate": 8.347980643643972e-07, + "loss": 0.1853, + "step": 4390 + }, + { + "epoch": 1.1515598133136822, + "grad_norm": 3.181837621729024, + "learning_rate": 8.326616079720356e-07, + "loss": 0.1779, + "step": 4395 + }, + { + "epoch": 1.1528698927372472, + "grad_norm": 3.4603211671345084, + "learning_rate": 8.305259371031385e-07, + "loss": 0.1975, + "step": 4400 + }, + { + "epoch": 1.1528698927372472, + "eval_accuracy": 0.756, + "eval_loss": 0.9896759986877441, + "eval_runtime": 146.8037, + "eval_samples_per_second": 8.515, + "eval_steps_per_second": 2.132, + "step": 4400 + }, + { + "epoch": 1.1541799721608124, + "grad_norm": 6.84507170824882, + "learning_rate": 8.283910617830185e-07, + "loss": 0.2055, + "step": 4405 + }, + { + "epoch": 1.1554900515843773, + "grad_norm": 2.9623090673688295, + "learning_rate": 8.262569920332522e-07, + "loss": 0.1344, + "step": 4410 + }, + { + "epoch": 1.1568001310079423, + "grad_norm": 2.676649590370874, + "learning_rate": 8.241237378716357e-07, + "loss": 0.1341, + "step": 4415 + }, + { + "epoch": 1.1581102104315075, + "grad_norm": 3.3462527937569284, + "learning_rate": 8.219913093121367e-07, + "loss": 0.1479, + "step": 4420 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 4.922809803079367, + "learning_rate": 8.198597163648466e-07, + "loss": 0.1377, + "step": 4425 + }, + { + "epoch": 1.1607303692786375, + "grad_norm": 7.567987448628956, + "learning_rate": 8.177289690359354e-07, + "loss": 0.2551, + "step": 4430 + }, + { + "epoch": 1.1620404487022027, + "grad_norm": 4.269347526317818, + "learning_rate": 8.155990773276022e-07, + "loss": 0.1511, + "step": 4435 + }, + { + "epoch": 1.1633505281257677, + "grad_norm": 5.196415940319941, + "learning_rate": 8.134700512380304e-07, + "loss": 0.2124, + "step": 4440 + }, + { + "epoch": 1.1646606075493326, + "grad_norm": 4.346263990105768, + "learning_rate": 8.113419007613399e-07, + "loss": 0.1708, + "step": 4445 + }, + { + "epoch": 1.1659706869728979, + "grad_norm": 4.993849508082759, + "learning_rate": 8.092146358875405e-07, + "loss": 0.147, + "step": 4450 + }, + { + "epoch": 1.1672807663964628, + "grad_norm": 4.5209534974011945, + "learning_rate": 8.070882666024847e-07, + "loss": 0.1311, + "step": 4455 + }, + { + "epoch": 1.1685908458200278, + "grad_norm": 9.122486214875625, + "learning_rate": 8.049628028878199e-07, + "loss": 0.179, + "step": 4460 + }, + { + "epoch": 1.169900925243593, + "grad_norm": 4.302140098325049, + "learning_rate": 8.02838254720944e-07, + "loss": 0.1912, + "step": 4465 + }, + { + "epoch": 1.171211004667158, + "grad_norm": 5.885855438044166, + "learning_rate": 8.007146320749565e-07, + "loss": 0.209, + "step": 4470 + }, + { + "epoch": 1.172521084090723, + "grad_norm": 5.113483218057625, + "learning_rate": 7.985919449186122e-07, + "loss": 0.138, + "step": 4475 + }, + { + "epoch": 1.1738311635142882, + "grad_norm": 7.107404436549728, + "learning_rate": 7.964702032162748e-07, + "loss": 0.1443, + "step": 4480 + }, + { + "epoch": 1.1751412429378532, + "grad_norm": 5.613324141288739, + "learning_rate": 7.943494169278694e-07, + "loss": 0.1659, + "step": 4485 + }, + { + "epoch": 1.1764513223614181, + "grad_norm": 7.621228727692981, + "learning_rate": 7.922295960088366e-07, + "loss": 0.2055, + "step": 4490 + }, + { + "epoch": 1.1777614017849831, + "grad_norm": 5.157419189737596, + "learning_rate": 7.901107504100851e-07, + "loss": 0.1951, + "step": 4495 + }, + { + "epoch": 1.1790714812085483, + "grad_norm": 6.255459111664694, + "learning_rate": 7.879928900779455e-07, + "loss": 0.1878, + "step": 4500 + }, + { + "epoch": 1.1790714812085483, + "eval_accuracy": 0.7544, + "eval_loss": 1.0830539464950562, + "eval_runtime": 147.9333, + "eval_samples_per_second": 8.45, + "eval_steps_per_second": 2.116, + "step": 4500 + }, + { + "epoch": 1.1803815606321133, + "grad_norm": 5.071030425233318, + "learning_rate": 7.858760249541227e-07, + "loss": 0.1376, + "step": 4505 + }, + { + "epoch": 1.1816916400556783, + "grad_norm": 4.613895284146258, + "learning_rate": 7.837601649756507e-07, + "loss": 0.1871, + "step": 4510 + }, + { + "epoch": 1.1830017194792435, + "grad_norm": 5.343062025394728, + "learning_rate": 7.816453200748445e-07, + "loss": 0.1557, + "step": 4515 + }, + { + "epoch": 1.1843117989028085, + "grad_norm": 2.3488119081864878, + "learning_rate": 7.795315001792545e-07, + "loss": 0.1275, + "step": 4520 + }, + { + "epoch": 1.1856218783263734, + "grad_norm": 7.211896790191278, + "learning_rate": 7.774187152116195e-07, + "loss": 0.1795, + "step": 4525 + }, + { + "epoch": 1.1869319577499386, + "grad_norm": 6.629116565849999, + "learning_rate": 7.753069750898195e-07, + "loss": 0.1694, + "step": 4530 + }, + { + "epoch": 1.1882420371735036, + "grad_norm": 6.7686458510507945, + "learning_rate": 7.731962897268304e-07, + "loss": 0.1823, + "step": 4535 + }, + { + "epoch": 1.1895521165970686, + "grad_norm": 9.499227541023666, + "learning_rate": 7.710866690306767e-07, + "loss": 0.1973, + "step": 4540 + }, + { + "epoch": 1.1908621960206338, + "grad_norm": 5.305703998887908, + "learning_rate": 7.689781229043852e-07, + "loss": 0.1417, + "step": 4545 + }, + { + "epoch": 1.1921722754441988, + "grad_norm": 6.807389274265624, + "learning_rate": 7.668706612459386e-07, + "loss": 0.1309, + "step": 4550 + }, + { + "epoch": 1.1934823548677638, + "grad_norm": 2.37846342555138, + "learning_rate": 7.647642939482276e-07, + "loss": 0.2224, + "step": 4555 + }, + { + "epoch": 1.194792434291329, + "grad_norm": 7.282824648401145, + "learning_rate": 7.626590308990073e-07, + "loss": 0.1746, + "step": 4560 + }, + { + "epoch": 1.196102513714894, + "grad_norm": 10.05223632714189, + "learning_rate": 7.605548819808485e-07, + "loss": 0.1777, + "step": 4565 + }, + { + "epoch": 1.197412593138459, + "grad_norm": 3.5055904042182653, + "learning_rate": 7.584518570710923e-07, + "loss": 0.182, + "step": 4570 + }, + { + "epoch": 1.1987226725620241, + "grad_norm": 5.58019032163587, + "learning_rate": 7.56349966041803e-07, + "loss": 0.1708, + "step": 4575 + }, + { + "epoch": 1.2000327519855891, + "grad_norm": 4.895119266207443, + "learning_rate": 7.542492187597227e-07, + "loss": 0.1614, + "step": 4580 + }, + { + "epoch": 1.201342831409154, + "grad_norm": 6.978583024065426, + "learning_rate": 7.52149625086224e-07, + "loss": 0.1561, + "step": 4585 + }, + { + "epoch": 1.2026529108327193, + "grad_norm": 6.610019216750305, + "learning_rate": 7.500511948772649e-07, + "loss": 0.1557, + "step": 4590 + }, + { + "epoch": 1.2039629902562843, + "grad_norm": 8.135947467914335, + "learning_rate": 7.479539379833417e-07, + "loss": 0.1616, + "step": 4595 + }, + { + "epoch": 1.2052730696798493, + "grad_norm": 3.5915184810255667, + "learning_rate": 7.458578642494417e-07, + "loss": 0.1177, + "step": 4600 + }, + { + "epoch": 1.2052730696798493, + "eval_accuracy": 0.7656, + "eval_loss": 1.0870707035064697, + "eval_runtime": 143.0548, + "eval_samples_per_second": 8.738, + "eval_steps_per_second": 2.188, + "step": 4600 + }, + { + "epoch": 1.2065831491034145, + "grad_norm": 8.379654070687748, + "learning_rate": 7.437629835149997e-07, + "loss": 0.1494, + "step": 4605 + }, + { + "epoch": 1.2078932285269794, + "grad_norm": 5.7531869850518165, + "learning_rate": 7.416693056138496e-07, + "loss": 0.15, + "step": 4610 + }, + { + "epoch": 1.2092033079505444, + "grad_norm": 4.807799748254105, + "learning_rate": 7.395768403741793e-07, + "loss": 0.1665, + "step": 4615 + }, + { + "epoch": 1.2105133873741096, + "grad_norm": 5.192570962379375, + "learning_rate": 7.37485597618484e-07, + "loss": 0.1866, + "step": 4620 + }, + { + "epoch": 1.2118234667976746, + "grad_norm": 7.151185425257774, + "learning_rate": 7.353955871635194e-07, + "loss": 0.1781, + "step": 4625 + }, + { + "epoch": 1.2131335462212396, + "grad_norm": 7.814709553590773, + "learning_rate": 7.33306818820258e-07, + "loss": 0.1362, + "step": 4630 + }, + { + "epoch": 1.2144436256448048, + "grad_norm": 4.846484187047676, + "learning_rate": 7.312193023938411e-07, + "loss": 0.1624, + "step": 4635 + }, + { + "epoch": 1.2157537050683698, + "grad_norm": 4.026530959170853, + "learning_rate": 7.291330476835327e-07, + "loss": 0.1428, + "step": 4640 + }, + { + "epoch": 1.2170637844919348, + "grad_norm": 4.135011637724928, + "learning_rate": 7.270480644826749e-07, + "loss": 0.1685, + "step": 4645 + }, + { + "epoch": 1.2183738639155, + "grad_norm": 4.129753177959848, + "learning_rate": 7.249643625786396e-07, + "loss": 0.1385, + "step": 4650 + }, + { + "epoch": 1.219683943339065, + "grad_norm": 3.5058969060009972, + "learning_rate": 7.228819517527853e-07, + "loss": 0.1573, + "step": 4655 + }, + { + "epoch": 1.22099402276263, + "grad_norm": 5.141143930253066, + "learning_rate": 7.208008417804097e-07, + "loss": 0.1667, + "step": 4660 + }, + { + "epoch": 1.2223041021861951, + "grad_norm": 5.713198603618221, + "learning_rate": 7.18721042430704e-07, + "loss": 0.1665, + "step": 4665 + }, + { + "epoch": 1.22361418160976, + "grad_norm": 6.881035052397601, + "learning_rate": 7.166425634667061e-07, + "loss": 0.0995, + "step": 4670 + }, + { + "epoch": 1.224924261033325, + "grad_norm": 9.101390365069749, + "learning_rate": 7.14565414645257e-07, + "loss": 0.1738, + "step": 4675 + }, + { + "epoch": 1.2262343404568903, + "grad_norm": 4.609339453067137, + "learning_rate": 7.124896057169532e-07, + "loss": 0.1568, + "step": 4680 + }, + { + "epoch": 1.2275444198804553, + "grad_norm": 4.95075728149686, + "learning_rate": 7.104151464261012e-07, + "loss": 0.1443, + "step": 4685 + }, + { + "epoch": 1.2288544993040202, + "grad_norm": 5.718250117967298, + "learning_rate": 7.083420465106727e-07, + "loss": 0.145, + "step": 4690 + }, + { + "epoch": 1.2301645787275854, + "grad_norm": 4.025790690698405, + "learning_rate": 7.062703157022571e-07, + "loss": 0.2297, + "step": 4695 + }, + { + "epoch": 1.2314746581511504, + "grad_norm": 5.85692684018953, + "learning_rate": 7.041999637260179e-07, + "loss": 0.1599, + "step": 4700 + }, + { + "epoch": 1.2314746581511504, + "eval_accuracy": 0.7528, + "eval_loss": 1.1270846128463745, + "eval_runtime": 144.8062, + "eval_samples_per_second": 8.632, + "eval_steps_per_second": 2.162, + "step": 4700 + }, + { + "epoch": 1.2327847375747154, + "grad_norm": 3.639396650071254, + "learning_rate": 7.021310003006458e-07, + "loss": 0.1767, + "step": 4705 + }, + { + "epoch": 1.2340948169982806, + "grad_norm": 4.992262473685122, + "learning_rate": 7.00063435138313e-07, + "loss": 0.1965, + "step": 4710 + }, + { + "epoch": 1.2354048964218456, + "grad_norm": 4.404778717860058, + "learning_rate": 6.979972779446288e-07, + "loss": 0.1772, + "step": 4715 + }, + { + "epoch": 1.2367149758454106, + "grad_norm": 6.471861850114097, + "learning_rate": 6.959325384185916e-07, + "loss": 0.1849, + "step": 4720 + }, + { + "epoch": 1.2380250552689758, + "grad_norm": 4.178248349318392, + "learning_rate": 6.938692262525463e-07, + "loss": 0.1845, + "step": 4725 + }, + { + "epoch": 1.2393351346925408, + "grad_norm": 3.328304867825855, + "learning_rate": 6.918073511321372e-07, + "loss": 0.1609, + "step": 4730 + }, + { + "epoch": 1.2406452141161057, + "grad_norm": 6.825787570815529, + "learning_rate": 6.897469227362626e-07, + "loss": 0.2165, + "step": 4735 + }, + { + "epoch": 1.241955293539671, + "grad_norm": 3.6473987849811573, + "learning_rate": 6.876879507370296e-07, + "loss": 0.1681, + "step": 4740 + }, + { + "epoch": 1.243265372963236, + "grad_norm": 5.7577805362937395, + "learning_rate": 6.856304447997087e-07, + "loss": 0.1393, + "step": 4745 + }, + { + "epoch": 1.244575452386801, + "grad_norm": 2.988983071415241, + "learning_rate": 6.835744145826883e-07, + "loss": 0.1293, + "step": 4750 + }, + { + "epoch": 1.245885531810366, + "grad_norm": 4.094951732166031, + "learning_rate": 6.815198697374295e-07, + "loss": 0.1986, + "step": 4755 + }, + { + "epoch": 1.247195611233931, + "grad_norm": 5.3643705834327555, + "learning_rate": 6.794668199084211e-07, + "loss": 0.1561, + "step": 4760 + }, + { + "epoch": 1.248505690657496, + "grad_norm": 6.488709549888938, + "learning_rate": 6.774152747331327e-07, + "loss": 0.1506, + "step": 4765 + }, + { + "epoch": 1.2498157700810613, + "grad_norm": 8.658130020703828, + "learning_rate": 6.753652438419724e-07, + "loss": 0.1462, + "step": 4770 + }, + { + "epoch": 1.2511258495046262, + "grad_norm": 3.6729089118316143, + "learning_rate": 6.733167368582387e-07, + "loss": 0.1754, + "step": 4775 + }, + { + "epoch": 1.2524359289281912, + "grad_norm": 4.412486038113313, + "learning_rate": 6.71269763398077e-07, + "loss": 0.1524, + "step": 4780 + }, + { + "epoch": 1.2537460083517562, + "grad_norm": 7.05308577872481, + "learning_rate": 6.692243330704345e-07, + "loss": 0.1955, + "step": 4785 + }, + { + "epoch": 1.2550560877753214, + "grad_norm": 3.707641518167412, + "learning_rate": 6.671804554770134e-07, + "loss": 0.1519, + "step": 4790 + }, + { + "epoch": 1.2563661671988864, + "grad_norm": 5.375891710975129, + "learning_rate": 6.651381402122279e-07, + "loss": 0.175, + "step": 4795 + }, + { + "epoch": 1.2576762466224514, + "grad_norm": 5.686007779957575, + "learning_rate": 6.630973968631582e-07, + "loss": 0.1541, + "step": 4800 + }, + { + "epoch": 1.2576762466224514, + "eval_accuracy": 0.7504, + "eval_loss": 1.1022791862487793, + "eval_runtime": 147.8534, + "eval_samples_per_second": 8.454, + "eval_steps_per_second": 2.117, + "step": 4800 + }, + { + "epoch": 1.2589863260460166, + "grad_norm": 4.67723472749942, + "learning_rate": 6.610582350095056e-07, + "loss": 0.1378, + "step": 4805 + }, + { + "epoch": 1.2602964054695815, + "grad_norm": 4.617066533461443, + "learning_rate": 6.590206642235469e-07, + "loss": 0.1512, + "step": 4810 + }, + { + "epoch": 1.2616064848931465, + "grad_norm": 6.057922955780923, + "learning_rate": 6.569846940700905e-07, + "loss": 0.1826, + "step": 4815 + }, + { + "epoch": 1.2629165643167117, + "grad_norm": 6.250001047253521, + "learning_rate": 6.549503341064315e-07, + "loss": 0.1458, + "step": 4820 + }, + { + "epoch": 1.2642266437402767, + "grad_norm": 4.27455152251786, + "learning_rate": 6.529175938823059e-07, + "loss": 0.1333, + "step": 4825 + }, + { + "epoch": 1.2655367231638417, + "grad_norm": 4.878557343525659, + "learning_rate": 6.508864829398464e-07, + "loss": 0.16, + "step": 4830 + }, + { + "epoch": 1.266846802587407, + "grad_norm": 5.406320166959371, + "learning_rate": 6.488570108135375e-07, + "loss": 0.1777, + "step": 4835 + }, + { + "epoch": 1.2681568820109719, + "grad_norm": 2.433179347244675, + "learning_rate": 6.468291870301707e-07, + "loss": 0.1715, + "step": 4840 + }, + { + "epoch": 1.2694669614345369, + "grad_norm": 9.875918507487867, + "learning_rate": 6.448030211087997e-07, + "loss": 0.1599, + "step": 4845 + }, + { + "epoch": 1.270777040858102, + "grad_norm": 6.6713245625990245, + "learning_rate": 6.427785225606961e-07, + "loss": 0.1406, + "step": 4850 + }, + { + "epoch": 1.272087120281667, + "grad_norm": 5.027821637791478, + "learning_rate": 6.40755700889305e-07, + "loss": 0.187, + "step": 4855 + }, + { + "epoch": 1.273397199705232, + "grad_norm": 2.7556993105477527, + "learning_rate": 6.38734565590198e-07, + "loss": 0.159, + "step": 4860 + }, + { + "epoch": 1.2747072791287972, + "grad_norm": 5.364669134362788, + "learning_rate": 6.367151261510324e-07, + "loss": 0.2186, + "step": 4865 + }, + { + "epoch": 1.2760173585523622, + "grad_norm": 3.5785337661276673, + "learning_rate": 6.346973920515039e-07, + "loss": 0.1364, + "step": 4870 + }, + { + "epoch": 1.2773274379759272, + "grad_norm": 5.414858492393773, + "learning_rate": 6.326813727633034e-07, + "loss": 0.1825, + "step": 4875 + }, + { + "epoch": 1.2786375173994924, + "grad_norm": 2.4129442164086536, + "learning_rate": 6.306670777500718e-07, + "loss": 0.1197, + "step": 4880 + }, + { + "epoch": 1.2799475968230574, + "grad_norm": 5.264909047367014, + "learning_rate": 6.286545164673555e-07, + "loss": 0.2254, + "step": 4885 + }, + { + "epoch": 1.2812576762466223, + "grad_norm": 2.0451302251453956, + "learning_rate": 6.26643698362563e-07, + "loss": 0.1347, + "step": 4890 + }, + { + "epoch": 1.2825677556701875, + "grad_norm": 3.7829205343945733, + "learning_rate": 6.246346328749199e-07, + "loss": 0.1552, + "step": 4895 + }, + { + "epoch": 1.2838778350937525, + "grad_norm": 5.594121118174163, + "learning_rate": 6.226273294354247e-07, + "loss": 0.1621, + "step": 4900 + }, + { + "epoch": 1.2838778350937525, + "eval_accuracy": 0.7496, + "eval_loss": 1.1256372928619385, + "eval_runtime": 143.3507, + "eval_samples_per_second": 8.72, + "eval_steps_per_second": 2.183, + "step": 4900 + }, + { + "epoch": 1.2851879145173175, + "grad_norm": 5.282433980211254, + "learning_rate": 6.206217974668034e-07, + "loss": 0.1379, + "step": 4905 + }, + { + "epoch": 1.2864979939408827, + "grad_norm": 3.277253081058706, + "learning_rate": 6.186180463834675e-07, + "loss": 0.1338, + "step": 4910 + }, + { + "epoch": 1.2878080733644477, + "grad_norm": 6.466429706589474, + "learning_rate": 6.166160855914683e-07, + "loss": 0.1542, + "step": 4915 + }, + { + "epoch": 1.2891181527880127, + "grad_norm": 4.3752571537021625, + "learning_rate": 6.146159244884533e-07, + "loss": 0.204, + "step": 4920 + }, + { + "epoch": 1.2904282322115779, + "grad_norm": 9.612960014494304, + "learning_rate": 6.126175724636213e-07, + "loss": 0.1666, + "step": 4925 + }, + { + "epoch": 1.2917383116351429, + "grad_norm": 7.388843981840605, + "learning_rate": 6.106210388976792e-07, + "loss": 0.1676, + "step": 4930 + }, + { + "epoch": 1.2930483910587078, + "grad_norm": 5.525130280779438, + "learning_rate": 6.086263331627975e-07, + "loss": 0.1371, + "step": 4935 + }, + { + "epoch": 1.294358470482273, + "grad_norm": 15.7136065256586, + "learning_rate": 6.066334646225669e-07, + "loss": 0.2647, + "step": 4940 + }, + { + "epoch": 1.295668549905838, + "grad_norm": 4.509787064035042, + "learning_rate": 6.046424426319534e-07, + "loss": 0.186, + "step": 4945 + }, + { + "epoch": 1.296978629329403, + "grad_norm": 3.034726602811886, + "learning_rate": 6.026532765372556e-07, + "loss": 0.1689, + "step": 4950 + }, + { + "epoch": 1.2982887087529682, + "grad_norm": 9.470147103767804, + "learning_rate": 6.006659756760587e-07, + "loss": 0.1738, + "step": 4955 + }, + { + "epoch": 1.2995987881765332, + "grad_norm": 3.723471628172551, + "learning_rate": 5.986805493771933e-07, + "loss": 0.1699, + "step": 4960 + }, + { + "epoch": 1.3009088676000982, + "grad_norm": 2.4954433173901998, + "learning_rate": 5.966970069606905e-07, + "loss": 0.1066, + "step": 4965 + }, + { + "epoch": 1.3022189470236634, + "grad_norm": 3.8824997751755332, + "learning_rate": 5.947153577377372e-07, + "loss": 0.1243, + "step": 4970 + }, + { + "epoch": 1.3035290264472283, + "grad_norm": 7.155179589969273, + "learning_rate": 5.927356110106335e-07, + "loss": 0.1868, + "step": 4975 + }, + { + "epoch": 1.3048391058707933, + "grad_norm": 6.426034228727986, + "learning_rate": 5.907577760727491e-07, + "loss": 0.1749, + "step": 4980 + }, + { + "epoch": 1.3061491852943585, + "grad_norm": 6.259326643585595, + "learning_rate": 5.887818622084792e-07, + "loss": 0.1687, + "step": 4985 + }, + { + "epoch": 1.3074592647179235, + "grad_norm": 7.118570215949986, + "learning_rate": 5.86807878693201e-07, + "loss": 0.1945, + "step": 4990 + }, + { + "epoch": 1.3087693441414885, + "grad_norm": 4.470114328159557, + "learning_rate": 5.848358347932305e-07, + "loss": 0.1279, + "step": 4995 + }, + { + "epoch": 1.3100794235650537, + "grad_norm": 4.518469800001269, + "learning_rate": 5.828657397657775e-07, + "loss": 0.1581, + "step": 5000 + }, + { + "epoch": 1.3100794235650537, + "eval_accuracy": 0.7664, + "eval_loss": 1.0690715312957764, + "eval_runtime": 138.0186, + "eval_samples_per_second": 9.057, + "eval_steps_per_second": 2.268, + "step": 5000 + }, + { + "epoch": 1.3113895029886187, + "grad_norm": 3.118035615186372, + "learning_rate": 5.808976028589052e-07, + "loss": 0.148, + "step": 5005 + }, + { + "epoch": 1.3126995824121837, + "grad_norm": 8.339043807140468, + "learning_rate": 5.789314333114832e-07, + "loss": 0.1599, + "step": 5010 + }, + { + "epoch": 1.3140096618357489, + "grad_norm": 3.979545884384439, + "learning_rate": 5.769672403531476e-07, + "loss": 0.1862, + "step": 5015 + }, + { + "epoch": 1.3153197412593138, + "grad_norm": 5.713823007971235, + "learning_rate": 5.750050332042546e-07, + "loss": 0.1493, + "step": 5020 + }, + { + "epoch": 1.3166298206828788, + "grad_norm": 3.975891070180634, + "learning_rate": 5.730448210758392e-07, + "loss": 0.1615, + "step": 5025 + }, + { + "epoch": 1.317939900106444, + "grad_norm": 9.895767819873718, + "learning_rate": 5.710866131695707e-07, + "loss": 0.1817, + "step": 5030 + }, + { + "epoch": 1.319249979530009, + "grad_norm": 3.9920763618019306, + "learning_rate": 5.691304186777112e-07, + "loss": 0.1139, + "step": 5035 + }, + { + "epoch": 1.320560058953574, + "grad_norm": 4.602626327223196, + "learning_rate": 5.671762467830701e-07, + "loss": 0.1388, + "step": 5040 + }, + { + "epoch": 1.3218701383771392, + "grad_norm": 3.504924250719407, + "learning_rate": 5.652241066589638e-07, + "loss": 0.1349, + "step": 5045 + }, + { + "epoch": 1.3231802178007042, + "grad_norm": 3.9727230916218694, + "learning_rate": 5.6327400746917e-07, + "loss": 0.1308, + "step": 5050 + }, + { + "epoch": 1.3244902972242691, + "grad_norm": 5.966820662181034, + "learning_rate": 5.613259583678855e-07, + "loss": 0.1937, + "step": 5055 + }, + { + "epoch": 1.3258003766478343, + "grad_norm": 4.982426614770313, + "learning_rate": 5.593799684996851e-07, + "loss": 0.0966, + "step": 5060 + }, + { + "epoch": 1.3271104560713993, + "grad_norm": 11.15113796656988, + "learning_rate": 5.574360469994755e-07, + "loss": 0.1868, + "step": 5065 + }, + { + "epoch": 1.3284205354949643, + "grad_norm": 4.032648149967243, + "learning_rate": 5.55494202992455e-07, + "loss": 0.1081, + "step": 5070 + }, + { + "epoch": 1.3297306149185295, + "grad_norm": 13.254457078365599, + "learning_rate": 5.535544455940685e-07, + "loss": 0.198, + "step": 5075 + }, + { + "epoch": 1.3310406943420945, + "grad_norm": 6.857347197358272, + "learning_rate": 5.51616783909968e-07, + "loss": 0.1458, + "step": 5080 + }, + { + "epoch": 1.3323507737656595, + "grad_norm": 4.4329913576651565, + "learning_rate": 5.496812270359651e-07, + "loss": 0.1764, + "step": 5085 + }, + { + "epoch": 1.3336608531892247, + "grad_norm": 1.9756285446680741, + "learning_rate": 5.477477840579941e-07, + "loss": 0.1328, + "step": 5090 + }, + { + "epoch": 1.3349709326127897, + "grad_norm": 16.33866926710285, + "learning_rate": 5.458164640520626e-07, + "loss": 0.1688, + "step": 5095 + }, + { + "epoch": 1.3362810120363546, + "grad_norm": 7.260662473486001, + "learning_rate": 5.438872760842155e-07, + "loss": 0.1475, + "step": 5100 + }, + { + "epoch": 1.3362810120363546, + "eval_accuracy": 0.7632, + "eval_loss": 1.166494607925415, + "eval_runtime": 139.3544, + "eval_samples_per_second": 8.97, + "eval_steps_per_second": 2.246, + "step": 5100 + }, + { + "epoch": 1.3375910914599198, + "grad_norm": 9.098282960735217, + "learning_rate": 5.419602292104877e-07, + "loss": 0.2249, + "step": 5105 + }, + { + "epoch": 1.3389011708834848, + "grad_norm": 14.59920166671128, + "learning_rate": 5.400353324768641e-07, + "loss": 0.2254, + "step": 5110 + }, + { + "epoch": 1.3402112503070498, + "grad_norm": 5.661845688109474, + "learning_rate": 5.381125949192369e-07, + "loss": 0.1491, + "step": 5115 + }, + { + "epoch": 1.341521329730615, + "grad_norm": 4.343788864841793, + "learning_rate": 5.361920255633608e-07, + "loss": 0.1416, + "step": 5120 + }, + { + "epoch": 1.34283140915418, + "grad_norm": 5.455096296340938, + "learning_rate": 5.342736334248142e-07, + "loss": 0.1591, + "step": 5125 + }, + { + "epoch": 1.344141488577745, + "grad_norm": 6.62802098246343, + "learning_rate": 5.323574275089542e-07, + "loss": 0.1631, + "step": 5130 + }, + { + "epoch": 1.3454515680013102, + "grad_norm": 5.475172079252899, + "learning_rate": 5.304434168108768e-07, + "loss": 0.1486, + "step": 5135 + }, + { + "epoch": 1.3467616474248751, + "grad_norm": 3.7823792470345867, + "learning_rate": 5.285316103153703e-07, + "loss": 0.162, + "step": 5140 + }, + { + "epoch": 1.3480717268484401, + "grad_norm": 3.950455595949725, + "learning_rate": 5.266220169968789e-07, + "loss": 0.1386, + "step": 5145 + }, + { + "epoch": 1.3493818062720053, + "grad_norm": 7.064128257879523, + "learning_rate": 5.247146458194558e-07, + "loss": 0.1265, + "step": 5150 + }, + { + "epoch": 1.3506918856955703, + "grad_norm": 3.451315722735635, + "learning_rate": 5.228095057367244e-07, + "loss": 0.1564, + "step": 5155 + }, + { + "epoch": 1.3520019651191353, + "grad_norm": 6.376390408082125, + "learning_rate": 5.209066056918336e-07, + "loss": 0.1408, + "step": 5160 + }, + { + "epoch": 1.3533120445427005, + "grad_norm": 10.020189583400871, + "learning_rate": 5.190059546174173e-07, + "loss": 0.1868, + "step": 5165 + }, + { + "epoch": 1.3546221239662655, + "grad_norm": 10.402422967695797, + "learning_rate": 5.171075614355531e-07, + "loss": 0.1567, + "step": 5170 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 6.325865551750877, + "learning_rate": 5.152114350577183e-07, + "loss": 0.1524, + "step": 5175 + }, + { + "epoch": 1.3572422828133957, + "grad_norm": 9.377120804911332, + "learning_rate": 5.133175843847507e-07, + "loss": 0.2113, + "step": 5180 + }, + { + "epoch": 1.3585523622369606, + "grad_norm": 5.0552106113831226, + "learning_rate": 5.114260183068043e-07, + "loss": 0.1793, + "step": 5185 + }, + { + "epoch": 1.3598624416605256, + "grad_norm": 7.793559280526845, + "learning_rate": 5.095367457033091e-07, + "loss": 0.2107, + "step": 5190 + }, + { + "epoch": 1.3611725210840908, + "grad_norm": 5.256871245122319, + "learning_rate": 5.076497754429286e-07, + "loss": 0.153, + "step": 5195 + }, + { + "epoch": 1.3624826005076558, + "grad_norm": 7.340368996139302, + "learning_rate": 5.0576511638352e-07, + "loss": 0.1562, + "step": 5200 + }, + { + "epoch": 1.3624826005076558, + "eval_accuracy": 0.7648, + "eval_loss": 1.0087817907333374, + "eval_runtime": 139.3617, + "eval_samples_per_second": 8.969, + "eval_steps_per_second": 2.246, + "step": 5200 + }, + { + "epoch": 1.3637926799312208, + "grad_norm": 4.87245584125935, + "learning_rate": 5.03882777372089e-07, + "loss": 0.1542, + "step": 5205 + }, + { + "epoch": 1.365102759354786, + "grad_norm": 4.198926991817569, + "learning_rate": 5.020027672447531e-07, + "loss": 0.1252, + "step": 5210 + }, + { + "epoch": 1.366412838778351, + "grad_norm": 6.952393202633123, + "learning_rate": 5.001250948266953e-07, + "loss": 0.1858, + "step": 5215 + }, + { + "epoch": 1.367722918201916, + "grad_norm": 2.4402795139976936, + "learning_rate": 4.982497689321254e-07, + "loss": 0.139, + "step": 5220 + }, + { + "epoch": 1.3690329976254811, + "grad_norm": 3.952799970755654, + "learning_rate": 4.963767983642391e-07, + "loss": 0.1942, + "step": 5225 + }, + { + "epoch": 1.3703430770490461, + "grad_norm": 3.1488846024801855, + "learning_rate": 4.945061919151748e-07, + "loss": 0.1268, + "step": 5230 + }, + { + "epoch": 1.371653156472611, + "grad_norm": 3.8295556788809533, + "learning_rate": 4.926379583659732e-07, + "loss": 0.1492, + "step": 5235 + }, + { + "epoch": 1.3729632358961763, + "grad_norm": 5.807740283147101, + "learning_rate": 4.907721064865358e-07, + "loss": 0.1764, + "step": 5240 + }, + { + "epoch": 1.3742733153197413, + "grad_norm": 6.049197583837082, + "learning_rate": 4.889086450355853e-07, + "loss": 0.1335, + "step": 5245 + }, + { + "epoch": 1.3755833947433063, + "grad_norm": 5.872325479624525, + "learning_rate": 4.870475827606218e-07, + "loss": 0.1875, + "step": 5250 + }, + { + "epoch": 1.3768934741668715, + "grad_norm": 7.484940602815659, + "learning_rate": 4.851889283978841e-07, + "loss": 0.2242, + "step": 5255 + }, + { + "epoch": 1.3782035535904364, + "grad_norm": 4.44039134698427, + "learning_rate": 4.833326906723071e-07, + "loss": 0.1884, + "step": 5260 + }, + { + "epoch": 1.3795136330140014, + "grad_norm": 2.9457269887282584, + "learning_rate": 4.814788782974814e-07, + "loss": 0.1575, + "step": 5265 + }, + { + "epoch": 1.3808237124375666, + "grad_norm": 4.142755818231746, + "learning_rate": 4.796274999756134e-07, + "loss": 0.1503, + "step": 5270 + }, + { + "epoch": 1.3821337918611316, + "grad_norm": 4.231068410579535, + "learning_rate": 4.777785643974822e-07, + "loss": 0.1296, + "step": 5275 + }, + { + "epoch": 1.3834438712846966, + "grad_norm": 7.276995612922505, + "learning_rate": 4.7593208024240196e-07, + "loss": 0.1793, + "step": 5280 + }, + { + "epoch": 1.3847539507082618, + "grad_norm": 2.8458411818811973, + "learning_rate": 4.740880561781766e-07, + "loss": 0.097, + "step": 5285 + }, + { + "epoch": 1.3860640301318268, + "grad_norm": 9.313608656005794, + "learning_rate": 4.7224650086106444e-07, + "loss": 0.1973, + "step": 5290 + }, + { + "epoch": 1.3873741095553918, + "grad_norm": 8.399153217159018, + "learning_rate": 4.7040742293573334e-07, + "loss": 0.1789, + "step": 5295 + }, + { + "epoch": 1.388684188978957, + "grad_norm": 5.871107449455294, + "learning_rate": 4.6857083103522277e-07, + "loss": 0.1899, + "step": 5300 + }, + { + "epoch": 1.388684188978957, + "eval_accuracy": 0.7624, + "eval_loss": 1.0121264457702637, + "eval_runtime": 139.5539, + "eval_samples_per_second": 8.957, + "eval_steps_per_second": 2.243, + "step": 5300 + }, + { + "epoch": 1.389994268402522, + "grad_norm": 3.8156581594965684, + "learning_rate": 4.667367337809016e-07, + "loss": 0.1204, + "step": 5305 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 4.985992975846861, + "learning_rate": 4.6490513978242804e-07, + "loss": 0.1319, + "step": 5310 + }, + { + "epoch": 1.3926144272496521, + "grad_norm": 5.076467294818805, + "learning_rate": 4.6307605763771076e-07, + "loss": 0.1684, + "step": 5315 + }, + { + "epoch": 1.393924506673217, + "grad_norm": 5.593144516901242, + "learning_rate": 4.6124949593286523e-07, + "loss": 0.2016, + "step": 5320 + }, + { + "epoch": 1.395234586096782, + "grad_norm": 6.386164042680592, + "learning_rate": 4.5942546324217803e-07, + "loss": 0.1468, + "step": 5325 + }, + { + "epoch": 1.3965446655203473, + "grad_norm": 6.108862203835904, + "learning_rate": 4.576039681280608e-07, + "loss": 0.1441, + "step": 5330 + }, + { + "epoch": 1.3978547449439123, + "grad_norm": 3.9244812319321274, + "learning_rate": 4.557850191410161e-07, + "loss": 0.1768, + "step": 5335 + }, + { + "epoch": 1.3991648243674772, + "grad_norm": 4.15439436004101, + "learning_rate": 4.5396862481959243e-07, + "loss": 0.1338, + "step": 5340 + }, + { + "epoch": 1.4004749037910424, + "grad_norm": 5.897274197879647, + "learning_rate": 4.521547936903477e-07, + "loss": 0.1798, + "step": 5345 + }, + { + "epoch": 1.4017849832146074, + "grad_norm": 6.090071523418443, + "learning_rate": 4.5034353426780657e-07, + "loss": 0.1729, + "step": 5350 + }, + { + "epoch": 1.4030950626381724, + "grad_norm": 6.853061331028334, + "learning_rate": 4.4853485505442133e-07, + "loss": 0.1445, + "step": 5355 + }, + { + "epoch": 1.4044051420617376, + "grad_norm": 4.835112562982259, + "learning_rate": 4.4672876454053354e-07, + "loss": 0.1255, + "step": 5360 + }, + { + "epoch": 1.4057152214853026, + "grad_norm": 3.3603532935805713, + "learning_rate": 4.449252712043311e-07, + "loss": 0.1178, + "step": 5365 + }, + { + "epoch": 1.4070253009088676, + "grad_norm": 5.621068344363656, + "learning_rate": 4.431243835118124e-07, + "loss": 0.1521, + "step": 5370 + }, + { + "epoch": 1.4083353803324328, + "grad_norm": 6.177956693196569, + "learning_rate": 4.4132610991674123e-07, + "loss": 0.2011, + "step": 5375 + }, + { + "epoch": 1.4096454597559978, + "grad_norm": 3.8093768021526095, + "learning_rate": 4.3953045886061336e-07, + "loss": 0.1414, + "step": 5380 + }, + { + "epoch": 1.4109555391795627, + "grad_norm": 8.73859339392293, + "learning_rate": 4.377374387726116e-07, + "loss": 0.2335, + "step": 5385 + }, + { + "epoch": 1.412265618603128, + "grad_norm": 7.4975778108062965, + "learning_rate": 4.359470580695701e-07, + "loss": 0.1395, + "step": 5390 + }, + { + "epoch": 1.413575698026693, + "grad_norm": 8.260550346451351, + "learning_rate": 4.341593251559319e-07, + "loss": 0.1615, + "step": 5395 + }, + { + "epoch": 1.414885777450258, + "grad_norm": 3.552748594627843, + "learning_rate": 4.323742484237107e-07, + "loss": 0.1378, + "step": 5400 + }, + { + "epoch": 1.414885777450258, + "eval_accuracy": 0.7656, + "eval_loss": 1.022687554359436, + "eval_runtime": 142.9636, + "eval_samples_per_second": 8.743, + "eval_steps_per_second": 2.189, + "step": 5400 + }, + { + "epoch": 1.416195856873823, + "grad_norm": 3.658068549302559, + "learning_rate": 4.3059183625245275e-07, + "loss": 0.1878, + "step": 5405 + }, + { + "epoch": 1.417505936297388, + "grad_norm": 12.81739721678878, + "learning_rate": 4.288120970091947e-07, + "loss": 0.2519, + "step": 5410 + }, + { + "epoch": 1.418816015720953, + "grad_norm": 6.806589788632898, + "learning_rate": 4.270350390484274e-07, + "loss": 0.1387, + "step": 5415 + }, + { + "epoch": 1.4201260951445183, + "grad_norm": 3.558959244970237, + "learning_rate": 4.2526067071205394e-07, + "loss": 0.1574, + "step": 5420 + }, + { + "epoch": 1.4214361745680832, + "grad_norm": 4.1991124986912665, + "learning_rate": 4.234890003293522e-07, + "loss": 0.1533, + "step": 5425 + }, + { + "epoch": 1.4227462539916482, + "grad_norm": 5.57307236404189, + "learning_rate": 4.2172003621693495e-07, + "loss": 0.1435, + "step": 5430 + }, + { + "epoch": 1.4240563334152134, + "grad_norm": 9.98468287385797, + "learning_rate": 4.1995378667871206e-07, + "loss": 0.1221, + "step": 5435 + }, + { + "epoch": 1.4253664128387784, + "grad_norm": 4.021122095372401, + "learning_rate": 4.1819026000584935e-07, + "loss": 0.1356, + "step": 5440 + }, + { + "epoch": 1.4266764922623434, + "grad_norm": 2.539120750400961, + "learning_rate": 4.164294644767321e-07, + "loss": 0.1386, + "step": 5445 + }, + { + "epoch": 1.4279865716859086, + "grad_norm": 4.900068180108013, + "learning_rate": 4.1467140835692403e-07, + "loss": 0.1509, + "step": 5450 + }, + { + "epoch": 1.4292966511094736, + "grad_norm": 8.043017914794532, + "learning_rate": 4.1291609989912955e-07, + "loss": 0.1282, + "step": 5455 + }, + { + "epoch": 1.4306067305330386, + "grad_norm": 4.392158250344916, + "learning_rate": 4.1116354734315596e-07, + "loss": 0.1136, + "step": 5460 + }, + { + "epoch": 1.4319168099566038, + "grad_norm": 4.14190663170065, + "learning_rate": 4.0941375891587273e-07, + "loss": 0.1398, + "step": 5465 + }, + { + "epoch": 1.4332268893801687, + "grad_norm": 8.877219117463126, + "learning_rate": 4.076667428311739e-07, + "loss": 0.1529, + "step": 5470 + }, + { + "epoch": 1.4345369688037337, + "grad_norm": 10.951781738101637, + "learning_rate": 4.059225072899397e-07, + "loss": 0.1794, + "step": 5475 + }, + { + "epoch": 1.435847048227299, + "grad_norm": 3.5884378317825982, + "learning_rate": 4.041810604799986e-07, + "loss": 0.1287, + "step": 5480 + }, + { + "epoch": 1.437157127650864, + "grad_norm": 7.646485652411928, + "learning_rate": 4.0244241057608675e-07, + "loss": 0.1526, + "step": 5485 + }, + { + "epoch": 1.4384672070744289, + "grad_norm": 4.095820215840846, + "learning_rate": 4.0070656573981263e-07, + "loss": 0.1629, + "step": 5490 + }, + { + "epoch": 1.4397772864979939, + "grad_norm": 4.619960063809711, + "learning_rate": 3.9897353411961576e-07, + "loss": 0.1631, + "step": 5495 + }, + { + "epoch": 1.441087365921559, + "grad_norm": 7.65793128091754, + "learning_rate": 3.9724332385073e-07, + "loss": 0.1684, + "step": 5500 + }, + { + "epoch": 1.441087365921559, + "eval_accuracy": 0.7616, + "eval_loss": 1.1524358987808228, + "eval_runtime": 140.9725, + "eval_samples_per_second": 8.867, + "eval_steps_per_second": 2.22, + "step": 5500 + }, + { + "epoch": 1.442397445345124, + "grad_norm": 9.53033405655631, + "learning_rate": 3.955159430551462e-07, + "loss": 0.1856, + "step": 5505 + }, + { + "epoch": 1.443707524768689, + "grad_norm": 6.635703636489727, + "learning_rate": 3.937913998415716e-07, + "loss": 0.1173, + "step": 5510 + }, + { + "epoch": 1.4450176041922542, + "grad_norm": 2.8008935096487444, + "learning_rate": 3.9206970230539484e-07, + "loss": 0.1407, + "step": 5515 + }, + { + "epoch": 1.4463276836158192, + "grad_norm": 7.567404117719152, + "learning_rate": 3.90350858528644e-07, + "loss": 0.1339, + "step": 5520 + }, + { + "epoch": 1.4476377630393842, + "grad_norm": 5.684176798537522, + "learning_rate": 3.886348765799535e-07, + "loss": 0.1448, + "step": 5525 + }, + { + "epoch": 1.4489478424629494, + "grad_norm": 7.48394265324762, + "learning_rate": 3.8692176451452187e-07, + "loss": 0.1873, + "step": 5530 + }, + { + "epoch": 1.4502579218865144, + "grad_norm": 4.704500306984347, + "learning_rate": 3.852115303740775e-07, + "loss": 0.1384, + "step": 5535 + }, + { + "epoch": 1.4515680013100793, + "grad_norm": 7.782133396015602, + "learning_rate": 3.8350418218683656e-07, + "loss": 0.1678, + "step": 5540 + }, + { + "epoch": 1.4528780807336446, + "grad_norm": 7.950318860217571, + "learning_rate": 3.817997279674707e-07, + "loss": 0.1491, + "step": 5545 + }, + { + "epoch": 1.4541881601572095, + "grad_norm": 4.885181041366676, + "learning_rate": 3.800981757170647e-07, + "loss": 0.1333, + "step": 5550 + }, + { + "epoch": 1.4554982395807745, + "grad_norm": 4.5099336389227, + "learning_rate": 3.7839953342308195e-07, + "loss": 0.1649, + "step": 5555 + }, + { + "epoch": 1.4568083190043397, + "grad_norm": 1.3392143606297127, + "learning_rate": 3.767038090593262e-07, + "loss": 0.1196, + "step": 5560 + }, + { + "epoch": 1.4581183984279047, + "grad_norm": 6.801138336879372, + "learning_rate": 3.7501101058590156e-07, + "loss": 0.1303, + "step": 5565 + }, + { + "epoch": 1.4594284778514697, + "grad_norm": 6.231994683303187, + "learning_rate": 3.733211459491802e-07, + "loss": 0.1275, + "step": 5570 + }, + { + "epoch": 1.4607385572750349, + "grad_norm": 7.481895592566931, + "learning_rate": 3.716342230817598e-07, + "loss": 0.1563, + "step": 5575 + }, + { + "epoch": 1.4620486366985999, + "grad_norm": 5.294171079568994, + "learning_rate": 3.6995024990243097e-07, + "loss": 0.1615, + "step": 5580 + }, + { + "epoch": 1.4633587161221648, + "grad_norm": 4.889827410342588, + "learning_rate": 3.682692343161361e-07, + "loss": 0.1409, + "step": 5585 + }, + { + "epoch": 1.46466879554573, + "grad_norm": 4.755475192052327, + "learning_rate": 3.6659118421393454e-07, + "loss": 0.2151, + "step": 5590 + }, + { + "epoch": 1.465978874969295, + "grad_norm": 8.146609131839343, + "learning_rate": 3.6491610747296464e-07, + "loss": 0.167, + "step": 5595 + }, + { + "epoch": 1.46728895439286, + "grad_norm": 11.571129826043965, + "learning_rate": 3.632440119564084e-07, + "loss": 0.1526, + "step": 5600 + }, + { + "epoch": 1.46728895439286, + "eval_accuracy": 0.7632, + "eval_loss": 1.1522161960601807, + "eval_runtime": 140.4528, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 2.229, + "step": 5600 + }, + { + "epoch": 1.4685990338164252, + "grad_norm": 13.388610986617053, + "learning_rate": 3.615749055134516e-07, + "loss": 0.1434, + "step": 5605 + }, + { + "epoch": 1.4699091132399902, + "grad_norm": 4.611773903177833, + "learning_rate": 3.5990879597925015e-07, + "loss": 0.1593, + "step": 5610 + }, + { + "epoch": 1.4712191926635552, + "grad_norm": 5.209036065178781, + "learning_rate": 3.5824569117489087e-07, + "loss": 0.1589, + "step": 5615 + }, + { + "epoch": 1.4725292720871201, + "grad_norm": 5.985898828281977, + "learning_rate": 3.565855989073555e-07, + "loss": 0.2083, + "step": 5620 + }, + { + "epoch": 1.4738393515106853, + "grad_norm": 7.4907701541903355, + "learning_rate": 3.549285269694855e-07, + "loss": 0.2042, + "step": 5625 + }, + { + "epoch": 1.4751494309342503, + "grad_norm": 8.271025659121081, + "learning_rate": 3.53274483139943e-07, + "loss": 0.1482, + "step": 5630 + }, + { + "epoch": 1.4764595103578153, + "grad_norm": 8.918810219484554, + "learning_rate": 3.5162347518317614e-07, + "loss": 0.155, + "step": 5635 + }, + { + "epoch": 1.4777695897813805, + "grad_norm": 6.395860091750389, + "learning_rate": 3.499755108493814e-07, + "loss": 0.1675, + "step": 5640 + }, + { + "epoch": 1.4790796692049455, + "grad_norm": 4.314058968165375, + "learning_rate": 3.483305978744688e-07, + "loss": 0.1404, + "step": 5645 + }, + { + "epoch": 1.4803897486285105, + "grad_norm": 4.359611398643348, + "learning_rate": 3.4668874398002367e-07, + "loss": 0.1973, + "step": 5650 + }, + { + "epoch": 1.4816998280520757, + "grad_norm": 4.284018420029098, + "learning_rate": 3.450499568732722e-07, + "loss": 0.1673, + "step": 5655 + }, + { + "epoch": 1.4830099074756407, + "grad_norm": 11.418803588749793, + "learning_rate": 3.434142442470437e-07, + "loss": 0.1604, + "step": 5660 + }, + { + "epoch": 1.4843199868992056, + "grad_norm": 11.223038201848048, + "learning_rate": 3.41781613779735e-07, + "loss": 0.1685, + "step": 5665 + }, + { + "epoch": 1.4856300663227708, + "grad_norm": 2.587612812317313, + "learning_rate": 3.401520731352758e-07, + "loss": 0.136, + "step": 5670 + }, + { + "epoch": 1.4869401457463358, + "grad_norm": 3.3606721211871826, + "learning_rate": 3.385256299630901e-07, + "loss": 0.1451, + "step": 5675 + }, + { + "epoch": 1.4882502251699008, + "grad_norm": 4.563542445144167, + "learning_rate": 3.36902291898063e-07, + "loss": 0.1518, + "step": 5680 + }, + { + "epoch": 1.489560304593466, + "grad_norm": 4.828685381151716, + "learning_rate": 3.352820665605016e-07, + "loss": 0.1545, + "step": 5685 + }, + { + "epoch": 1.490870384017031, + "grad_norm": 6.374748369935473, + "learning_rate": 3.336649615561035e-07, + "loss": 0.1404, + "step": 5690 + }, + { + "epoch": 1.492180463440596, + "grad_norm": 6.1615180099066285, + "learning_rate": 3.320509844759168e-07, + "loss": 0.1522, + "step": 5695 + }, + { + "epoch": 1.4934905428641612, + "grad_norm": 3.212620440024653, + "learning_rate": 3.3044014289630827e-07, + "loss": 0.1852, + "step": 5700 + }, + { + "epoch": 1.4934905428641612, + "eval_accuracy": 0.7648, + "eval_loss": 1.1015968322753906, + "eval_runtime": 139.045, + "eval_samples_per_second": 8.99, + "eval_steps_per_second": 2.251, + "step": 5700 + }, + { + "epoch": 1.4948006222877261, + "grad_norm": 9.244280944426595, + "learning_rate": 3.288324443789243e-07, + "loss": 0.173, + "step": 5705 + }, + { + "epoch": 1.4961107017112911, + "grad_norm": 7.103864592003495, + "learning_rate": 3.272278964706575e-07, + "loss": 0.1468, + "step": 5710 + }, + { + "epoch": 1.4974207811348563, + "grad_norm": 9.814370133917347, + "learning_rate": 3.256265067036118e-07, + "loss": 0.2144, + "step": 5715 + }, + { + "epoch": 1.4987308605584213, + "grad_norm": 3.41872463628258, + "learning_rate": 3.2402828259506445e-07, + "loss": 0.1161, + "step": 5720 + }, + { + "epoch": 1.5000409399819863, + "grad_norm": 3.982224660417661, + "learning_rate": 3.2243323164743453e-07, + "loss": 0.1338, + "step": 5725 + }, + { + "epoch": 1.5013510194055515, + "grad_norm": 3.5236382432537052, + "learning_rate": 3.208413613482429e-07, + "loss": 0.1216, + "step": 5730 + }, + { + "epoch": 1.5026610988291165, + "grad_norm": 5.361802539263841, + "learning_rate": 3.1925267917008224e-07, + "loss": 0.1533, + "step": 5735 + }, + { + "epoch": 1.5039711782526815, + "grad_norm": 2.839792538080114, + "learning_rate": 3.1766719257057785e-07, + "loss": 0.1389, + "step": 5740 + }, + { + "epoch": 1.5052812576762467, + "grad_norm": 8.32225450006049, + "learning_rate": 3.160849089923555e-07, + "loss": 0.1513, + "step": 5745 + }, + { + "epoch": 1.5065913370998116, + "grad_norm": 7.675934502719418, + "learning_rate": 3.145058358630043e-07, + "loss": 0.1482, + "step": 5750 + }, + { + "epoch": 1.5079014165233766, + "grad_norm": 4.898667455406003, + "learning_rate": 3.1292998059504294e-07, + "loss": 0.1264, + "step": 5755 + }, + { + "epoch": 1.5092114959469418, + "grad_norm": 7.366536986513522, + "learning_rate": 3.113573505858855e-07, + "loss": 0.1777, + "step": 5760 + }, + { + "epoch": 1.5105215753705068, + "grad_norm": 6.704445471281222, + "learning_rate": 3.0978795321780506e-07, + "loss": 0.1492, + "step": 5765 + }, + { + "epoch": 1.5118316547940718, + "grad_norm": 7.187985717646523, + "learning_rate": 3.0822179585790063e-07, + "loss": 0.1358, + "step": 5770 + }, + { + "epoch": 1.513141734217637, + "grad_norm": 8.290411615555337, + "learning_rate": 3.0665888585806163e-07, + "loss": 0.2399, + "step": 5775 + }, + { + "epoch": 1.514451813641202, + "grad_norm": 4.958261863176042, + "learning_rate": 3.050992305549335e-07, + "loss": 0.1241, + "step": 5780 + }, + { + "epoch": 1.515761893064767, + "grad_norm": 5.902391740892916, + "learning_rate": 3.035428372698833e-07, + "loss": 0.1296, + "step": 5785 + }, + { + "epoch": 1.5170719724883321, + "grad_norm": 4.285558089259955, + "learning_rate": 3.0198971330896637e-07, + "loss": 0.183, + "step": 5790 + }, + { + "epoch": 1.5183820519118971, + "grad_norm": 5.273051514451481, + "learning_rate": 3.0043986596289027e-07, + "loss": 0.1311, + "step": 5795 + }, + { + "epoch": 1.519692131335462, + "grad_norm": 6.51925311343804, + "learning_rate": 2.988933025069811e-07, + "loss": 0.1358, + "step": 5800 + }, + { + "epoch": 1.519692131335462, + "eval_accuracy": 0.7624, + "eval_loss": 1.1159664392471313, + "eval_runtime": 141.0917, + "eval_samples_per_second": 8.859, + "eval_steps_per_second": 2.218, + "step": 5800 + }, + { + "epoch": 1.5210022107590273, + "grad_norm": 7.383268111667339, + "learning_rate": 2.973500302011509e-07, + "loss": 0.147, + "step": 5805 + }, + { + "epoch": 1.5223122901825923, + "grad_norm": 3.8912007914030182, + "learning_rate": 2.958100562898609e-07, + "loss": 0.1089, + "step": 5810 + }, + { + "epoch": 1.5236223696061573, + "grad_norm": 8.712734187230408, + "learning_rate": 2.9427338800209033e-07, + "loss": 0.2046, + "step": 5815 + }, + { + "epoch": 1.5249324490297225, + "grad_norm": 3.405653489283435, + "learning_rate": 2.927400325513001e-07, + "loss": 0.1524, + "step": 5820 + }, + { + "epoch": 1.5262425284532875, + "grad_norm": 9.562161269374949, + "learning_rate": 2.912099971354002e-07, + "loss": 0.1311, + "step": 5825 + }, + { + "epoch": 1.5275526078768524, + "grad_norm": 4.4220037412099495, + "learning_rate": 2.896832889367151e-07, + "loss": 0.1844, + "step": 5830 + }, + { + "epoch": 1.5288626873004176, + "grad_norm": 8.764506736183701, + "learning_rate": 2.8815991512195217e-07, + "loss": 0.1857, + "step": 5835 + }, + { + "epoch": 1.5301727667239826, + "grad_norm": 5.111081703573647, + "learning_rate": 2.8663988284216444e-07, + "loss": 0.1286, + "step": 5840 + }, + { + "epoch": 1.5314828461475476, + "grad_norm": 4.293980562734722, + "learning_rate": 2.851231992327208e-07, + "loss": 0.1363, + "step": 5845 + }, + { + "epoch": 1.5327929255711128, + "grad_norm": 8.711462310411664, + "learning_rate": 2.8360987141326954e-07, + "loss": 0.1606, + "step": 5850 + }, + { + "epoch": 1.5341030049946778, + "grad_norm": 8.990881900478213, + "learning_rate": 2.820999064877062e-07, + "loss": 0.2247, + "step": 5855 + }, + { + "epoch": 1.5354130844182428, + "grad_norm": 9.192897104652108, + "learning_rate": 2.805933115441412e-07, + "loss": 0.1715, + "step": 5860 + }, + { + "epoch": 1.536723163841808, + "grad_norm": 3.8288642962534567, + "learning_rate": 2.790900936548646e-07, + "loss": 0.1652, + "step": 5865 + }, + { + "epoch": 1.538033243265373, + "grad_norm": 12.038766998437053, + "learning_rate": 2.775902598763137e-07, + "loss": 0.2038, + "step": 5870 + }, + { + "epoch": 1.539343322688938, + "grad_norm": 4.827284945086436, + "learning_rate": 2.7609381724904024e-07, + "loss": 0.1901, + "step": 5875 + }, + { + "epoch": 1.5406534021125031, + "grad_norm": 4.331535469429681, + "learning_rate": 2.746007727976779e-07, + "loss": 0.1754, + "step": 5880 + }, + { + "epoch": 1.541963481536068, + "grad_norm": 1.900735020734344, + "learning_rate": 2.731111335309072e-07, + "loss": 0.1058, + "step": 5885 + }, + { + "epoch": 1.543273560959633, + "grad_norm": 2.961654173117568, + "learning_rate": 2.7162490644142545e-07, + "loss": 0.1598, + "step": 5890 + }, + { + "epoch": 1.5445836403831983, + "grad_norm": 10.176050102064952, + "learning_rate": 2.701420985059112e-07, + "loss": 0.1775, + "step": 5895 + }, + { + "epoch": 1.5458937198067633, + "grad_norm": 8.432950736971934, + "learning_rate": 2.686627166849931e-07, + "loss": 0.1664, + "step": 5900 + }, + { + "epoch": 1.5458937198067633, + "eval_accuracy": 0.7504, + "eval_loss": 1.094858169555664, + "eval_runtime": 139.504, + "eval_samples_per_second": 8.96, + "eval_steps_per_second": 2.244, + "step": 5900 + }, + { + "epoch": 1.5472037992303282, + "grad_norm": 3.4036278165694545, + "learning_rate": 2.671867679232175e-07, + "loss": 0.1702, + "step": 5905 + }, + { + "epoch": 1.5485138786538934, + "grad_norm": 2.46431216575455, + "learning_rate": 2.65714259149014e-07, + "loss": 0.2071, + "step": 5910 + }, + { + "epoch": 1.5498239580774584, + "grad_norm": 4.676488316765192, + "learning_rate": 2.64245197274666e-07, + "loss": 0.1668, + "step": 5915 + }, + { + "epoch": 1.5511340375010234, + "grad_norm": 2.776658179276032, + "learning_rate": 2.6277958919627386e-07, + "loss": 0.173, + "step": 5920 + }, + { + "epoch": 1.5524441169245886, + "grad_norm": 2.9881895057457424, + "learning_rate": 2.6131744179372725e-07, + "loss": 0.166, + "step": 5925 + }, + { + "epoch": 1.5537541963481536, + "grad_norm": 5.351569952228676, + "learning_rate": 2.5985876193066925e-07, + "loss": 0.1378, + "step": 5930 + }, + { + "epoch": 1.5550642757717186, + "grad_norm": 7.287593146087938, + "learning_rate": 2.5840355645446687e-07, + "loss": 0.18, + "step": 5935 + }, + { + "epoch": 1.5563743551952838, + "grad_norm": 5.162933527380095, + "learning_rate": 2.5695183219617644e-07, + "loss": 0.2142, + "step": 5940 + }, + { + "epoch": 1.5576844346188488, + "grad_norm": 4.473626526697219, + "learning_rate": 2.555035959705127e-07, + "loss": 0.1674, + "step": 5945 + }, + { + "epoch": 1.5589945140424137, + "grad_norm": 5.821815812779585, + "learning_rate": 2.540588545758179e-07, + "loss": 0.1876, + "step": 5950 + }, + { + "epoch": 1.560304593465979, + "grad_norm": 4.101891407721645, + "learning_rate": 2.5261761479402734e-07, + "loss": 0.1597, + "step": 5955 + }, + { + "epoch": 1.561614672889544, + "grad_norm": 15.605199269628502, + "learning_rate": 2.5117988339064053e-07, + "loss": 0.1509, + "step": 5960 + }, + { + "epoch": 1.562924752313109, + "grad_norm": 7.523818440166435, + "learning_rate": 2.49745667114686e-07, + "loss": 0.1557, + "step": 5965 + }, + { + "epoch": 1.564234831736674, + "grad_norm": 4.751911403623371, + "learning_rate": 2.483149726986934e-07, + "loss": 0.1609, + "step": 5970 + }, + { + "epoch": 1.565544911160239, + "grad_norm": 10.300208770198049, + "learning_rate": 2.468878068586583e-07, + "loss": 0.2317, + "step": 5975 + }, + { + "epoch": 1.566854990583804, + "grad_norm": 6.418645648934525, + "learning_rate": 2.4546417629401396e-07, + "loss": 0.1699, + "step": 5980 + }, + { + "epoch": 1.5681650700073693, + "grad_norm": 6.411061114094063, + "learning_rate": 2.440440876875971e-07, + "loss": 0.1921, + "step": 5985 + }, + { + "epoch": 1.5694751494309342, + "grad_norm": 7.188743967752872, + "learning_rate": 2.4262754770561777e-07, + "loss": 0.1515, + "step": 5990 + }, + { + "epoch": 1.5707852288544992, + "grad_norm": 5.309747838189995, + "learning_rate": 2.412145629976289e-07, + "loss": 0.1801, + "step": 5995 + }, + { + "epoch": 1.5720953082780644, + "grad_norm": 4.00510825001943, + "learning_rate": 2.39805140196493e-07, + "loss": 0.1731, + "step": 6000 + }, + { + "epoch": 1.5720953082780644, + "eval_accuracy": 0.76, + "eval_loss": 1.0234113931655884, + "eval_runtime": 139.1959, + "eval_samples_per_second": 8.98, + "eval_steps_per_second": 2.249, + "step": 6000 + }, + { + "epoch": 1.5734053877016294, + "grad_norm": 9.59492476008665, + "learning_rate": 2.3839928591835335e-07, + "loss": 0.1686, + "step": 6005 + }, + { + "epoch": 1.5747154671251944, + "grad_norm": 3.765377816124392, + "learning_rate": 2.3699700676260092e-07, + "loss": 0.1494, + "step": 6010 + }, + { + "epoch": 1.5760255465487596, + "grad_norm": 5.338406619519301, + "learning_rate": 2.3559830931184455e-07, + "loss": 0.1467, + "step": 6015 + }, + { + "epoch": 1.5773356259723246, + "grad_norm": 6.763713809000965, + "learning_rate": 2.3420320013187954e-07, + "loss": 0.1898, + "step": 6020 + }, + { + "epoch": 1.5786457053958896, + "grad_norm": 5.481130008134444, + "learning_rate": 2.328116857716579e-07, + "loss": 0.1548, + "step": 6025 + }, + { + "epoch": 1.5799557848194548, + "grad_norm": 3.557311963677506, + "learning_rate": 2.3142377276325563e-07, + "loss": 0.1443, + "step": 6030 + }, + { + "epoch": 1.5812658642430197, + "grad_norm": 5.246282698025739, + "learning_rate": 2.30039467621844e-07, + "loss": 0.1982, + "step": 6035 + }, + { + "epoch": 1.5825759436665847, + "grad_norm": 7.849841735411486, + "learning_rate": 2.286587768456575e-07, + "loss": 0.1785, + "step": 6040 + }, + { + "epoch": 1.58388602309015, + "grad_norm": 3.056662932671019, + "learning_rate": 2.272817069159647e-07, + "loss": 0.1452, + "step": 6045 + }, + { + "epoch": 1.585196102513715, + "grad_norm": 7.441075760718876, + "learning_rate": 2.2590826429703647e-07, + "loss": 0.1358, + "step": 6050 + }, + { + "epoch": 1.5865061819372799, + "grad_norm": 4.349747044723176, + "learning_rate": 2.2453845543611705e-07, + "loss": 0.126, + "step": 6055 + }, + { + "epoch": 1.587816261360845, + "grad_norm": 7.685604617278789, + "learning_rate": 2.2317228676339216e-07, + "loss": 0.1422, + "step": 6060 + }, + { + "epoch": 1.58912634078441, + "grad_norm": 5.9349721355409795, + "learning_rate": 2.218097646919599e-07, + "loss": 0.1861, + "step": 6065 + }, + { + "epoch": 1.590436420207975, + "grad_norm": 5.891145802679616, + "learning_rate": 2.2045089561780107e-07, + "loss": 0.1431, + "step": 6070 + }, + { + "epoch": 1.5917464996315402, + "grad_norm": 2.287356113721854, + "learning_rate": 2.1909568591974748e-07, + "loss": 0.1288, + "step": 6075 + }, + { + "epoch": 1.5930565790551052, + "grad_norm": 7.38109989814088, + "learning_rate": 2.1774414195945423e-07, + "loss": 0.1277, + "step": 6080 + }, + { + "epoch": 1.5943666584786702, + "grad_norm": 3.893434900862315, + "learning_rate": 2.1639627008136697e-07, + "loss": 0.1412, + "step": 6085 + }, + { + "epoch": 1.5956767379022354, + "grad_norm": 3.7517744969526183, + "learning_rate": 2.1505207661269554e-07, + "loss": 0.131, + "step": 6090 + }, + { + "epoch": 1.5969868173258004, + "grad_norm": 5.213568999899694, + "learning_rate": 2.1371156786338107e-07, + "loss": 0.1493, + "step": 6095 + }, + { + "epoch": 1.5982968967493654, + "grad_norm": 7.30638497105969, + "learning_rate": 2.123747501260691e-07, + "loss": 0.1427, + "step": 6100 + }, + { + "epoch": 1.5982968967493654, + "eval_accuracy": 0.7576, + "eval_loss": 1.1005467176437378, + "eval_runtime": 138.3031, + "eval_samples_per_second": 9.038, + "eval_steps_per_second": 2.263, + "step": 6100 + }, + { + "epoch": 1.5996069761729306, + "grad_norm": 4.455588136802878, + "learning_rate": 2.1104162967607774e-07, + "loss": 0.1667, + "step": 6105 + }, + { + "epoch": 1.6009170555964956, + "grad_norm": 5.978054630784655, + "learning_rate": 2.0971221277136942e-07, + "loss": 0.1548, + "step": 6110 + }, + { + "epoch": 1.6022271350200605, + "grad_norm": 6.935845367832056, + "learning_rate": 2.083865056525218e-07, + "loss": 0.1465, + "step": 6115 + }, + { + "epoch": 1.6035372144436257, + "grad_norm": 9.625156258887605, + "learning_rate": 2.0706451454269723e-07, + "loss": 0.1988, + "step": 6120 + }, + { + "epoch": 1.6048472938671907, + "grad_norm": 8.66757452370429, + "learning_rate": 2.0574624564761557e-07, + "loss": 0.1249, + "step": 6125 + }, + { + "epoch": 1.6061573732907557, + "grad_norm": 6.280401831803158, + "learning_rate": 2.0443170515552166e-07, + "loss": 0.1398, + "step": 6130 + }, + { + "epoch": 1.607467452714321, + "grad_norm": 7.078421243621821, + "learning_rate": 2.0312089923716058e-07, + "loss": 0.1744, + "step": 6135 + }, + { + "epoch": 1.6087775321378859, + "grad_norm": 3.5683942015597343, + "learning_rate": 2.0181383404574493e-07, + "loss": 0.1518, + "step": 6140 + }, + { + "epoch": 1.6100876115614509, + "grad_norm": 5.9348857695988135, + "learning_rate": 2.0051051571692866e-07, + "loss": 0.15, + "step": 6145 + }, + { + "epoch": 1.611397690985016, + "grad_norm": 2.6156284263853595, + "learning_rate": 1.9921095036877644e-07, + "loss": 0.1248, + "step": 6150 + }, + { + "epoch": 1.612707770408581, + "grad_norm": 5.4712881492494425, + "learning_rate": 1.9791514410173538e-07, + "loss": 0.1972, + "step": 6155 + }, + { + "epoch": 1.614017849832146, + "grad_norm": 5.1651116865695785, + "learning_rate": 1.966231029986075e-07, + "loss": 0.1164, + "step": 6160 + }, + { + "epoch": 1.6153279292557112, + "grad_norm": 7.989806082396054, + "learning_rate": 1.9533483312451959e-07, + "loss": 0.2138, + "step": 6165 + }, + { + "epoch": 1.6166380086792762, + "grad_norm": 5.514479603993972, + "learning_rate": 1.9405034052689585e-07, + "loss": 0.1346, + "step": 6170 + }, + { + "epoch": 1.6179480881028412, + "grad_norm": 5.995445272935516, + "learning_rate": 1.927696312354289e-07, + "loss": 0.1327, + "step": 6175 + }, + { + "epoch": 1.6192581675264064, + "grad_norm": 7.346676621492845, + "learning_rate": 1.9149271126205168e-07, + "loss": 0.172, + "step": 6180 + }, + { + "epoch": 1.6205682469499714, + "grad_norm": 7.02864110812697, + "learning_rate": 1.902195866009091e-07, + "loss": 0.1684, + "step": 6185 + }, + { + "epoch": 1.6218783263735363, + "grad_norm": 5.984903488092592, + "learning_rate": 1.8895026322833063e-07, + "loss": 0.1282, + "step": 6190 + }, + { + "epoch": 1.6231884057971016, + "grad_norm": 3.5054458379581197, + "learning_rate": 1.876847471028009e-07, + "loss": 0.1257, + "step": 6195 + }, + { + "epoch": 1.6244984852206665, + "grad_norm": 4.70529605927654, + "learning_rate": 1.8642304416493283e-07, + "loss": 0.1267, + "step": 6200 + }, + { + "epoch": 1.6244984852206665, + "eval_accuracy": 0.7552, + "eval_loss": 1.1194959878921509, + "eval_runtime": 142.0998, + "eval_samples_per_second": 8.797, + "eval_steps_per_second": 2.203, + "step": 6200 + }, + { + "epoch": 1.6258085646442315, + "grad_norm": 7.7424757434182006, + "learning_rate": 1.8516516033743956e-07, + "loss": 0.1575, + "step": 6205 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 5.852572753378986, + "learning_rate": 1.8391110152510615e-07, + "loss": 0.1466, + "step": 6210 + }, + { + "epoch": 1.6284287234913617, + "grad_norm": 9.711403077550843, + "learning_rate": 1.8266087361476258e-07, + "loss": 0.2505, + "step": 6215 + }, + { + "epoch": 1.6297388029149267, + "grad_norm": 7.233520202930265, + "learning_rate": 1.8141448247525527e-07, + "loss": 0.1326, + "step": 6220 + }, + { + "epoch": 1.6310488823384919, + "grad_norm": 7.089498480489309, + "learning_rate": 1.8017193395742024e-07, + "loss": 0.165, + "step": 6225 + }, + { + "epoch": 1.6323589617620569, + "grad_norm": 6.065667727984184, + "learning_rate": 1.7893323389405524e-07, + "loss": 0.1338, + "step": 6230 + }, + { + "epoch": 1.6336690411856218, + "grad_norm": 4.969225424762747, + "learning_rate": 1.776983880998929e-07, + "loss": 0.1625, + "step": 6235 + }, + { + "epoch": 1.634979120609187, + "grad_norm": 6.713374539142975, + "learning_rate": 1.7646740237157254e-07, + "loss": 0.1703, + "step": 6240 + }, + { + "epoch": 1.636289200032752, + "grad_norm": 5.594374782950924, + "learning_rate": 1.7524028248761401e-07, + "loss": 0.1917, + "step": 6245 + }, + { + "epoch": 1.637599279456317, + "grad_norm": 6.226467459107668, + "learning_rate": 1.7401703420838975e-07, + "loss": 0.1807, + "step": 6250 + }, + { + "epoch": 1.6389093588798822, + "grad_norm": 4.7490550372943074, + "learning_rate": 1.7279766327609757e-07, + "loss": 0.1605, + "step": 6255 + }, + { + "epoch": 1.6402194383034472, + "grad_norm": 6.40470417405122, + "learning_rate": 1.7158217541473518e-07, + "loss": 0.1279, + "step": 6260 + }, + { + "epoch": 1.6415295177270122, + "grad_norm": 9.101155921957748, + "learning_rate": 1.7037057633007157e-07, + "loss": 0.1125, + "step": 6265 + }, + { + "epoch": 1.6428395971505774, + "grad_norm": 6.16268053536328, + "learning_rate": 1.6916287170962107e-07, + "loss": 0.1575, + "step": 6270 + }, + { + "epoch": 1.6441496765741423, + "grad_norm": 6.511167097266026, + "learning_rate": 1.6795906722261644e-07, + "loss": 0.1668, + "step": 6275 + }, + { + "epoch": 1.6454597559977073, + "grad_norm": 5.236685660626339, + "learning_rate": 1.6675916851998272e-07, + "loss": 0.106, + "step": 6280 + }, + { + "epoch": 1.6467698354212725, + "grad_norm": 5.638441943629535, + "learning_rate": 1.6556318123430978e-07, + "loss": 0.1362, + "step": 6285 + }, + { + "epoch": 1.6480799148448375, + "grad_norm": 8.863766597675237, + "learning_rate": 1.6437111097982726e-07, + "loss": 0.1769, + "step": 6290 + }, + { + "epoch": 1.6493899942684025, + "grad_norm": 3.794500769944755, + "learning_rate": 1.631829633523767e-07, + "loss": 0.1522, + "step": 6295 + }, + { + "epoch": 1.6507000736919677, + "grad_norm": 3.9716392563806027, + "learning_rate": 1.6199874392938574e-07, + "loss": 0.1483, + "step": 6300 + }, + { + "epoch": 1.6507000736919677, + "eval_accuracy": 0.7576, + "eval_loss": 1.1004310846328735, + "eval_runtime": 141.7328, + "eval_samples_per_second": 8.819, + "eval_steps_per_second": 2.208, + "step": 6300 + }, + { + "epoch": 1.6520101531155327, + "grad_norm": 3.655291620709186, + "learning_rate": 1.6081845826984307e-07, + "loss": 0.1227, + "step": 6305 + }, + { + "epoch": 1.6533202325390977, + "grad_norm": 6.90496866662, + "learning_rate": 1.5964211191427058e-07, + "loss": 0.1756, + "step": 6310 + }, + { + "epoch": 1.6546303119626629, + "grad_norm": 9.388436741639218, + "learning_rate": 1.5846971038469915e-07, + "loss": 0.1361, + "step": 6315 + }, + { + "epoch": 1.6559403913862278, + "grad_norm": 6.255702654542568, + "learning_rate": 1.573012591846402e-07, + "loss": 0.1674, + "step": 6320 + }, + { + "epoch": 1.6572504708097928, + "grad_norm": 5.3552774475843945, + "learning_rate": 1.5613676379906315e-07, + "loss": 0.1525, + "step": 6325 + }, + { + "epoch": 1.658560550233358, + "grad_norm": 2.691722315737717, + "learning_rate": 1.5497622969436662e-07, + "loss": 0.1796, + "step": 6330 + }, + { + "epoch": 1.659870629656923, + "grad_norm": 6.025378541300572, + "learning_rate": 1.538196623183552e-07, + "loss": 0.183, + "step": 6335 + }, + { + "epoch": 1.661180709080488, + "grad_norm": 4.761564415573431, + "learning_rate": 1.5266706710021194e-07, + "loss": 0.1312, + "step": 6340 + }, + { + "epoch": 1.6624907885040532, + "grad_norm": 7.805446740567002, + "learning_rate": 1.51518449450474e-07, + "loss": 0.1651, + "step": 6345 + }, + { + "epoch": 1.6638008679276182, + "grad_norm": 5.256694126891557, + "learning_rate": 1.5037381476100707e-07, + "loss": 0.1294, + "step": 6350 + }, + { + "epoch": 1.6651109473511831, + "grad_norm": 7.755672196084709, + "learning_rate": 1.4923316840497968e-07, + "loss": 0.156, + "step": 6355 + }, + { + "epoch": 1.6664210267747483, + "grad_norm": 4.656789891974052, + "learning_rate": 1.480965157368389e-07, + "loss": 0.133, + "step": 6360 + }, + { + "epoch": 1.667731106198313, + "grad_norm": 5.4961595195828705, + "learning_rate": 1.4696386209228307e-07, + "loss": 0.1812, + "step": 6365 + }, + { + "epoch": 1.6690411856218783, + "grad_norm": 6.495919732500172, + "learning_rate": 1.4583521278824008e-07, + "loss": 0.1657, + "step": 6370 + }, + { + "epoch": 1.6703512650454435, + "grad_norm": 6.812774990594977, + "learning_rate": 1.4471057312283906e-07, + "loss": 0.1115, + "step": 6375 + }, + { + "epoch": 1.6716613444690083, + "grad_norm": 3.956134496828771, + "learning_rate": 1.4358994837538817e-07, + "loss": 0.2423, + "step": 6380 + }, + { + "epoch": 1.6729714238925735, + "grad_norm": 6.259621872344836, + "learning_rate": 1.424733438063479e-07, + "loss": 0.1022, + "step": 6385 + }, + { + "epoch": 1.6742815033161387, + "grad_norm": 6.072009290092665, + "learning_rate": 1.4136076465730695e-07, + "loss": 0.1832, + "step": 6390 + }, + { + "epoch": 1.6755915827397034, + "grad_norm": 7.456858413136058, + "learning_rate": 1.4025221615095873e-07, + "loss": 0.1657, + "step": 6395 + }, + { + "epoch": 1.6769016621632686, + "grad_norm": 7.260932680002808, + "learning_rate": 1.3914770349107495e-07, + "loss": 0.1346, + "step": 6400 + }, + { + "epoch": 1.6769016621632686, + "eval_accuracy": 0.7632, + "eval_loss": 1.100335717201233, + "eval_runtime": 141.6273, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 2.21, + "step": 6400 + }, + { + "epoch": 1.6782117415868338, + "grad_norm": 3.6061256334408203, + "learning_rate": 1.3804723186248313e-07, + "loss": 0.1343, + "step": 6405 + }, + { + "epoch": 1.6795218210103986, + "grad_norm": 8.270495540507543, + "learning_rate": 1.369508064310404e-07, + "loss": 0.1182, + "step": 6410 + }, + { + "epoch": 1.6808319004339638, + "grad_norm": 5.629731179384749, + "learning_rate": 1.3585843234361049e-07, + "loss": 0.1568, + "step": 6415 + }, + { + "epoch": 1.682141979857529, + "grad_norm": 2.867569435579598, + "learning_rate": 1.347701147280391e-07, + "loss": 0.1729, + "step": 6420 + }, + { + "epoch": 1.6834520592810938, + "grad_norm": 6.83897468616409, + "learning_rate": 1.3368585869313065e-07, + "loss": 0.1874, + "step": 6425 + }, + { + "epoch": 1.684762138704659, + "grad_norm": 4.3014228257373945, + "learning_rate": 1.326056693286226e-07, + "loss": 0.1778, + "step": 6430 + }, + { + "epoch": 1.6860722181282242, + "grad_norm": 10.457994997688326, + "learning_rate": 1.31529551705163e-07, + "loss": 0.2127, + "step": 6435 + }, + { + "epoch": 1.687382297551789, + "grad_norm": 5.086460128057394, + "learning_rate": 1.3045751087428648e-07, + "loss": 0.153, + "step": 6440 + }, + { + "epoch": 1.6886923769753541, + "grad_norm": 5.2909003289546375, + "learning_rate": 1.2938955186838983e-07, + "loss": 0.1303, + "step": 6445 + }, + { + "epoch": 1.6900024563989193, + "grad_norm": 6.3469654482036555, + "learning_rate": 1.283256797007094e-07, + "loss": 0.1625, + "step": 6450 + }, + { + "epoch": 1.691312535822484, + "grad_norm": 3.8066454211371687, + "learning_rate": 1.2726589936529654e-07, + "loss": 0.2029, + "step": 6455 + }, + { + "epoch": 1.6926226152460493, + "grad_norm": 5.242010338144686, + "learning_rate": 1.2621021583699476e-07, + "loss": 0.1424, + "step": 6460 + }, + { + "epoch": 1.6939326946696145, + "grad_norm": 3.25302928925533, + "learning_rate": 1.2515863407141603e-07, + "loss": 0.1493, + "step": 6465 + }, + { + "epoch": 1.6952427740931792, + "grad_norm": 5.788528827053253, + "learning_rate": 1.2411115900491865e-07, + "loss": 0.1396, + "step": 6470 + }, + { + "epoch": 1.6965528535167445, + "grad_norm": 5.311777991254716, + "learning_rate": 1.230677955545819e-07, + "loss": 0.1388, + "step": 6475 + }, + { + "epoch": 1.6978629329403097, + "grad_norm": 9.193485516456121, + "learning_rate": 1.2202854861818557e-07, + "loss": 0.1502, + "step": 6480 + }, + { + "epoch": 1.6991730123638744, + "grad_norm": 3.6649259537864043, + "learning_rate": 1.2099342307418392e-07, + "loss": 0.1834, + "step": 6485 + }, + { + "epoch": 1.7004830917874396, + "grad_norm": 2.368713152141659, + "learning_rate": 1.199624237816862e-07, + "loss": 0.1621, + "step": 6490 + }, + { + "epoch": 1.7017931712110048, + "grad_norm": 10.317778961476517, + "learning_rate": 1.1893555558043089e-07, + "loss": 0.1625, + "step": 6495 + }, + { + "epoch": 1.7031032506345696, + "grad_norm": 3.1939718268686623, + "learning_rate": 1.1791282329076523e-07, + "loss": 0.1682, + "step": 6500 + }, + { + "epoch": 1.7031032506345696, + "eval_accuracy": 0.7608, + "eval_loss": 1.0906686782836914, + "eval_runtime": 141.6589, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 2.21, + "step": 6500 + }, + { + "epoch": 1.7044133300581348, + "grad_norm": 7.245253263810927, + "learning_rate": 1.1689423171362079e-07, + "loss": 0.147, + "step": 6505 + }, + { + "epoch": 1.7057234094817, + "grad_norm": 3.713535860169349, + "learning_rate": 1.1587978563049161e-07, + "loss": 0.1361, + "step": 6510 + }, + { + "epoch": 1.7070334889052647, + "grad_norm": 3.1243926261206547, + "learning_rate": 1.1486948980341282e-07, + "loss": 0.1104, + "step": 6515 + }, + { + "epoch": 1.70834356832883, + "grad_norm": 2.6440342898591838, + "learning_rate": 1.1386334897493632e-07, + "loss": 0.1154, + "step": 6520 + }, + { + "epoch": 1.7096536477523951, + "grad_norm": 2.7119760374007877, + "learning_rate": 1.128613678681104e-07, + "loss": 0.1315, + "step": 6525 + }, + { + "epoch": 1.71096372717596, + "grad_norm": 6.397748692900095, + "learning_rate": 1.1186355118645552e-07, + "loss": 0.1652, + "step": 6530 + }, + { + "epoch": 1.712273806599525, + "grad_norm": 4.992532932504358, + "learning_rate": 1.1086990361394477e-07, + "loss": 0.1224, + "step": 6535 + }, + { + "epoch": 1.7135838860230903, + "grad_norm": 8.728515211435251, + "learning_rate": 1.0988042981497947e-07, + "loss": 0.2042, + "step": 6540 + }, + { + "epoch": 1.714893965446655, + "grad_norm": 13.21150999052598, + "learning_rate": 1.0889513443436904e-07, + "loss": 0.1576, + "step": 6545 + }, + { + "epoch": 1.7162040448702203, + "grad_norm": 2.632026076658514, + "learning_rate": 1.0791402209730794e-07, + "loss": 0.0997, + "step": 6550 + }, + { + "epoch": 1.7175141242937855, + "grad_norm": 5.720341695762074, + "learning_rate": 1.0693709740935463e-07, + "loss": 0.155, + "step": 6555 + }, + { + "epoch": 1.7188242037173502, + "grad_norm": 5.80222557370497, + "learning_rate": 1.0596436495641025e-07, + "loss": 0.1255, + "step": 6560 + }, + { + "epoch": 1.7201342831409154, + "grad_norm": 6.100322814196168, + "learning_rate": 1.0499582930469597e-07, + "loss": 0.1629, + "step": 6565 + }, + { + "epoch": 1.7214443625644806, + "grad_norm": 4.478475471096975, + "learning_rate": 1.0403149500073294e-07, + "loss": 0.1398, + "step": 6570 + }, + { + "epoch": 1.7227544419880454, + "grad_norm": 4.8025162956814835, + "learning_rate": 1.0307136657131977e-07, + "loss": 0.1035, + "step": 6575 + }, + { + "epoch": 1.7240645214116106, + "grad_norm": 3.2219463387203233, + "learning_rate": 1.0211544852351183e-07, + "loss": 0.1807, + "step": 6580 + }, + { + "epoch": 1.7253746008351758, + "grad_norm": 5.573188128425597, + "learning_rate": 1.0116374534459993e-07, + "loss": 0.1532, + "step": 6585 + }, + { + "epoch": 1.7266846802587406, + "grad_norm": 4.185020111907581, + "learning_rate": 1.0021626150208984e-07, + "loss": 0.1329, + "step": 6590 + }, + { + "epoch": 1.7279947596823058, + "grad_norm": 3.8823758733463016, + "learning_rate": 9.927300144368045e-08, + "loss": 0.1349, + "step": 6595 + }, + { + "epoch": 1.7293048391058707, + "grad_norm": 16.750270685634902, + "learning_rate": 9.833396959724306e-08, + "loss": 0.1322, + "step": 6600 + }, + { + "epoch": 1.7293048391058707, + "eval_accuracy": 0.7608, + "eval_loss": 1.128875970840454, + "eval_runtime": 143.9246, + "eval_samples_per_second": 8.685, + "eval_steps_per_second": 2.175, + "step": 6600 + }, + { + "epoch": 1.7306149185294357, + "grad_norm": 5.043906483496135, + "learning_rate": 9.739917037080148e-08, + "loss": 0.1572, + "step": 6605 + }, + { + "epoch": 1.731924997953001, + "grad_norm": 6.199552158675514, + "learning_rate": 9.646860815250979e-08, + "loss": 0.1627, + "step": 6610 + }, + { + "epoch": 1.733235077376566, + "grad_norm": 2.951863930249291, + "learning_rate": 9.554228731063373e-08, + "loss": 0.154, + "step": 6615 + }, + { + "epoch": 1.7345451568001309, + "grad_norm": 5.2157616611104975, + "learning_rate": 9.462021219352801e-08, + "loss": 0.1631, + "step": 6620 + }, + { + "epoch": 1.735855236223696, + "grad_norm": 3.3619242290029963, + "learning_rate": 9.370238712961742e-08, + "loss": 0.2129, + "step": 6625 + }, + { + "epoch": 1.737165315647261, + "grad_norm": 4.655934383309167, + "learning_rate": 9.27888164273759e-08, + "loss": 0.1738, + "step": 6630 + }, + { + "epoch": 1.738475395070826, + "grad_norm": 9.09248052048832, + "learning_rate": 9.1879504375307e-08, + "loss": 0.21, + "step": 6635 + }, + { + "epoch": 1.7397854744943912, + "grad_norm": 9.603961489686213, + "learning_rate": 9.097445524192248e-08, + "loss": 0.1156, + "step": 6640 + }, + { + "epoch": 1.7410955539179562, + "grad_norm": 6.608421456426681, + "learning_rate": 9.007367327572368e-08, + "loss": 0.1623, + "step": 6645 + }, + { + "epoch": 1.7424056333415212, + "grad_norm": 5.838309352454389, + "learning_rate": 8.91771627051805e-08, + "loss": 0.1661, + "step": 6650 + }, + { + "epoch": 1.7437157127650864, + "grad_norm": 5.515377990794858, + "learning_rate": 8.828492773871177e-08, + "loss": 0.1721, + "step": 6655 + }, + { + "epoch": 1.7450257921886514, + "grad_norm": 5.475478047926709, + "learning_rate": 8.739697256466638e-08, + "loss": 0.1668, + "step": 6660 + }, + { + "epoch": 1.7463358716122164, + "grad_norm": 6.660668383621966, + "learning_rate": 8.651330135130241e-08, + "loss": 0.1841, + "step": 6665 + }, + { + "epoch": 1.7476459510357816, + "grad_norm": 4.043989017999333, + "learning_rate": 8.563391824676814e-08, + "loss": 0.1521, + "step": 6670 + }, + { + "epoch": 1.7489560304593466, + "grad_norm": 3.805298088229625, + "learning_rate": 8.475882737908247e-08, + "loss": 0.129, + "step": 6675 + }, + { + "epoch": 1.7502661098829115, + "grad_norm": 3.0665024919957022, + "learning_rate": 8.388803285611601e-08, + "loss": 0.1577, + "step": 6680 + }, + { + "epoch": 1.7515761893064767, + "grad_norm": 14.26899142932033, + "learning_rate": 8.30215387655706e-08, + "loss": 0.1589, + "step": 6685 + }, + { + "epoch": 1.7528862687300417, + "grad_norm": 6.541801424230482, + "learning_rate": 8.215934917496192e-08, + "loss": 0.153, + "step": 6690 + }, + { + "epoch": 1.7541963481536067, + "grad_norm": 5.250779392264523, + "learning_rate": 8.130146813159844e-08, + "loss": 0.148, + "step": 6695 + }, + { + "epoch": 1.755506427577172, + "grad_norm": 3.927112284430514, + "learning_rate": 8.044789966256382e-08, + "loss": 0.1994, + "step": 6700 + }, + { + "epoch": 1.755506427577172, + "eval_accuracy": 0.76, + "eval_loss": 1.106866478919983, + "eval_runtime": 141.8989, + "eval_samples_per_second": 8.809, + "eval_steps_per_second": 2.206, + "step": 6700 + }, + { + "epoch": 1.7568165070007369, + "grad_norm": 9.767839811176259, + "learning_rate": 7.959864777469749e-08, + "loss": 0.2056, + "step": 6705 + }, + { + "epoch": 1.7581265864243019, + "grad_norm": 5.9103243659566, + "learning_rate": 7.875371645457574e-08, + "loss": 0.1468, + "step": 6710 + }, + { + "epoch": 1.759436665847867, + "grad_norm": 6.055358982690812, + "learning_rate": 7.791310966849362e-08, + "loss": 0.1375, + "step": 6715 + }, + { + "epoch": 1.760746745271432, + "grad_norm": 7.406585709894513, + "learning_rate": 7.707683136244503e-08, + "loss": 0.1663, + "step": 6720 + }, + { + "epoch": 1.762056824694997, + "grad_norm": 10.412933831104338, + "learning_rate": 7.624488546210584e-08, + "loss": 0.1703, + "step": 6725 + }, + { + "epoch": 1.7633669041185622, + "grad_norm": 11.2079157468613, + "learning_rate": 7.5417275872814e-08, + "loss": 0.1649, + "step": 6730 + }, + { + "epoch": 1.7646769835421272, + "grad_norm": 4.487770161857672, + "learning_rate": 7.459400647955261e-08, + "loss": 0.1109, + "step": 6735 + }, + { + "epoch": 1.7659870629656922, + "grad_norm": 4.720489203941066, + "learning_rate": 7.377508114693021e-08, + "loss": 0.2277, + "step": 6740 + }, + { + "epoch": 1.7672971423892574, + "grad_norm": 5.432786720479122, + "learning_rate": 7.296050371916362e-08, + "loss": 0.1617, + "step": 6745 + }, + { + "epoch": 1.7686072218128224, + "grad_norm": 6.287493094401141, + "learning_rate": 7.21502780200598e-08, + "loss": 0.1686, + "step": 6750 + }, + { + "epoch": 1.7699173012363874, + "grad_norm": 2.1808008637078897, + "learning_rate": 7.134440785299745e-08, + "loss": 0.1527, + "step": 6755 + }, + { + "epoch": 1.7712273806599526, + "grad_norm": 3.224585978413297, + "learning_rate": 7.054289700090987e-08, + "loss": 0.1003, + "step": 6760 + }, + { + "epoch": 1.7725374600835175, + "grad_norm": 4.540878351864405, + "learning_rate": 6.974574922626598e-08, + "loss": 0.146, + "step": 6765 + }, + { + "epoch": 1.7738475395070825, + "grad_norm": 7.4029328124452345, + "learning_rate": 6.895296827105423e-08, + "loss": 0.1749, + "step": 6770 + }, + { + "epoch": 1.7751576189306477, + "grad_norm": 4.084693702284536, + "learning_rate": 6.81645578567639e-08, + "loss": 0.1532, + "step": 6775 + }, + { + "epoch": 1.7764676983542127, + "grad_norm": 5.481752305139202, + "learning_rate": 6.738052168436814e-08, + "loss": 0.1742, + "step": 6780 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 6.07861709288181, + "learning_rate": 6.660086343430637e-08, + "loss": 0.1624, + "step": 6785 + }, + { + "epoch": 1.7790878572013429, + "grad_norm": 4.676302274517847, + "learning_rate": 6.582558676646676e-08, + "loss": 0.1583, + "step": 6790 + }, + { + "epoch": 1.7803979366249079, + "grad_norm": 3.9543347540784666, + "learning_rate": 6.505469532017005e-08, + "loss": 0.142, + "step": 6795 + }, + { + "epoch": 1.7817080160484728, + "grad_norm": 7.99552033660062, + "learning_rate": 6.428819271415098e-08, + "loss": 0.159, + "step": 6800 + }, + { + "epoch": 1.7817080160484728, + "eval_accuracy": 0.7608, + "eval_loss": 1.0944597721099854, + "eval_runtime": 142.6449, + "eval_samples_per_second": 8.763, + "eval_steps_per_second": 2.194, + "step": 6800 + }, + { + "epoch": 1.783018095472038, + "grad_norm": 5.928421818999669, + "learning_rate": 6.35260825465429e-08, + "loss": 0.1406, + "step": 6805 + }, + { + "epoch": 1.784328174895603, + "grad_norm": 5.358450738294621, + "learning_rate": 6.276836839485944e-08, + "loss": 0.1684, + "step": 6810 + }, + { + "epoch": 1.785638254319168, + "grad_norm": 7.4098511156954885, + "learning_rate": 6.201505381597872e-08, + "loss": 0.1258, + "step": 6815 + }, + { + "epoch": 1.7869483337427332, + "grad_norm": 10.028496011778648, + "learning_rate": 6.126614234612593e-08, + "loss": 0.1363, + "step": 6820 + }, + { + "epoch": 1.7882584131662982, + "grad_norm": 8.42691690098271, + "learning_rate": 6.05216375008576e-08, + "loss": 0.1629, + "step": 6825 + }, + { + "epoch": 1.7895684925898632, + "grad_norm": 7.8437977453123695, + "learning_rate": 5.978154277504432e-08, + "loss": 0.1488, + "step": 6830 + }, + { + "epoch": 1.7908785720134284, + "grad_norm": 9.079568079332288, + "learning_rate": 5.904586164285441e-08, + "loss": 0.1451, + "step": 6835 + }, + { + "epoch": 1.7921886514369934, + "grad_norm": 5.503155338633177, + "learning_rate": 5.831459755773815e-08, + "loss": 0.1478, + "step": 6840 + }, + { + "epoch": 1.7934987308605583, + "grad_norm": 6.773303376259157, + "learning_rate": 5.7587753952411e-08, + "loss": 0.1445, + "step": 6845 + }, + { + "epoch": 1.7948088102841235, + "grad_norm": 4.765236453726558, + "learning_rate": 5.686533423883788e-08, + "loss": 0.1617, + "step": 6850 + }, + { + "epoch": 1.7961188897076885, + "grad_norm": 6.230089077098877, + "learning_rate": 5.6147341808216894e-08, + "loss": 0.1509, + "step": 6855 + }, + { + "epoch": 1.7974289691312535, + "grad_norm": 4.574445807215422, + "learning_rate": 5.543378003096344e-08, + "loss": 0.1722, + "step": 6860 + }, + { + "epoch": 1.7987390485548187, + "grad_norm": 4.348732547956968, + "learning_rate": 5.4724652256694205e-08, + "loss": 0.1443, + "step": 6865 + }, + { + "epoch": 1.8000491279783837, + "grad_norm": 7.066725464839033, + "learning_rate": 5.401996181421253e-08, + "loss": 0.1485, + "step": 6870 + }, + { + "epoch": 1.8013592074019487, + "grad_norm": 5.328359673700302, + "learning_rate": 5.331971201149088e-08, + "loss": 0.1419, + "step": 6875 + }, + { + "epoch": 1.8026692868255139, + "grad_norm": 5.644656887108177, + "learning_rate": 5.262390613565737e-08, + "loss": 0.1424, + "step": 6880 + }, + { + "epoch": 1.8039793662490788, + "grad_norm": 5.671327408847072, + "learning_rate": 5.193254745297848e-08, + "loss": 0.198, + "step": 6885 + }, + { + "epoch": 1.8052894456726438, + "grad_norm": 4.877568064660579, + "learning_rate": 5.124563920884495e-08, + "loss": 0.1428, + "step": 6890 + }, + { + "epoch": 1.806599525096209, + "grad_norm": 6.817395500823747, + "learning_rate": 5.056318462775644e-08, + "loss": 0.1432, + "step": 6895 + }, + { + "epoch": 1.807909604519774, + "grad_norm": 5.927934052499454, + "learning_rate": 4.988518691330579e-08, + "loss": 0.1137, + "step": 6900 + }, + { + "epoch": 1.807909604519774, + "eval_accuracy": 0.7632, + "eval_loss": 1.1021169424057007, + "eval_runtime": 141.7931, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 2.207, + "step": 6900 + }, + { + "epoch": 1.809219683943339, + "grad_norm": 4.973694598901904, + "learning_rate": 4.9211649248164125e-08, + "loss": 0.1506, + "step": 6905 + }, + { + "epoch": 1.8105297633669042, + "grad_norm": 3.883659080774288, + "learning_rate": 4.854257479406654e-08, + "loss": 0.141, + "step": 6910 + }, + { + "epoch": 1.8118398427904692, + "grad_norm": 6.41491315709119, + "learning_rate": 4.787796669179689e-08, + "loss": 0.158, + "step": 6915 + }, + { + "epoch": 1.8131499222140341, + "grad_norm": 3.9302834820572814, + "learning_rate": 4.721782806117236e-08, + "loss": 0.1322, + "step": 6920 + }, + { + "epoch": 1.8144600016375994, + "grad_norm": 4.289211965868008, + "learning_rate": 4.656216200103036e-08, + "loss": 0.1337, + "step": 6925 + }, + { + "epoch": 1.8157700810611643, + "grad_norm": 11.248831743934637, + "learning_rate": 4.591097158921198e-08, + "loss": 0.1829, + "step": 6930 + }, + { + "epoch": 1.8170801604847293, + "grad_norm": 9.878450217229018, + "learning_rate": 4.526425988254967e-08, + "loss": 0.1566, + "step": 6935 + }, + { + "epoch": 1.8183902399082945, + "grad_norm": 7.865233707641088, + "learning_rate": 4.4622029916850935e-08, + "loss": 0.1189, + "step": 6940 + }, + { + "epoch": 1.8197003193318595, + "grad_norm": 9.744045573658111, + "learning_rate": 4.3984284706885976e-08, + "loss": 0.1497, + "step": 6945 + }, + { + "epoch": 1.8210103987554245, + "grad_norm": 7.22547788135765, + "learning_rate": 4.335102724637163e-08, + "loss": 0.2296, + "step": 6950 + }, + { + "epoch": 1.8223204781789897, + "grad_norm": 11.079040650914395, + "learning_rate": 4.2722260507958684e-08, + "loss": 0.1922, + "step": 6955 + }, + { + "epoch": 1.8236305576025547, + "grad_norm": 8.165182518676898, + "learning_rate": 4.2097987443217577e-08, + "loss": 0.1381, + "step": 6960 + }, + { + "epoch": 1.8249406370261196, + "grad_norm": 3.739467681211686, + "learning_rate": 4.147821098262405e-08, + "loss": 0.1294, + "step": 6965 + }, + { + "epoch": 1.8262507164496848, + "grad_norm": 5.402892377251783, + "learning_rate": 4.086293403554641e-08, + "loss": 0.1786, + "step": 6970 + }, + { + "epoch": 1.8275607958732498, + "grad_norm": 4.468753443729968, + "learning_rate": 4.0252159490230645e-08, + "loss": 0.1654, + "step": 6975 + }, + { + "epoch": 1.8288708752968148, + "grad_norm": 5.203594506261989, + "learning_rate": 3.964589021378772e-08, + "loss": 0.1367, + "step": 6980 + }, + { + "epoch": 1.83018095472038, + "grad_norm": 3.380052180374155, + "learning_rate": 3.90441290521798e-08, + "loss": 0.1106, + "step": 6985 + }, + { + "epoch": 1.831491034143945, + "grad_norm": 4.633407767619745, + "learning_rate": 3.8446878830207254e-08, + "loss": 0.1679, + "step": 6990 + }, + { + "epoch": 1.83280111356751, + "grad_norm": 5.329985366173506, + "learning_rate": 3.785414235149465e-08, + "loss": 0.1565, + "step": 6995 + }, + { + "epoch": 1.8341111929910752, + "grad_norm": 6.556659404147076, + "learning_rate": 3.726592239847826e-08, + "loss": 0.2095, + "step": 7000 + }, + { + "epoch": 1.8341111929910752, + "eval_accuracy": 0.7624, + "eval_loss": 1.1032047271728516, + "eval_runtime": 141.6558, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 2.21, + "step": 7000 + }, + { + "epoch": 1.8354212724146401, + "grad_norm": 4.684757438072978, + "learning_rate": 3.668222173239288e-08, + "loss": 0.1576, + "step": 7005 + }, + { + "epoch": 1.8367313518382051, + "grad_norm": 7.039438206694064, + "learning_rate": 3.6103043093258625e-08, + "loss": 0.1128, + "step": 7010 + }, + { + "epoch": 1.8380414312617703, + "grad_norm": 12.172322751037852, + "learning_rate": 3.552838919986845e-08, + "loss": 0.1683, + "step": 7015 + }, + { + "epoch": 1.8393515106853353, + "grad_norm": 7.127343846862284, + "learning_rate": 3.495826274977487e-08, + "loss": 0.0943, + "step": 7020 + }, + { + "epoch": 1.8406615901089003, + "grad_norm": 3.994985416023033, + "learning_rate": 3.439266641927752e-08, + "loss": 0.1505, + "step": 7025 + }, + { + "epoch": 1.8419716695324655, + "grad_norm": 10.627564884995905, + "learning_rate": 3.383160286341091e-08, + "loss": 0.1712, + "step": 7030 + }, + { + "epoch": 1.8432817489560305, + "grad_norm": 5.604749535399941, + "learning_rate": 3.327507471593172e-08, + "loss": 0.1205, + "step": 7035 + }, + { + "epoch": 1.8445918283795955, + "grad_norm": 4.8421269912484215, + "learning_rate": 3.272308458930606e-08, + "loss": 0.1152, + "step": 7040 + }, + { + "epoch": 1.8459019078031607, + "grad_norm": 9.074100019849872, + "learning_rate": 3.2175635074698005e-08, + "loss": 0.2357, + "step": 7045 + }, + { + "epoch": 1.8472119872267256, + "grad_norm": 7.154307006903744, + "learning_rate": 3.1632728741956884e-08, + "loss": 0.1552, + "step": 7050 + }, + { + "epoch": 1.8485220666502906, + "grad_norm": 2.6781924530315306, + "learning_rate": 3.1094368139604865e-08, + "loss": 0.106, + "step": 7055 + }, + { + "epoch": 1.8498321460738558, + "grad_norm": 7.370147947886868, + "learning_rate": 3.0560555794826196e-08, + "loss": 0.1413, + "step": 7060 + }, + { + "epoch": 1.8511422254974208, + "grad_norm": 3.802825821569026, + "learning_rate": 3.003129421345407e-08, + "loss": 0.1453, + "step": 7065 + }, + { + "epoch": 1.8524523049209858, + "grad_norm": 6.1792330095504395, + "learning_rate": 2.9506585879959577e-08, + "loss": 0.1564, + "step": 7070 + }, + { + "epoch": 1.853762384344551, + "grad_norm": 5.355565750702989, + "learning_rate": 2.8986433257439658e-08, + "loss": 0.1967, + "step": 7075 + }, + { + "epoch": 1.855072463768116, + "grad_norm": 6.360151114951984, + "learning_rate": 2.8470838787606034e-08, + "loss": 0.0963, + "step": 7080 + }, + { + "epoch": 1.856382543191681, + "grad_norm": 6.69819610174965, + "learning_rate": 2.795980489077332e-08, + "loss": 0.1303, + "step": 7085 + }, + { + "epoch": 1.8576926226152461, + "grad_norm": 4.435876647603045, + "learning_rate": 2.7453333965847815e-08, + "loss": 0.1269, + "step": 7090 + }, + { + "epoch": 1.8590027020388111, + "grad_norm": 7.698895216569222, + "learning_rate": 2.6951428390316165e-08, + "loss": 0.1347, + "step": 7095 + }, + { + "epoch": 1.860312781462376, + "grad_norm": 3.790916594995889, + "learning_rate": 2.6454090520234063e-08, + "loss": 0.2099, + "step": 7100 + }, + { + "epoch": 1.860312781462376, + "eval_accuracy": 0.7632, + "eval_loss": 1.1122453212738037, + "eval_runtime": 142.3069, + "eval_samples_per_second": 8.784, + "eval_steps_per_second": 2.199, + "step": 7100 + }, + { + "epoch": 1.8616228608859413, + "grad_norm": 3.9663209864483724, + "learning_rate": 2.596132269021589e-08, + "loss": 0.1212, + "step": 7105 + }, + { + "epoch": 1.8629329403095063, + "grad_norm": 5.657131945956216, + "learning_rate": 2.5473127213422762e-08, + "loss": 0.1551, + "step": 7110 + }, + { + "epoch": 1.8642430197330713, + "grad_norm": 5.092795916916417, + "learning_rate": 2.4989506381552617e-08, + "loss": 0.1736, + "step": 7115 + }, + { + "epoch": 1.8655530991566365, + "grad_norm": 5.798872866253983, + "learning_rate": 2.4510462464828352e-08, + "loss": 0.1684, + "step": 7120 + }, + { + "epoch": 1.8668631785802015, + "grad_norm": 3.20953589455004, + "learning_rate": 2.4035997711988387e-08, + "loss": 0.1094, + "step": 7125 + }, + { + "epoch": 1.8681732580037664, + "grad_norm": 3.3680896195026477, + "learning_rate": 2.3566114350275223e-08, + "loss": 0.1694, + "step": 7130 + }, + { + "epoch": 1.8694833374273316, + "grad_norm": 5.729982989667835, + "learning_rate": 2.3100814585425564e-08, + "loss": 0.1564, + "step": 7135 + }, + { + "epoch": 1.8707934168508966, + "grad_norm": 10.389717669058681, + "learning_rate": 2.264010060165944e-08, + "loss": 0.1514, + "step": 7140 + }, + { + "epoch": 1.8721034962744616, + "grad_norm": 5.075854563803782, + "learning_rate": 2.2183974561670205e-08, + "loss": 0.2024, + "step": 7145 + }, + { + "epoch": 1.8734135756980268, + "grad_norm": 4.242493204391383, + "learning_rate": 2.1732438606614665e-08, + "loss": 0.1311, + "step": 7150 + }, + { + "epoch": 1.8747236551215918, + "grad_norm": 5.4042102288406095, + "learning_rate": 2.1285494856102315e-08, + "loss": 0.1726, + "step": 7155 + }, + { + "epoch": 1.8760337345451568, + "grad_norm": 6.071406295353831, + "learning_rate": 2.0843145408186547e-08, + "loss": 0.1006, + "step": 7160 + }, + { + "epoch": 1.877343813968722, + "grad_norm": 6.149832756857255, + "learning_rate": 2.0405392339353234e-08, + "loss": 0.1713, + "step": 7165 + }, + { + "epoch": 1.878653893392287, + "grad_norm": 9.357766475713992, + "learning_rate": 1.9972237704512283e-08, + "loss": 0.1644, + "step": 7170 + }, + { + "epoch": 1.879963972815852, + "grad_norm": 5.253827651758333, + "learning_rate": 1.9543683536987434e-08, + "loss": 0.111, + "step": 7175 + }, + { + "epoch": 1.8812740522394171, + "grad_norm": 11.566026718756346, + "learning_rate": 1.9119731848506902e-08, + "loss": 0.1984, + "step": 7180 + }, + { + "epoch": 1.882584131662982, + "grad_norm": 8.248572977961965, + "learning_rate": 1.8700384629193876e-08, + "loss": 0.1202, + "step": 7185 + }, + { + "epoch": 1.883894211086547, + "grad_norm": 4.350069192366543, + "learning_rate": 1.828564384755682e-08, + "loss": 0.1471, + "step": 7190 + }, + { + "epoch": 1.8852042905101123, + "grad_norm": 5.389482671182921, + "learning_rate": 1.787551145048094e-08, + "loss": 0.1356, + "step": 7195 + }, + { + "epoch": 1.8865143699336773, + "grad_norm": 9.568845913678533, + "learning_rate": 1.7469989363218528e-08, + "loss": 0.209, + "step": 7200 + }, + { + "epoch": 1.8865143699336773, + "eval_accuracy": 0.7616, + "eval_loss": 1.1072343587875366, + "eval_runtime": 141.2973, + "eval_samples_per_second": 8.847, + "eval_steps_per_second": 2.215, + "step": 7200 + }, + { + "epoch": 1.8878244493572423, + "grad_norm": 8.437455929535107, + "learning_rate": 1.706907948938008e-08, + "loss": 0.1703, + "step": 7205 + }, + { + "epoch": 1.8891345287808075, + "grad_norm": 9.671218815145942, + "learning_rate": 1.6672783710925288e-08, + "loss": 0.18, + "step": 7210 + }, + { + "epoch": 1.8904446082043724, + "grad_norm": 5.889420942486911, + "learning_rate": 1.628110388815429e-08, + "loss": 0.1196, + "step": 7215 + }, + { + "epoch": 1.8917546876279374, + "grad_norm": 4.068164701286381, + "learning_rate": 1.5894041859698783e-08, + "loss": 0.1432, + "step": 7220 + }, + { + "epoch": 1.8930647670515026, + "grad_norm": 4.827161273788743, + "learning_rate": 1.5511599442513677e-08, + "loss": 0.1612, + "step": 7225 + }, + { + "epoch": 1.8943748464750676, + "grad_norm": 2.5102178836850495, + "learning_rate": 1.5133778431868583e-08, + "loss": 0.1626, + "step": 7230 + }, + { + "epoch": 1.8956849258986326, + "grad_norm": 7.7934519895192285, + "learning_rate": 1.4760580601338669e-08, + "loss": 0.2144, + "step": 7235 + }, + { + "epoch": 1.8969950053221978, + "grad_norm": 8.667381276168085, + "learning_rate": 1.439200770279736e-08, + "loss": 0.2232, + "step": 7240 + }, + { + "epoch": 1.8983050847457628, + "grad_norm": 7.482118460513503, + "learning_rate": 1.4028061466407449e-08, + "loss": 0.1269, + "step": 7245 + }, + { + "epoch": 1.8996151641693277, + "grad_norm": 5.507929252171665, + "learning_rate": 1.3668743600613097e-08, + "loss": 0.1869, + "step": 7250 + }, + { + "epoch": 1.900925243592893, + "grad_norm": 3.497582694746413, + "learning_rate": 1.3314055792131961e-08, + "loss": 0.1518, + "step": 7255 + }, + { + "epoch": 1.902235323016458, + "grad_norm": 6.083554156233226, + "learning_rate": 1.2963999705947193e-08, + "loss": 0.158, + "step": 7260 + }, + { + "epoch": 1.903545402440023, + "grad_norm": 11.09067780911257, + "learning_rate": 1.2618576985299334e-08, + "loss": 0.1666, + "step": 7265 + }, + { + "epoch": 1.904855481863588, + "grad_norm": 4.6729278171579605, + "learning_rate": 1.227778925167955e-08, + "loss": 0.135, + "step": 7270 + }, + { + "epoch": 1.906165561287153, + "grad_norm": 5.01367722348655, + "learning_rate": 1.1941638104820517e-08, + "loss": 0.1376, + "step": 7275 + }, + { + "epoch": 1.907475640710718, + "grad_norm": 8.20404432458644, + "learning_rate": 1.1610125122690328e-08, + "loss": 0.2188, + "step": 7280 + }, + { + "epoch": 1.9087857201342833, + "grad_norm": 9.608276812286391, + "learning_rate": 1.1283251861484378e-08, + "loss": 0.199, + "step": 7285 + }, + { + "epoch": 1.910095799557848, + "grad_norm": 4.3189093484564145, + "learning_rate": 1.0961019855618037e-08, + "loss": 0.1662, + "step": 7290 + }, + { + "epoch": 1.9114058789814132, + "grad_norm": 6.758336690737442, + "learning_rate": 1.0643430617719663e-08, + "loss": 0.1357, + "step": 7295 + }, + { + "epoch": 1.9127159584049784, + "grad_norm": 6.780661902438455, + "learning_rate": 1.0330485638623488e-08, + "loss": 0.178, + "step": 7300 + }, + { + "epoch": 1.9127159584049784, + "eval_accuracy": 0.7656, + "eval_loss": 1.1024446487426758, + "eval_runtime": 141.415, + "eval_samples_per_second": 8.839, + "eval_steps_per_second": 2.213, + "step": 7300 + }, + { + "epoch": 1.9140260378285432, + "grad_norm": 6.026966107553702, + "learning_rate": 1.0022186387362742e-08, + "loss": 0.1445, + "step": 7305 + }, + { + "epoch": 1.9153361172521084, + "grad_norm": 6.277429872371872, + "learning_rate": 9.718534311161985e-09, + "loss": 0.1679, + "step": 7310 + }, + { + "epoch": 1.9166461966756736, + "grad_norm": 4.119214806989816, + "learning_rate": 9.419530835431676e-09, + "loss": 0.1928, + "step": 7315 + }, + { + "epoch": 1.9179562760992384, + "grad_norm": 5.6153080280283385, + "learning_rate": 9.125177363759951e-09, + "loss": 0.1118, + "step": 7320 + }, + { + "epoch": 1.9192663555228036, + "grad_norm": 4.7666055487436525, + "learning_rate": 8.835475277907622e-09, + "loss": 0.1643, + "step": 7325 + }, + { + "epoch": 1.9205764349463688, + "grad_norm": 8.192595195938912, + "learning_rate": 8.550425937800088e-09, + "loss": 0.1507, + "step": 7330 + }, + { + "epoch": 1.9218865143699335, + "grad_norm": 6.765512918112135, + "learning_rate": 8.270030681522099e-09, + "loss": 0.1295, + "step": 7335 + }, + { + "epoch": 1.9231965937934987, + "grad_norm": 3.56638369560944, + "learning_rate": 7.994290825311333e-09, + "loss": 0.1031, + "step": 7340 + }, + { + "epoch": 1.924506673217064, + "grad_norm": 7.699305575968428, + "learning_rate": 7.72320766355139e-09, + "loss": 0.1526, + "step": 7345 + }, + { + "epoch": 1.9258167526406287, + "grad_norm": 4.8819269994799654, + "learning_rate": 7.45678246876702e-09, + "loss": 0.1646, + "step": 7350 + }, + { + "epoch": 1.9271268320641939, + "grad_norm": 3.9964247606777357, + "learning_rate": 7.19501649161669e-09, + "loss": 0.1028, + "step": 7355 + }, + { + "epoch": 1.928436911487759, + "grad_norm": 7.878684128224621, + "learning_rate": 6.937910960888138e-09, + "loss": 0.1542, + "step": 7360 + }, + { + "epoch": 1.9297469909113238, + "grad_norm": 9.167984693047696, + "learning_rate": 6.685467083491492e-09, + "loss": 0.1468, + "step": 7365 + }, + { + "epoch": 1.931057070334889, + "grad_norm": 2.1053337480076344, + "learning_rate": 6.437686044454382e-09, + "loss": 0.153, + "step": 7370 + }, + { + "epoch": 1.9323671497584543, + "grad_norm": 6.447874086726777, + "learning_rate": 6.194569006915729e-09, + "loss": 0.1358, + "step": 7375 + }, + { + "epoch": 1.933677229182019, + "grad_norm": 10.70751883403138, + "learning_rate": 5.95611711212074e-09, + "loss": 0.2028, + "step": 7380 + }, + { + "epoch": 1.9349873086055842, + "grad_norm": 10.40080019721987, + "learning_rate": 5.722331479415476e-09, + "loss": 0.1971, + "step": 7385 + }, + { + "epoch": 1.9362973880291494, + "grad_norm": 6.470624613494294, + "learning_rate": 5.4932132062414095e-09, + "loss": 0.148, + "step": 7390 + }, + { + "epoch": 1.9376074674527142, + "grad_norm": 6.406990396350547, + "learning_rate": 5.268763368130425e-09, + "loss": 0.1788, + "step": 7395 + }, + { + "epoch": 1.9389175468762794, + "grad_norm": 2.770021538790025, + "learning_rate": 5.048983018699826e-09, + "loss": 0.1198, + "step": 7400 + }, + { + "epoch": 1.9389175468762794, + "eval_accuracy": 0.7632, + "eval_loss": 1.106671929359436, + "eval_runtime": 143.3333, + "eval_samples_per_second": 8.721, + "eval_steps_per_second": 2.184, + "step": 7400 + }, + { + "epoch": 1.9402276262998446, + "grad_norm": 7.932600638186708, + "learning_rate": 4.8338731896472305e-09, + "loss": 0.0954, + "step": 7405 + }, + { + "epoch": 1.9415377057234093, + "grad_norm": 4.158505987091941, + "learning_rate": 4.623434890745792e-09, + "loss": 0.1482, + "step": 7410 + }, + { + "epoch": 1.9428477851469745, + "grad_norm": 4.125686999210389, + "learning_rate": 4.417669109839539e-09, + "loss": 0.1672, + "step": 7415 + }, + { + "epoch": 1.9441578645705397, + "grad_norm": 9.401687509625868, + "learning_rate": 4.2165768128384905e-09, + "loss": 0.2056, + "step": 7420 + }, + { + "epoch": 1.9454679439941045, + "grad_norm": 6.656706387472742, + "learning_rate": 4.020158943714436e-09, + "loss": 0.1292, + "step": 7425 + }, + { + "epoch": 1.9467780234176697, + "grad_norm": 5.612244185690268, + "learning_rate": 3.828416424496383e-09, + "loss": 0.1141, + "step": 7430 + }, + { + "epoch": 1.948088102841235, + "grad_norm": 8.832257438742863, + "learning_rate": 3.641350155266232e-09, + "loss": 0.2152, + "step": 7435 + }, + { + "epoch": 1.9493981822647997, + "grad_norm": 9.016570066506297, + "learning_rate": 3.458961014154327e-09, + "loss": 0.1548, + "step": 7440 + }, + { + "epoch": 1.9507082616883649, + "grad_norm": 5.852774450303297, + "learning_rate": 3.2812498573359104e-09, + "loss": 0.1769, + "step": 7445 + }, + { + "epoch": 1.95201834111193, + "grad_norm": 3.8609337077050783, + "learning_rate": 3.108217519026235e-09, + "loss": 0.1429, + "step": 7450 + }, + { + "epoch": 1.9533284205354948, + "grad_norm": 3.881198457400227, + "learning_rate": 2.9398648114775658e-09, + "loss": 0.1024, + "step": 7455 + }, + { + "epoch": 1.95463849995906, + "grad_norm": 4.8164655468300115, + "learning_rate": 2.776192524974741e-09, + "loss": 0.138, + "step": 7460 + }, + { + "epoch": 1.9559485793826252, + "grad_norm": 6.545137926620753, + "learning_rate": 2.617201427831728e-09, + "loss": 0.1693, + "step": 7465 + }, + { + "epoch": 1.95725865880619, + "grad_norm": 4.068942827553475, + "learning_rate": 2.4628922663879615e-09, + "loss": 0.1181, + "step": 7470 + }, + { + "epoch": 1.9585687382297552, + "grad_norm": 3.941067732119495, + "learning_rate": 2.3132657650047905e-09, + "loss": 0.1674, + "step": 7475 + }, + { + "epoch": 1.9598788176533204, + "grad_norm": 4.838837050335417, + "learning_rate": 2.168322626062147e-09, + "loss": 0.1547, + "step": 7480 + }, + { + "epoch": 1.9611888970768852, + "grad_norm": 7.190077481755126, + "learning_rate": 2.0280635299551043e-09, + "loss": 0.1601, + "step": 7485 + }, + { + "epoch": 1.9624989765004504, + "grad_norm": 6.231947930455576, + "learning_rate": 1.8924891350911023e-09, + "loss": 0.1571, + "step": 7490 + }, + { + "epoch": 1.9638090559240156, + "grad_norm": 3.2883780328164156, + "learning_rate": 1.7616000778863938e-09, + "loss": 0.1346, + "step": 7495 + }, + { + "epoch": 1.9651191353475803, + "grad_norm": 2.251817701553141, + "learning_rate": 1.6353969727629368e-09, + "loss": 0.1483, + "step": 7500 + }, + { + "epoch": 1.9651191353475803, + "eval_accuracy": 0.764, + "eval_loss": 1.1052285432815552, + "eval_runtime": 140.9937, + "eval_samples_per_second": 8.866, + "eval_steps_per_second": 2.22, + "step": 7500 + }, + { + "epoch": 1.9664292147711455, + "grad_norm": 6.416248872950269, + "learning_rate": 1.5138804121462844e-09, + "loss": 0.1381, + "step": 7505 + }, + { + "epoch": 1.9677392941947107, + "grad_norm": 6.998254537894883, + "learning_rate": 1.3970509664620323e-09, + "loss": 0.143, + "step": 7510 + }, + { + "epoch": 1.9690493736182755, + "grad_norm": 8.413132217200232, + "learning_rate": 1.284909184133487e-09, + "loss": 0.1645, + "step": 7515 + }, + { + "epoch": 1.9703594530418407, + "grad_norm": 5.34533852883374, + "learning_rate": 1.1774555915787799e-09, + "loss": 0.1896, + "step": 7520 + }, + { + "epoch": 1.9716695324654057, + "grad_norm": 3.1253164448556814, + "learning_rate": 1.0746906932092016e-09, + "loss": 0.17, + "step": 7525 + }, + { + "epoch": 1.9729796118889706, + "grad_norm": 6.051727298230588, + "learning_rate": 9.7661497142576e-10, + "loss": 0.1539, + "step": 7530 + }, + { + "epoch": 1.9742896913125358, + "grad_norm": 9.970322292501223, + "learning_rate": 8.832288866175152e-10, + "loss": 0.1438, + "step": 7535 + }, + { + "epoch": 1.9755997707361008, + "grad_norm": 4.902999720057873, + "learning_rate": 7.945328771596926e-10, + "loss": 0.1661, + "step": 7540 + }, + { + "epoch": 1.9769098501596658, + "grad_norm": 6.8168524634333325, + "learning_rate": 7.105273594107953e-10, + "loss": 0.1571, + "step": 7545 + }, + { + "epoch": 1.978219929583231, + "grad_norm": 11.517594529415646, + "learning_rate": 6.312127277113833e-10, + "loss": 0.2014, + "step": 7550 + }, + { + "epoch": 1.979530009006796, + "grad_norm": 5.3054234456984535, + "learning_rate": 5.565893543818534e-10, + "loss": 0.1121, + "step": 7555 + }, + { + "epoch": 1.980840088430361, + "grad_norm": 6.920016441538864, + "learning_rate": 4.866575897208846e-10, + "loss": 0.1289, + "step": 7560 + }, + { + "epoch": 1.9821501678539262, + "grad_norm": 5.089129025398095, + "learning_rate": 4.2141776200366184e-10, + "loss": 0.1832, + "step": 7565 + }, + { + "epoch": 1.9834602472774912, + "grad_norm": 13.206489418737272, + "learning_rate": 3.6087017748043235e-10, + "loss": 0.1799, + "step": 7570 + }, + { + "epoch": 1.9847703267010561, + "grad_norm": 7.895102786478579, + "learning_rate": 3.050151203749518e-10, + "loss": 0.1761, + "step": 7575 + }, + { + "epoch": 1.9860804061246213, + "grad_norm": 7.13408976553629, + "learning_rate": 2.538528528831518e-10, + "loss": 0.1308, + "step": 7580 + }, + { + "epoch": 1.9873904855481863, + "grad_norm": 5.735776530531999, + "learning_rate": 2.0738361517214087e-10, + "loss": 0.1536, + "step": 7585 + }, + { + "epoch": 1.9887005649717513, + "grad_norm": 6.373237190830313, + "learning_rate": 1.656076253786498e-10, + "loss": 0.1635, + "step": 7590 + }, + { + "epoch": 1.9900106443953165, + "grad_norm": 6.685342113001509, + "learning_rate": 1.2852507960858793e-10, + "loss": 0.124, + "step": 7595 + }, + { + "epoch": 1.9913207238188815, + "grad_norm": 5.437567848160449, + "learning_rate": 9.613615193548863e-11, + "loss": 0.1298, + "step": 7600 + }, + { + "epoch": 1.9913207238188815, + "eval_accuracy": 0.76, + "eval_loss": 1.105454921722412, + "eval_runtime": 140.9417, + "eval_samples_per_second": 8.869, + "eval_steps_per_second": 2.221, + "step": 7600 + }, + { + "epoch": 1.9926308032424465, + "grad_norm": 6.920328620350111, + "learning_rate": 6.84409944003983e-11, + "loss": 0.1196, + "step": 7605 + }, + { + "epoch": 1.9939408826660117, + "grad_norm": 8.1447085166869, + "learning_rate": 4.543973701021109e-11, + "loss": 0.1285, + "step": 7610 + }, + { + "epoch": 1.9952509620895766, + "grad_norm": 3.617450574006755, + "learning_rate": 2.7132487738223964e-11, + "loss": 0.1182, + "step": 7615 + }, + { + "epoch": 1.9965610415131416, + "grad_norm": 5.8602211913393445, + "learning_rate": 1.3519332522471393e-11, + "loss": 0.1203, + "step": 7620 + }, + { + "epoch": 1.9978711209367068, + "grad_norm": 7.209657643570803, + "learning_rate": 4.6003352661694304e-12, + "loss": 0.1478, + "step": 7625 + }, + { + "epoch": 1.9991812003602718, + "grad_norm": 4.426426765312953, + "learning_rate": 3.7553783716059993e-13, + "loss": 0.1669, + "step": 7630 + } + ], + "logging_steps": 5, + "max_steps": 7632, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}