diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21978 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 31359, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003188877196339169, + "grad_norm": 3.036648750305176, + "learning_rate": 3.1888771963391685e-05, + "loss": 9.533, + "step": 10 + }, + { + "epoch": 0.0006377754392678338, + "grad_norm": 1.8982125520706177, + "learning_rate": 6.377754392678337e-05, + "loss": 8.4311, + "step": 20 + }, + { + "epoch": 0.0009566631589017507, + "grad_norm": 1.0077489614486694, + "learning_rate": 9.566631589017505e-05, + "loss": 7.5347, + "step": 30 + }, + { + "epoch": 0.0012755508785356677, + "grad_norm": 0.7245733737945557, + "learning_rate": 0.00012755508785356674, + "loss": 6.8032, + "step": 40 + }, + { + "epoch": 0.0015944385981695845, + "grad_norm": 0.6618061065673828, + "learning_rate": 0.00015944385981695843, + "loss": 6.3497, + "step": 50 + }, + { + "epoch": 0.0019133263178035013, + "grad_norm": 0.9375441074371338, + "learning_rate": 0.0001913326317803501, + "loss": 6.1371, + "step": 60 + }, + { + "epoch": 0.0022322140374374183, + "grad_norm": 0.8749001622200012, + "learning_rate": 0.00022322140374374182, + "loss": 5.9688, + "step": 70 + }, + { + "epoch": 0.0025511017570713354, + "grad_norm": 0.7610182762145996, + "learning_rate": 0.0002551101757071335, + "loss": 5.802, + "step": 80 + }, + { + "epoch": 0.002869989476705252, + "grad_norm": 0.6044188737869263, + "learning_rate": 0.0002869989476705252, + "loss": 5.6593, + "step": 90 + }, + { + "epoch": 0.003188877196339169, + "grad_norm": 0.8063629865646362, + "learning_rate": 0.00031888771963391687, + "loss": 5.5041, + "step": 100 + }, + { + "epoch": 0.003507764915973086, + "grad_norm": 0.776926577091217, + "learning_rate": 0.00035077649159730856, + "loss": 5.3573, + "step": 110 + }, + { + "epoch": 0.0038266526356070026, + "grad_norm": 0.7012314200401306, + "learning_rate": 0.0003826652635607002, + "loss": 5.1943, + "step": 120 + }, + { + "epoch": 0.00414554035524092, + "grad_norm": 0.8802277445793152, + "learning_rate": 0.00041455403552409194, + "loss": 5.0659, + "step": 130 + }, + { + "epoch": 0.004464428074874837, + "grad_norm": 1.1483536958694458, + "learning_rate": 0.00044644280748748364, + "loss": 4.8957, + "step": 140 + }, + { + "epoch": 0.004783315794508754, + "grad_norm": 0.7169669270515442, + "learning_rate": 0.0004783315794508753, + "loss": 4.7524, + "step": 150 + }, + { + "epoch": 0.005102203514142671, + "grad_norm": 0.6984596848487854, + "learning_rate": 0.000510220351414267, + "loss": 4.6166, + "step": 160 + }, + { + "epoch": 0.005421091233776587, + "grad_norm": 0.8619917035102844, + "learning_rate": 0.0005421091233776587, + "loss": 4.5516, + "step": 170 + }, + { + "epoch": 0.005739978953410504, + "grad_norm": 0.9251404404640198, + "learning_rate": 0.0005739978953410504, + "loss": 4.4415, + "step": 180 + }, + { + "epoch": 0.006058866673044421, + "grad_norm": 0.9755675792694092, + "learning_rate": 0.0006058866673044421, + "loss": 4.3531, + "step": 190 + }, + { + "epoch": 0.006377754392678338, + "grad_norm": 0.7827635407447815, + "learning_rate": 0.0006377754392678337, + "loss": 4.2329, + "step": 200 + }, + { + "epoch": 0.006696642112312255, + "grad_norm": 1.1084445714950562, + "learning_rate": 0.0006696642112312254, + "loss": 4.165, + "step": 210 + }, + { + "epoch": 0.007015529831946172, + "grad_norm": 0.7396318912506104, + "learning_rate": 0.0007015529831946171, + "loss": 4.0607, + "step": 220 + }, + { + "epoch": 0.007334417551580088, + "grad_norm": 0.8335429430007935, + "learning_rate": 0.0007334417551580089, + "loss": 3.9912, + "step": 230 + }, + { + "epoch": 0.007653305271214005, + "grad_norm": 0.7607409954071045, + "learning_rate": 0.0007653305271214004, + "loss": 3.9232, + "step": 240 + }, + { + "epoch": 0.007972192990847922, + "grad_norm": 0.7046091556549072, + "learning_rate": 0.0007972192990847921, + "loss": 3.8883, + "step": 250 + }, + { + "epoch": 0.00829108071048184, + "grad_norm": 0.858257532119751, + "learning_rate": 0.0008291080710481839, + "loss": 3.8052, + "step": 260 + }, + { + "epoch": 0.008609968430115756, + "grad_norm": 0.7207130789756775, + "learning_rate": 0.0008609968430115756, + "loss": 3.7916, + "step": 270 + }, + { + "epoch": 0.008928856149749673, + "grad_norm": 0.8144423961639404, + "learning_rate": 0.0008928856149749673, + "loss": 3.7006, + "step": 280 + }, + { + "epoch": 0.00924774386938359, + "grad_norm": 0.824480414390564, + "learning_rate": 0.0009247743869383588, + "loss": 3.6646, + "step": 290 + }, + { + "epoch": 0.009566631589017507, + "grad_norm": 0.8660809397697449, + "learning_rate": 0.0009566631589017505, + "loss": 3.6161, + "step": 300 + }, + { + "epoch": 0.009885519308651424, + "grad_norm": 0.7533829808235168, + "learning_rate": 0.0009885519308651422, + "loss": 3.5742, + "step": 310 + }, + { + "epoch": 0.010204407028285341, + "grad_norm": 0.7015078067779541, + "learning_rate": 0.001, + "loss": 3.5328, + "step": 320 + }, + { + "epoch": 0.010523294747919257, + "grad_norm": 0.7448273301124573, + "learning_rate": 0.001, + "loss": 3.4694, + "step": 330 + }, + { + "epoch": 0.010842182467553174, + "grad_norm": 0.7898711562156677, + "learning_rate": 0.001, + "loss": 3.4442, + "step": 340 + }, + { + "epoch": 0.01116107018718709, + "grad_norm": 0.9917442202568054, + "learning_rate": 0.001, + "loss": 3.3712, + "step": 350 + }, + { + "epoch": 0.011479957906821008, + "grad_norm": 0.8575316071510315, + "learning_rate": 0.001, + "loss": 3.3555, + "step": 360 + }, + { + "epoch": 0.011798845626454925, + "grad_norm": 0.7570849061012268, + "learning_rate": 0.001, + "loss": 3.2849, + "step": 370 + }, + { + "epoch": 0.012117733346088842, + "grad_norm": 0.7412615418434143, + "learning_rate": 0.001, + "loss": 3.2602, + "step": 380 + }, + { + "epoch": 0.012436621065722759, + "grad_norm": 0.6936795711517334, + "learning_rate": 0.001, + "loss": 3.2217, + "step": 390 + }, + { + "epoch": 0.012755508785356676, + "grad_norm": 0.8034941554069519, + "learning_rate": 0.001, + "loss": 3.1868, + "step": 400 + }, + { + "epoch": 0.013074396504990593, + "grad_norm": 0.7071384787559509, + "learning_rate": 0.001, + "loss": 3.1318, + "step": 410 + }, + { + "epoch": 0.01339328422462451, + "grad_norm": 0.7512100338935852, + "learning_rate": 0.001, + "loss": 3.1087, + "step": 420 + }, + { + "epoch": 0.013712171944258427, + "grad_norm": 0.8786042928695679, + "learning_rate": 0.001, + "loss": 3.0601, + "step": 430 + }, + { + "epoch": 0.014031059663892344, + "grad_norm": 0.7732911705970764, + "learning_rate": 0.001, + "loss": 3.0246, + "step": 440 + }, + { + "epoch": 0.014349947383526261, + "grad_norm": 0.8299092054367065, + "learning_rate": 0.001, + "loss": 2.9614, + "step": 450 + }, + { + "epoch": 0.014668835103160176, + "grad_norm": 0.6985538005828857, + "learning_rate": 0.001, + "loss": 2.9618, + "step": 460 + }, + { + "epoch": 0.014987722822794093, + "grad_norm": 0.8050011396408081, + "learning_rate": 0.001, + "loss": 2.9392, + "step": 470 + }, + { + "epoch": 0.01530661054242801, + "grad_norm": 0.7188740372657776, + "learning_rate": 0.001, + "loss": 2.908, + "step": 480 + }, + { + "epoch": 0.01562549826206193, + "grad_norm": 0.9172284603118896, + "learning_rate": 0.001, + "loss": 2.8388, + "step": 490 + }, + { + "epoch": 0.015944385981695845, + "grad_norm": 0.7237478494644165, + "learning_rate": 0.001, + "loss": 2.841, + "step": 500 + }, + { + "epoch": 0.016263273701329763, + "grad_norm": 0.8718260526657104, + "learning_rate": 0.001, + "loss": 2.7981, + "step": 510 + }, + { + "epoch": 0.01658216142096368, + "grad_norm": 0.7130722999572754, + "learning_rate": 0.001, + "loss": 2.7755, + "step": 520 + }, + { + "epoch": 0.016901049140597594, + "grad_norm": 0.6788746118545532, + "learning_rate": 0.001, + "loss": 2.7742, + "step": 530 + }, + { + "epoch": 0.017219936860231513, + "grad_norm": 0.7213516235351562, + "learning_rate": 0.001, + "loss": 2.7421, + "step": 540 + }, + { + "epoch": 0.017538824579865428, + "grad_norm": 0.7722727060317993, + "learning_rate": 0.001, + "loss": 2.6919, + "step": 550 + }, + { + "epoch": 0.017857712299499347, + "grad_norm": 0.8629282712936401, + "learning_rate": 0.001, + "loss": 2.6804, + "step": 560 + }, + { + "epoch": 0.018176600019133262, + "grad_norm": 0.7852650880813599, + "learning_rate": 0.001, + "loss": 2.6518, + "step": 570 + }, + { + "epoch": 0.01849548773876718, + "grad_norm": 0.6630326509475708, + "learning_rate": 0.001, + "loss": 2.6447, + "step": 580 + }, + { + "epoch": 0.018814375458401096, + "grad_norm": 0.6685476899147034, + "learning_rate": 0.001, + "loss": 2.599, + "step": 590 + }, + { + "epoch": 0.019133263178035015, + "grad_norm": 0.7120856642723083, + "learning_rate": 0.001, + "loss": 2.5967, + "step": 600 + }, + { + "epoch": 0.01945215089766893, + "grad_norm": 0.6978277564048767, + "learning_rate": 0.001, + "loss": 2.5731, + "step": 610 + }, + { + "epoch": 0.01977103861730285, + "grad_norm": 0.6799705624580383, + "learning_rate": 0.001, + "loss": 2.5141, + "step": 620 + }, + { + "epoch": 0.020089926336936764, + "grad_norm": 0.625628650188446, + "learning_rate": 0.001, + "loss": 2.5176, + "step": 630 + }, + { + "epoch": 0.020408814056570683, + "grad_norm": 0.6856659054756165, + "learning_rate": 0.001, + "loss": 2.5204, + "step": 640 + }, + { + "epoch": 0.0207277017762046, + "grad_norm": 0.6310821175575256, + "learning_rate": 0.001, + "loss": 2.4801, + "step": 650 + }, + { + "epoch": 0.021046589495838514, + "grad_norm": 0.6268860697746277, + "learning_rate": 0.001, + "loss": 2.4809, + "step": 660 + }, + { + "epoch": 0.021365477215472432, + "grad_norm": 0.6450563073158264, + "learning_rate": 0.001, + "loss": 2.4476, + "step": 670 + }, + { + "epoch": 0.021684364935106348, + "grad_norm": 0.650497317314148, + "learning_rate": 0.001, + "loss": 2.4374, + "step": 680 + }, + { + "epoch": 0.022003252654740266, + "grad_norm": 0.6631786227226257, + "learning_rate": 0.001, + "loss": 2.4252, + "step": 690 + }, + { + "epoch": 0.02232214037437418, + "grad_norm": 0.6338786482810974, + "learning_rate": 0.001, + "loss": 2.3957, + "step": 700 + }, + { + "epoch": 0.0226410280940081, + "grad_norm": 0.6474817991256714, + "learning_rate": 0.001, + "loss": 2.424, + "step": 710 + }, + { + "epoch": 0.022959915813642016, + "grad_norm": 0.6207141280174255, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 720 + }, + { + "epoch": 0.023278803533275935, + "grad_norm": 0.708824098110199, + "learning_rate": 0.001, + "loss": 2.3651, + "step": 730 + }, + { + "epoch": 0.02359769125290985, + "grad_norm": 0.6238510608673096, + "learning_rate": 0.001, + "loss": 2.3275, + "step": 740 + }, + { + "epoch": 0.02391657897254377, + "grad_norm": 0.6039829254150391, + "learning_rate": 0.001, + "loss": 2.3622, + "step": 750 + }, + { + "epoch": 0.024235466692177684, + "grad_norm": 0.6189019083976746, + "learning_rate": 0.001, + "loss": 2.3433, + "step": 760 + }, + { + "epoch": 0.024554354411811603, + "grad_norm": 0.5963674187660217, + "learning_rate": 0.001, + "loss": 2.3492, + "step": 770 + }, + { + "epoch": 0.024873242131445518, + "grad_norm": 0.6473168730735779, + "learning_rate": 0.001, + "loss": 2.2995, + "step": 780 + }, + { + "epoch": 0.025192129851079433, + "grad_norm": 0.590446412563324, + "learning_rate": 0.001, + "loss": 2.3136, + "step": 790 + }, + { + "epoch": 0.025511017570713352, + "grad_norm": 0.6440556049346924, + "learning_rate": 0.001, + "loss": 2.333, + "step": 800 + }, + { + "epoch": 0.025829905290347267, + "grad_norm": 0.5967952013015747, + "learning_rate": 0.001, + "loss": 2.2718, + "step": 810 + }, + { + "epoch": 0.026148793009981186, + "grad_norm": 0.6391497850418091, + "learning_rate": 0.001, + "loss": 2.3109, + "step": 820 + }, + { + "epoch": 0.0264676807296151, + "grad_norm": 0.5951425433158875, + "learning_rate": 0.001, + "loss": 2.2615, + "step": 830 + }, + { + "epoch": 0.02678656844924902, + "grad_norm": 0.7214036583900452, + "learning_rate": 0.001, + "loss": 2.2651, + "step": 840 + }, + { + "epoch": 0.027105456168882935, + "grad_norm": 0.5951161980628967, + "learning_rate": 0.001, + "loss": 2.2469, + "step": 850 + }, + { + "epoch": 0.027424343888516854, + "grad_norm": 0.643730103969574, + "learning_rate": 0.001, + "loss": 2.2456, + "step": 860 + }, + { + "epoch": 0.02774323160815077, + "grad_norm": 0.6100723147392273, + "learning_rate": 0.001, + "loss": 2.2127, + "step": 870 + }, + { + "epoch": 0.028062119327784688, + "grad_norm": 0.6163387894630432, + "learning_rate": 0.001, + "loss": 2.215, + "step": 880 + }, + { + "epoch": 0.028381007047418604, + "grad_norm": 0.6332610845565796, + "learning_rate": 0.001, + "loss": 2.1957, + "step": 890 + }, + { + "epoch": 0.028699894767052522, + "grad_norm": 0.7613614201545715, + "learning_rate": 0.001, + "loss": 2.2013, + "step": 900 + }, + { + "epoch": 0.029018782486686438, + "grad_norm": 0.6165406703948975, + "learning_rate": 0.001, + "loss": 2.1597, + "step": 910 + }, + { + "epoch": 0.029337670206320353, + "grad_norm": 0.5806939005851746, + "learning_rate": 0.001, + "loss": 2.1633, + "step": 920 + }, + { + "epoch": 0.02965655792595427, + "grad_norm": 0.5569039583206177, + "learning_rate": 0.001, + "loss": 2.1748, + "step": 930 + }, + { + "epoch": 0.029975445645588187, + "grad_norm": 0.5546286106109619, + "learning_rate": 0.001, + "loss": 2.1717, + "step": 940 + }, + { + "epoch": 0.030294333365222106, + "grad_norm": 0.5814953446388245, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 950 + }, + { + "epoch": 0.03061322108485602, + "grad_norm": 0.5773137807846069, + "learning_rate": 0.001, + "loss": 2.1499, + "step": 960 + }, + { + "epoch": 0.03093210880448994, + "grad_norm": 0.5882891416549683, + "learning_rate": 0.001, + "loss": 2.1352, + "step": 970 + }, + { + "epoch": 0.03125099652412386, + "grad_norm": 0.597629964351654, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 980 + }, + { + "epoch": 0.03156988424375777, + "grad_norm": 0.5862744450569153, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 990 + }, + { + "epoch": 0.03188877196339169, + "grad_norm": 0.5462473630905151, + "learning_rate": 0.001, + "loss": 2.115, + "step": 1000 + }, + { + "epoch": 0.03220765968302561, + "grad_norm": 0.5664170384407043, + "learning_rate": 0.001, + "loss": 2.114, + "step": 1010 + }, + { + "epoch": 0.03252654740265953, + "grad_norm": 0.6027910113334656, + "learning_rate": 0.001, + "loss": 2.1258, + "step": 1020 + }, + { + "epoch": 0.03284543512229344, + "grad_norm": 0.6072728633880615, + "learning_rate": 0.001, + "loss": 2.0963, + "step": 1030 + }, + { + "epoch": 0.03316432284192736, + "grad_norm": 0.5642842650413513, + "learning_rate": 0.001, + "loss": 2.0512, + "step": 1040 + }, + { + "epoch": 0.033483210561561276, + "grad_norm": 0.5518870949745178, + "learning_rate": 0.001, + "loss": 2.0815, + "step": 1050 + }, + { + "epoch": 0.03380209828119519, + "grad_norm": 0.5724993348121643, + "learning_rate": 0.001, + "loss": 2.0478, + "step": 1060 + }, + { + "epoch": 0.03412098600082911, + "grad_norm": 0.576343834400177, + "learning_rate": 0.001, + "loss": 2.0811, + "step": 1070 + }, + { + "epoch": 0.034439873720463025, + "grad_norm": 0.5714786052703857, + "learning_rate": 0.001, + "loss": 2.0707, + "step": 1080 + }, + { + "epoch": 0.034758761440096944, + "grad_norm": 0.5584716796875, + "learning_rate": 0.001, + "loss": 2.0623, + "step": 1090 + }, + { + "epoch": 0.035077649159730856, + "grad_norm": 0.6107367277145386, + "learning_rate": 0.001, + "loss": 2.0665, + "step": 1100 + }, + { + "epoch": 0.035396536879364775, + "grad_norm": 0.5658422112464905, + "learning_rate": 0.001, + "loss": 2.0287, + "step": 1110 + }, + { + "epoch": 0.035715424598998693, + "grad_norm": 0.6898009777069092, + "learning_rate": 0.001, + "loss": 2.0313, + "step": 1120 + }, + { + "epoch": 0.03603431231863261, + "grad_norm": 0.5509852766990662, + "learning_rate": 0.001, + "loss": 2.0321, + "step": 1130 + }, + { + "epoch": 0.036353200038266524, + "grad_norm": 0.5971260070800781, + "learning_rate": 0.001, + "loss": 2.049, + "step": 1140 + }, + { + "epoch": 0.03667208775790044, + "grad_norm": 0.5642650127410889, + "learning_rate": 0.001, + "loss": 2.0325, + "step": 1150 + }, + { + "epoch": 0.03699097547753436, + "grad_norm": 0.5502352118492126, + "learning_rate": 0.001, + "loss": 2.0189, + "step": 1160 + }, + { + "epoch": 0.03730986319716828, + "grad_norm": 0.5586209297180176, + "learning_rate": 0.001, + "loss": 1.9837, + "step": 1170 + }, + { + "epoch": 0.03762875091680219, + "grad_norm": 0.5707424879074097, + "learning_rate": 0.001, + "loss": 1.9626, + "step": 1180 + }, + { + "epoch": 0.03794763863643611, + "grad_norm": 0.513467013835907, + "learning_rate": 0.001, + "loss": 1.9747, + "step": 1190 + }, + { + "epoch": 0.03826652635607003, + "grad_norm": 0.5255717039108276, + "learning_rate": 0.001, + "loss": 1.9826, + "step": 1200 + }, + { + "epoch": 0.03858541407570394, + "grad_norm": 0.5995370745658875, + "learning_rate": 0.001, + "loss": 1.9606, + "step": 1210 + }, + { + "epoch": 0.03890430179533786, + "grad_norm": 0.5771000385284424, + "learning_rate": 0.001, + "loss": 1.9829, + "step": 1220 + }, + { + "epoch": 0.03922318951497178, + "grad_norm": 0.5424252152442932, + "learning_rate": 0.001, + "loss": 1.9634, + "step": 1230 + }, + { + "epoch": 0.0395420772346057, + "grad_norm": 0.6467852592468262, + "learning_rate": 0.001, + "loss": 1.975, + "step": 1240 + }, + { + "epoch": 0.03986096495423961, + "grad_norm": 0.5047982931137085, + "learning_rate": 0.001, + "loss": 1.9443, + "step": 1250 + }, + { + "epoch": 0.04017985267387353, + "grad_norm": 0.5370773077011108, + "learning_rate": 0.001, + "loss": 1.9455, + "step": 1260 + }, + { + "epoch": 0.04049874039350745, + "grad_norm": 0.5400688648223877, + "learning_rate": 0.001, + "loss": 1.9559, + "step": 1270 + }, + { + "epoch": 0.040817628113141366, + "grad_norm": 0.49548593163490295, + "learning_rate": 0.001, + "loss": 1.9353, + "step": 1280 + }, + { + "epoch": 0.04113651583277528, + "grad_norm": 0.5320776104927063, + "learning_rate": 0.001, + "loss": 1.9386, + "step": 1290 + }, + { + "epoch": 0.0414554035524092, + "grad_norm": 0.516173243522644, + "learning_rate": 0.001, + "loss": 1.9209, + "step": 1300 + }, + { + "epoch": 0.041774291272043115, + "grad_norm": 0.5142052173614502, + "learning_rate": 0.001, + "loss": 1.9009, + "step": 1310 + }, + { + "epoch": 0.04209317899167703, + "grad_norm": 0.5320322513580322, + "learning_rate": 0.001, + "loss": 1.94, + "step": 1320 + }, + { + "epoch": 0.042412066711310946, + "grad_norm": 0.5104788541793823, + "learning_rate": 0.001, + "loss": 1.9197, + "step": 1330 + }, + { + "epoch": 0.042730954430944865, + "grad_norm": 0.5254496932029724, + "learning_rate": 0.001, + "loss": 1.9007, + "step": 1340 + }, + { + "epoch": 0.04304984215057878, + "grad_norm": 0.5628265738487244, + "learning_rate": 0.001, + "loss": 1.9229, + "step": 1350 + }, + { + "epoch": 0.043368729870212695, + "grad_norm": 0.5209566950798035, + "learning_rate": 0.001, + "loss": 1.9035, + "step": 1360 + }, + { + "epoch": 0.043687617589846614, + "grad_norm": 0.5213086605072021, + "learning_rate": 0.001, + "loss": 1.9135, + "step": 1370 + }, + { + "epoch": 0.04400650530948053, + "grad_norm": 0.5099833011627197, + "learning_rate": 0.001, + "loss": 1.9162, + "step": 1380 + }, + { + "epoch": 0.04432539302911445, + "grad_norm": 0.5214992761611938, + "learning_rate": 0.001, + "loss": 1.8639, + "step": 1390 + }, + { + "epoch": 0.04464428074874836, + "grad_norm": 0.53340744972229, + "learning_rate": 0.001, + "loss": 1.8961, + "step": 1400 + }, + { + "epoch": 0.04496316846838228, + "grad_norm": 0.4767109453678131, + "learning_rate": 0.001, + "loss": 1.8945, + "step": 1410 + }, + { + "epoch": 0.0452820561880162, + "grad_norm": 0.5631620287895203, + "learning_rate": 0.001, + "loss": 1.8566, + "step": 1420 + }, + { + "epoch": 0.04560094390765012, + "grad_norm": 0.5479603409767151, + "learning_rate": 0.001, + "loss": 1.8817, + "step": 1430 + }, + { + "epoch": 0.04591983162728403, + "grad_norm": 0.49137023091316223, + "learning_rate": 0.001, + "loss": 1.8552, + "step": 1440 + }, + { + "epoch": 0.04623871934691795, + "grad_norm": 0.5349081158638, + "learning_rate": 0.001, + "loss": 1.8952, + "step": 1450 + }, + { + "epoch": 0.04655760706655187, + "grad_norm": 0.4876307249069214, + "learning_rate": 0.001, + "loss": 1.8497, + "step": 1460 + }, + { + "epoch": 0.04687649478618578, + "grad_norm": 0.4965214729309082, + "learning_rate": 0.001, + "loss": 1.8491, + "step": 1470 + }, + { + "epoch": 0.0471953825058197, + "grad_norm": 0.5440576672554016, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 1480 + }, + { + "epoch": 0.04751427022545362, + "grad_norm": 0.48483118414878845, + "learning_rate": 0.001, + "loss": 1.8507, + "step": 1490 + }, + { + "epoch": 0.04783315794508754, + "grad_norm": 0.4601515233516693, + "learning_rate": 0.001, + "loss": 1.8341, + "step": 1500 + }, + { + "epoch": 0.04815204566472145, + "grad_norm": 0.5070135593414307, + "learning_rate": 0.001, + "loss": 1.8136, + "step": 1510 + }, + { + "epoch": 0.04847093338435537, + "grad_norm": 0.4945167899131775, + "learning_rate": 0.001, + "loss": 1.8423, + "step": 1520 + }, + { + "epoch": 0.048789821103989287, + "grad_norm": 0.4869058132171631, + "learning_rate": 0.001, + "loss": 1.8465, + "step": 1530 + }, + { + "epoch": 0.049108708823623205, + "grad_norm": 0.5155792832374573, + "learning_rate": 0.001, + "loss": 1.8085, + "step": 1540 + }, + { + "epoch": 0.04942759654325712, + "grad_norm": 0.48565489053726196, + "learning_rate": 0.001, + "loss": 1.8332, + "step": 1550 + }, + { + "epoch": 0.049746484262891036, + "grad_norm": 0.5483407974243164, + "learning_rate": 0.001, + "loss": 1.8151, + "step": 1560 + }, + { + "epoch": 0.050065371982524955, + "grad_norm": 0.473037451505661, + "learning_rate": 0.001, + "loss": 1.8127, + "step": 1570 + }, + { + "epoch": 0.050384259702158866, + "grad_norm": 0.48309481143951416, + "learning_rate": 0.001, + "loss": 1.792, + "step": 1580 + }, + { + "epoch": 0.050703147421792785, + "grad_norm": 0.4604003131389618, + "learning_rate": 0.001, + "loss": 1.7976, + "step": 1590 + }, + { + "epoch": 0.051022035141426704, + "grad_norm": 0.5088821649551392, + "learning_rate": 0.001, + "loss": 1.7901, + "step": 1600 + }, + { + "epoch": 0.05134092286106062, + "grad_norm": 0.46633222699165344, + "learning_rate": 0.001, + "loss": 1.7978, + "step": 1610 + }, + { + "epoch": 0.051659810580694535, + "grad_norm": 0.48440852761268616, + "learning_rate": 0.001, + "loss": 1.8161, + "step": 1620 + }, + { + "epoch": 0.05197869830032845, + "grad_norm": 0.46742019057273865, + "learning_rate": 0.001, + "loss": 1.7755, + "step": 1630 + }, + { + "epoch": 0.05229758601996237, + "grad_norm": 0.4699628949165344, + "learning_rate": 0.001, + "loss": 1.7872, + "step": 1640 + }, + { + "epoch": 0.05261647373959629, + "grad_norm": 0.46953830122947693, + "learning_rate": 0.001, + "loss": 1.8104, + "step": 1650 + }, + { + "epoch": 0.0529353614592302, + "grad_norm": 0.4661422073841095, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 1660 + }, + { + "epoch": 0.05325424917886412, + "grad_norm": 0.4410492479801178, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 1670 + }, + { + "epoch": 0.05357313689849804, + "grad_norm": 0.4911010265350342, + "learning_rate": 0.001, + "loss": 1.7439, + "step": 1680 + }, + { + "epoch": 0.05389202461813196, + "grad_norm": 0.48298680782318115, + "learning_rate": 0.001, + "loss": 1.7601, + "step": 1690 + }, + { + "epoch": 0.05421091233776587, + "grad_norm": 0.482377827167511, + "learning_rate": 0.001, + "loss": 1.769, + "step": 1700 + }, + { + "epoch": 0.05452980005739979, + "grad_norm": 0.45547738671302795, + "learning_rate": 0.001, + "loss": 1.7547, + "step": 1710 + }, + { + "epoch": 0.05484868777703371, + "grad_norm": 0.4433269798755646, + "learning_rate": 0.001, + "loss": 1.7642, + "step": 1720 + }, + { + "epoch": 0.05516757549666762, + "grad_norm": 0.4870554804801941, + "learning_rate": 0.001, + "loss": 1.756, + "step": 1730 + }, + { + "epoch": 0.05548646321630154, + "grad_norm": 0.5187873840332031, + "learning_rate": 0.001, + "loss": 1.7672, + "step": 1740 + }, + { + "epoch": 0.05580535093593546, + "grad_norm": 0.4401336908340454, + "learning_rate": 0.001, + "loss": 1.7468, + "step": 1750 + }, + { + "epoch": 0.056124238655569376, + "grad_norm": 0.45578229427337646, + "learning_rate": 0.001, + "loss": 1.7545, + "step": 1760 + }, + { + "epoch": 0.05644312637520329, + "grad_norm": 0.46454453468322754, + "learning_rate": 0.001, + "loss": 1.7371, + "step": 1770 + }, + { + "epoch": 0.05676201409483721, + "grad_norm": 0.4536410868167877, + "learning_rate": 0.001, + "loss": 1.7634, + "step": 1780 + }, + { + "epoch": 0.057080901814471126, + "grad_norm": 0.4952872395515442, + "learning_rate": 0.001, + "loss": 1.7373, + "step": 1790 + }, + { + "epoch": 0.057399789534105045, + "grad_norm": 0.47220513224601746, + "learning_rate": 0.001, + "loss": 1.7348, + "step": 1800 + }, + { + "epoch": 0.057718677253738956, + "grad_norm": 0.44693368673324585, + "learning_rate": 0.001, + "loss": 1.7569, + "step": 1810 + }, + { + "epoch": 0.058037564973372875, + "grad_norm": 0.4495336711406708, + "learning_rate": 0.001, + "loss": 1.7362, + "step": 1820 + }, + { + "epoch": 0.058356452693006794, + "grad_norm": 0.44833892583847046, + "learning_rate": 0.001, + "loss": 1.7259, + "step": 1830 + }, + { + "epoch": 0.058675340412640706, + "grad_norm": 0.44210535287857056, + "learning_rate": 0.001, + "loss": 1.7251, + "step": 1840 + }, + { + "epoch": 0.058994228132274625, + "grad_norm": 0.45084482431411743, + "learning_rate": 0.001, + "loss": 1.7018, + "step": 1850 + }, + { + "epoch": 0.05931311585190854, + "grad_norm": 0.45676374435424805, + "learning_rate": 0.001, + "loss": 1.7216, + "step": 1860 + }, + { + "epoch": 0.05963200357154246, + "grad_norm": 0.45795685052871704, + "learning_rate": 0.001, + "loss": 1.723, + "step": 1870 + }, + { + "epoch": 0.059950891291176374, + "grad_norm": 0.42781588435173035, + "learning_rate": 0.001, + "loss": 1.7298, + "step": 1880 + }, + { + "epoch": 0.06026977901081029, + "grad_norm": 0.44484594464302063, + "learning_rate": 0.001, + "loss": 1.7131, + "step": 1890 + }, + { + "epoch": 0.06058866673044421, + "grad_norm": 0.49411898851394653, + "learning_rate": 0.001, + "loss": 1.7175, + "step": 1900 + }, + { + "epoch": 0.06090755445007813, + "grad_norm": 0.43948128819465637, + "learning_rate": 0.001, + "loss": 1.709, + "step": 1910 + }, + { + "epoch": 0.06122644216971204, + "grad_norm": 0.44391316175460815, + "learning_rate": 0.001, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 0.06154532988934596, + "grad_norm": 0.4589037001132965, + "learning_rate": 0.001, + "loss": 1.7213, + "step": 1930 + }, + { + "epoch": 0.06186421760897988, + "grad_norm": 0.41747188568115234, + "learning_rate": 0.001, + "loss": 1.6885, + "step": 1940 + }, + { + "epoch": 0.0621831053286138, + "grad_norm": 0.41558709740638733, + "learning_rate": 0.001, + "loss": 1.6909, + "step": 1950 + }, + { + "epoch": 0.06250199304824772, + "grad_norm": 0.43651485443115234, + "learning_rate": 0.001, + "loss": 1.6924, + "step": 1960 + }, + { + "epoch": 0.06282088076788163, + "grad_norm": 0.44822901487350464, + "learning_rate": 0.001, + "loss": 1.7026, + "step": 1970 + }, + { + "epoch": 0.06313976848751554, + "grad_norm": 0.43817171454429626, + "learning_rate": 0.001, + "loss": 1.683, + "step": 1980 + }, + { + "epoch": 0.06345865620714947, + "grad_norm": 0.45964375138282776, + "learning_rate": 0.001, + "loss": 1.6755, + "step": 1990 + }, + { + "epoch": 0.06377754392678338, + "grad_norm": 0.4256100356578827, + "learning_rate": 0.001, + "loss": 1.6814, + "step": 2000 + }, + { + "epoch": 0.06409643164641729, + "grad_norm": 0.4580231010913849, + "learning_rate": 0.001, + "loss": 1.6761, + "step": 2010 + }, + { + "epoch": 0.06441531936605122, + "grad_norm": 0.43515661358833313, + "learning_rate": 0.001, + "loss": 1.6977, + "step": 2020 + }, + { + "epoch": 0.06473420708568513, + "grad_norm": 0.5032089352607727, + "learning_rate": 0.001, + "loss": 1.6764, + "step": 2030 + }, + { + "epoch": 0.06505309480531905, + "grad_norm": 0.45531144738197327, + "learning_rate": 0.001, + "loss": 1.6734, + "step": 2040 + }, + { + "epoch": 0.06537198252495297, + "grad_norm": 0.42781633138656616, + "learning_rate": 0.001, + "loss": 1.6501, + "step": 2050 + }, + { + "epoch": 0.06569087024458688, + "grad_norm": 0.4123460352420807, + "learning_rate": 0.001, + "loss": 1.6385, + "step": 2060 + }, + { + "epoch": 0.0660097579642208, + "grad_norm": 0.44210684299468994, + "learning_rate": 0.001, + "loss": 1.6733, + "step": 2070 + }, + { + "epoch": 0.06632864568385471, + "grad_norm": 0.4108719229698181, + "learning_rate": 0.001, + "loss": 1.658, + "step": 2080 + }, + { + "epoch": 0.06664753340348863, + "grad_norm": 0.4030722677707672, + "learning_rate": 0.001, + "loss": 1.6742, + "step": 2090 + }, + { + "epoch": 0.06696642112312255, + "grad_norm": 0.4500997066497803, + "learning_rate": 0.001, + "loss": 1.6714, + "step": 2100 + }, + { + "epoch": 0.06728530884275646, + "grad_norm": 0.4074576795101166, + "learning_rate": 0.001, + "loss": 1.6419, + "step": 2110 + }, + { + "epoch": 0.06760419656239038, + "grad_norm": 0.40393441915512085, + "learning_rate": 0.001, + "loss": 1.6462, + "step": 2120 + }, + { + "epoch": 0.0679230842820243, + "grad_norm": 0.4103441536426544, + "learning_rate": 0.001, + "loss": 1.6531, + "step": 2130 + }, + { + "epoch": 0.06824197200165821, + "grad_norm": 0.4321202337741852, + "learning_rate": 0.001, + "loss": 1.6399, + "step": 2140 + }, + { + "epoch": 0.06856085972129214, + "grad_norm": 0.41198042035102844, + "learning_rate": 0.001, + "loss": 1.6556, + "step": 2150 + }, + { + "epoch": 0.06887974744092605, + "grad_norm": 0.4039364457130432, + "learning_rate": 0.001, + "loss": 1.654, + "step": 2160 + }, + { + "epoch": 0.06919863516055996, + "grad_norm": 0.4295644462108612, + "learning_rate": 0.001, + "loss": 1.6432, + "step": 2170 + }, + { + "epoch": 0.06951752288019389, + "grad_norm": 0.4189327657222748, + "learning_rate": 0.001, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 0.0698364105998278, + "grad_norm": 0.4218682050704956, + "learning_rate": 0.001, + "loss": 1.6204, + "step": 2190 + }, + { + "epoch": 0.07015529831946171, + "grad_norm": 0.40319690108299255, + "learning_rate": 0.001, + "loss": 1.6381, + "step": 2200 + }, + { + "epoch": 0.07047418603909564, + "grad_norm": 0.40946927666664124, + "learning_rate": 0.001, + "loss": 1.6121, + "step": 2210 + }, + { + "epoch": 0.07079307375872955, + "grad_norm": 0.397491455078125, + "learning_rate": 0.001, + "loss": 1.6261, + "step": 2220 + }, + { + "epoch": 0.07111196147836348, + "grad_norm": 0.4268970191478729, + "learning_rate": 0.001, + "loss": 1.6144, + "step": 2230 + }, + { + "epoch": 0.07143084919799739, + "grad_norm": 0.46040934324264526, + "learning_rate": 0.001, + "loss": 1.6208, + "step": 2240 + }, + { + "epoch": 0.0717497369176313, + "grad_norm": 0.41573449969291687, + "learning_rate": 0.001, + "loss": 1.6021, + "step": 2250 + }, + { + "epoch": 0.07206862463726522, + "grad_norm": 0.40803709626197815, + "learning_rate": 0.001, + "loss": 1.5945, + "step": 2260 + }, + { + "epoch": 0.07238751235689914, + "grad_norm": 0.4061513841152191, + "learning_rate": 0.001, + "loss": 1.6189, + "step": 2270 + }, + { + "epoch": 0.07270640007653305, + "grad_norm": 0.40856054425239563, + "learning_rate": 0.001, + "loss": 1.6073, + "step": 2280 + }, + { + "epoch": 0.07302528779616697, + "grad_norm": 0.39824768900871277, + "learning_rate": 0.001, + "loss": 1.5878, + "step": 2290 + }, + { + "epoch": 0.07334417551580089, + "grad_norm": 0.4029414653778076, + "learning_rate": 0.001, + "loss": 1.5974, + "step": 2300 + }, + { + "epoch": 0.0736630632354348, + "grad_norm": 0.3928816318511963, + "learning_rate": 0.001, + "loss": 1.5884, + "step": 2310 + }, + { + "epoch": 0.07398195095506872, + "grad_norm": 0.403522253036499, + "learning_rate": 0.001, + "loss": 1.6077, + "step": 2320 + }, + { + "epoch": 0.07430083867470264, + "grad_norm": 0.39391088485717773, + "learning_rate": 0.001, + "loss": 1.6057, + "step": 2330 + }, + { + "epoch": 0.07461972639433656, + "grad_norm": 0.40311235189437866, + "learning_rate": 0.001, + "loss": 1.5865, + "step": 2340 + }, + { + "epoch": 0.07493861411397047, + "grad_norm": 0.4081204831600189, + "learning_rate": 0.001, + "loss": 1.6074, + "step": 2350 + }, + { + "epoch": 0.07525750183360438, + "grad_norm": 0.3867422044277191, + "learning_rate": 0.001, + "loss": 1.5975, + "step": 2360 + }, + { + "epoch": 0.07557638955323831, + "grad_norm": 0.3857383131980896, + "learning_rate": 0.001, + "loss": 1.5611, + "step": 2370 + }, + { + "epoch": 0.07589527727287222, + "grad_norm": 0.3840004801750183, + "learning_rate": 0.001, + "loss": 1.5612, + "step": 2380 + }, + { + "epoch": 0.07621416499250613, + "grad_norm": 0.42938390374183655, + "learning_rate": 0.001, + "loss": 1.6057, + "step": 2390 + }, + { + "epoch": 0.07653305271214006, + "grad_norm": 0.3979109227657318, + "learning_rate": 0.001, + "loss": 1.5863, + "step": 2400 + }, + { + "epoch": 0.07685194043177397, + "grad_norm": 0.38790082931518555, + "learning_rate": 0.001, + "loss": 1.5973, + "step": 2410 + }, + { + "epoch": 0.07717082815140788, + "grad_norm": 0.3939755856990814, + "learning_rate": 0.001, + "loss": 1.5764, + "step": 2420 + }, + { + "epoch": 0.07748971587104181, + "grad_norm": 0.3828144371509552, + "learning_rate": 0.001, + "loss": 1.5704, + "step": 2430 + }, + { + "epoch": 0.07780860359067572, + "grad_norm": 0.42103809118270874, + "learning_rate": 0.001, + "loss": 1.5667, + "step": 2440 + }, + { + "epoch": 0.07812749131030965, + "grad_norm": 0.43680015206336975, + "learning_rate": 0.001, + "loss": 1.5534, + "step": 2450 + }, + { + "epoch": 0.07844637902994356, + "grad_norm": 0.38762131333351135, + "learning_rate": 0.001, + "loss": 1.5724, + "step": 2460 + }, + { + "epoch": 0.07876526674957747, + "grad_norm": 0.38347816467285156, + "learning_rate": 0.001, + "loss": 1.5616, + "step": 2470 + }, + { + "epoch": 0.0790841544692114, + "grad_norm": 0.3741878271102905, + "learning_rate": 0.001, + "loss": 1.5503, + "step": 2480 + }, + { + "epoch": 0.07940304218884531, + "grad_norm": 0.36821624636650085, + "learning_rate": 0.001, + "loss": 1.5792, + "step": 2490 + }, + { + "epoch": 0.07972192990847922, + "grad_norm": 0.39044198393821716, + "learning_rate": 0.001, + "loss": 1.5657, + "step": 2500 + }, + { + "epoch": 0.08004081762811315, + "grad_norm": 0.3792630434036255, + "learning_rate": 0.001, + "loss": 1.5419, + "step": 2510 + }, + { + "epoch": 0.08035970534774706, + "grad_norm": 0.40021875500679016, + "learning_rate": 0.001, + "loss": 1.565, + "step": 2520 + }, + { + "epoch": 0.08067859306738097, + "grad_norm": 0.38357385993003845, + "learning_rate": 0.001, + "loss": 1.5643, + "step": 2530 + }, + { + "epoch": 0.0809974807870149, + "grad_norm": 0.3646472096443176, + "learning_rate": 0.001, + "loss": 1.5621, + "step": 2540 + }, + { + "epoch": 0.0813163685066488, + "grad_norm": 0.37614285945892334, + "learning_rate": 0.001, + "loss": 1.5516, + "step": 2550 + }, + { + "epoch": 0.08163525622628273, + "grad_norm": 0.37552952766418457, + "learning_rate": 0.001, + "loss": 1.5506, + "step": 2560 + }, + { + "epoch": 0.08195414394591664, + "grad_norm": 0.3884067237377167, + "learning_rate": 0.001, + "loss": 1.5232, + "step": 2570 + }, + { + "epoch": 0.08227303166555056, + "grad_norm": 0.393189936876297, + "learning_rate": 0.001, + "loss": 1.5588, + "step": 2580 + }, + { + "epoch": 0.08259191938518448, + "grad_norm": 0.3752188980579376, + "learning_rate": 0.001, + "loss": 1.5298, + "step": 2590 + }, + { + "epoch": 0.0829108071048184, + "grad_norm": 0.37879225611686707, + "learning_rate": 0.001, + "loss": 1.5462, + "step": 2600 + }, + { + "epoch": 0.0832296948244523, + "grad_norm": 0.37227118015289307, + "learning_rate": 0.001, + "loss": 1.5473, + "step": 2610 + }, + { + "epoch": 0.08354858254408623, + "grad_norm": 0.3972522020339966, + "learning_rate": 0.001, + "loss": 1.5261, + "step": 2620 + }, + { + "epoch": 0.08386747026372014, + "grad_norm": 0.37113243341445923, + "learning_rate": 0.001, + "loss": 1.5337, + "step": 2630 + }, + { + "epoch": 0.08418635798335405, + "grad_norm": 0.364286333322525, + "learning_rate": 0.001, + "loss": 1.5293, + "step": 2640 + }, + { + "epoch": 0.08450524570298798, + "grad_norm": 0.362486869096756, + "learning_rate": 0.001, + "loss": 1.5333, + "step": 2650 + }, + { + "epoch": 0.08482413342262189, + "grad_norm": 0.3880142271518707, + "learning_rate": 0.001, + "loss": 1.5459, + "step": 2660 + }, + { + "epoch": 0.08514302114225582, + "grad_norm": 0.3843633532524109, + "learning_rate": 0.001, + "loss": 1.5181, + "step": 2670 + }, + { + "epoch": 0.08546190886188973, + "grad_norm": 0.36711886525154114, + "learning_rate": 0.001, + "loss": 1.5501, + "step": 2680 + }, + { + "epoch": 0.08578079658152364, + "grad_norm": 0.3663885295391083, + "learning_rate": 0.001, + "loss": 1.5248, + "step": 2690 + }, + { + "epoch": 0.08609968430115757, + "grad_norm": 0.3654477298259735, + "learning_rate": 0.001, + "loss": 1.5316, + "step": 2700 + }, + { + "epoch": 0.08641857202079148, + "grad_norm": 0.39503544569015503, + "learning_rate": 0.001, + "loss": 1.5214, + "step": 2710 + }, + { + "epoch": 0.08673745974042539, + "grad_norm": 0.39475134015083313, + "learning_rate": 0.001, + "loss": 1.5385, + "step": 2720 + }, + { + "epoch": 0.08705634746005932, + "grad_norm": 0.3583775758743286, + "learning_rate": 0.001, + "loss": 1.539, + "step": 2730 + }, + { + "epoch": 0.08737523517969323, + "grad_norm": 0.3696213364601135, + "learning_rate": 0.001, + "loss": 1.5548, + "step": 2740 + }, + { + "epoch": 0.08769412289932715, + "grad_norm": 0.3889085352420807, + "learning_rate": 0.001, + "loss": 1.5136, + "step": 2750 + }, + { + "epoch": 0.08801301061896107, + "grad_norm": 0.3819277882575989, + "learning_rate": 0.001, + "loss": 1.5187, + "step": 2760 + }, + { + "epoch": 0.08833189833859498, + "grad_norm": 0.3591541647911072, + "learning_rate": 0.001, + "loss": 1.5146, + "step": 2770 + }, + { + "epoch": 0.0886507860582289, + "grad_norm": 0.37153083086013794, + "learning_rate": 0.001, + "loss": 1.539, + "step": 2780 + }, + { + "epoch": 0.08896967377786281, + "grad_norm": 0.3519269526004791, + "learning_rate": 0.001, + "loss": 1.4952, + "step": 2790 + }, + { + "epoch": 0.08928856149749673, + "grad_norm": 0.3622041344642639, + "learning_rate": 0.001, + "loss": 1.5245, + "step": 2800 + }, + { + "epoch": 0.08960744921713065, + "grad_norm": 0.3769047260284424, + "learning_rate": 0.001, + "loss": 1.4881, + "step": 2810 + }, + { + "epoch": 0.08992633693676456, + "grad_norm": 0.3731653392314911, + "learning_rate": 0.001, + "loss": 1.5169, + "step": 2820 + }, + { + "epoch": 0.09024522465639848, + "grad_norm": 0.36590540409088135, + "learning_rate": 0.001, + "loss": 1.511, + "step": 2830 + }, + { + "epoch": 0.0905641123760324, + "grad_norm": 0.35569992661476135, + "learning_rate": 0.001, + "loss": 1.5098, + "step": 2840 + }, + { + "epoch": 0.09088300009566631, + "grad_norm": 0.37215662002563477, + "learning_rate": 0.001, + "loss": 1.4903, + "step": 2850 + }, + { + "epoch": 0.09120188781530024, + "grad_norm": 0.35044750571250916, + "learning_rate": 0.001, + "loss": 1.5006, + "step": 2860 + }, + { + "epoch": 0.09152077553493415, + "grad_norm": 0.3568022847175598, + "learning_rate": 0.001, + "loss": 1.4942, + "step": 2870 + }, + { + "epoch": 0.09183966325456806, + "grad_norm": 0.3533008098602295, + "learning_rate": 0.001, + "loss": 1.5072, + "step": 2880 + }, + { + "epoch": 0.09215855097420199, + "grad_norm": 0.3491477370262146, + "learning_rate": 0.001, + "loss": 1.4951, + "step": 2890 + }, + { + "epoch": 0.0924774386938359, + "grad_norm": 0.3570086658000946, + "learning_rate": 0.001, + "loss": 1.4967, + "step": 2900 + }, + { + "epoch": 0.09279632641346981, + "grad_norm": 0.3539246916770935, + "learning_rate": 0.001, + "loss": 1.503, + "step": 2910 + }, + { + "epoch": 0.09311521413310374, + "grad_norm": 0.37031111121177673, + "learning_rate": 0.001, + "loss": 1.5041, + "step": 2920 + }, + { + "epoch": 0.09343410185273765, + "grad_norm": 0.34995830059051514, + "learning_rate": 0.001, + "loss": 1.5063, + "step": 2930 + }, + { + "epoch": 0.09375298957237156, + "grad_norm": 0.36037734150886536, + "learning_rate": 0.001, + "loss": 1.5144, + "step": 2940 + }, + { + "epoch": 0.09407187729200549, + "grad_norm": 0.382372111082077, + "learning_rate": 0.001, + "loss": 1.4877, + "step": 2950 + }, + { + "epoch": 0.0943907650116394, + "grad_norm": 0.3281106650829315, + "learning_rate": 0.001, + "loss": 1.4862, + "step": 2960 + }, + { + "epoch": 0.09470965273127332, + "grad_norm": 0.36586999893188477, + "learning_rate": 0.001, + "loss": 1.497, + "step": 2970 + }, + { + "epoch": 0.09502854045090724, + "grad_norm": 0.3418881893157959, + "learning_rate": 0.001, + "loss": 1.4877, + "step": 2980 + }, + { + "epoch": 0.09534742817054115, + "grad_norm": 0.3603076934814453, + "learning_rate": 0.001, + "loss": 1.4896, + "step": 2990 + }, + { + "epoch": 0.09566631589017507, + "grad_norm": 0.3709716498851776, + "learning_rate": 0.001, + "loss": 1.4994, + "step": 3000 + }, + { + "epoch": 0.09598520360980899, + "grad_norm": 0.34613725543022156, + "learning_rate": 0.001, + "loss": 1.4909, + "step": 3010 + }, + { + "epoch": 0.0963040913294429, + "grad_norm": 0.34238988161087036, + "learning_rate": 0.001, + "loss": 1.477, + "step": 3020 + }, + { + "epoch": 0.09662297904907682, + "grad_norm": 0.357252299785614, + "learning_rate": 0.001, + "loss": 1.4807, + "step": 3030 + }, + { + "epoch": 0.09694186676871074, + "grad_norm": 0.34529972076416016, + "learning_rate": 0.001, + "loss": 1.4892, + "step": 3040 + }, + { + "epoch": 0.09726075448834465, + "grad_norm": 0.3669123947620392, + "learning_rate": 0.001, + "loss": 1.5015, + "step": 3050 + }, + { + "epoch": 0.09757964220797857, + "grad_norm": 0.35653454065322876, + "learning_rate": 0.001, + "loss": 1.4943, + "step": 3060 + }, + { + "epoch": 0.09789852992761248, + "grad_norm": 0.3683283030986786, + "learning_rate": 0.001, + "loss": 1.4762, + "step": 3070 + }, + { + "epoch": 0.09821741764724641, + "grad_norm": 0.3327406346797943, + "learning_rate": 0.001, + "loss": 1.4843, + "step": 3080 + }, + { + "epoch": 0.09853630536688032, + "grad_norm": 0.3510376513004303, + "learning_rate": 0.001, + "loss": 1.4725, + "step": 3090 + }, + { + "epoch": 0.09885519308651423, + "grad_norm": 0.3618435263633728, + "learning_rate": 0.001, + "loss": 1.4636, + "step": 3100 + }, + { + "epoch": 0.09917408080614816, + "grad_norm": 0.3902658224105835, + "learning_rate": 0.001, + "loss": 1.4647, + "step": 3110 + }, + { + "epoch": 0.09949296852578207, + "grad_norm": 0.336622416973114, + "learning_rate": 0.001, + "loss": 1.4864, + "step": 3120 + }, + { + "epoch": 0.09981185624541598, + "grad_norm": 0.3628654181957245, + "learning_rate": 0.001, + "loss": 1.4914, + "step": 3130 + }, + { + "epoch": 0.10013074396504991, + "grad_norm": 0.34092777967453003, + "learning_rate": 0.001, + "loss": 1.478, + "step": 3140 + }, + { + "epoch": 0.10044963168468382, + "grad_norm": 0.3362221419811249, + "learning_rate": 0.001, + "loss": 1.4862, + "step": 3150 + }, + { + "epoch": 0.10076851940431773, + "grad_norm": 0.37139853835105896, + "learning_rate": 0.001, + "loss": 1.4655, + "step": 3160 + }, + { + "epoch": 0.10108740712395166, + "grad_norm": 0.3196049630641937, + "learning_rate": 0.001, + "loss": 1.468, + "step": 3170 + }, + { + "epoch": 0.10140629484358557, + "grad_norm": 0.34154269099235535, + "learning_rate": 0.001, + "loss": 1.4603, + "step": 3180 + }, + { + "epoch": 0.1017251825632195, + "grad_norm": 0.33629539608955383, + "learning_rate": 0.001, + "loss": 1.4422, + "step": 3190 + }, + { + "epoch": 0.10204407028285341, + "grad_norm": 0.32535743713378906, + "learning_rate": 0.001, + "loss": 1.4495, + "step": 3200 + }, + { + "epoch": 0.10236295800248732, + "grad_norm": 0.3403289020061493, + "learning_rate": 0.001, + "loss": 1.4856, + "step": 3210 + }, + { + "epoch": 0.10268184572212125, + "grad_norm": 0.3382999300956726, + "learning_rate": 0.001, + "loss": 1.4719, + "step": 3220 + }, + { + "epoch": 0.10300073344175516, + "grad_norm": 0.3283659815788269, + "learning_rate": 0.001, + "loss": 1.4374, + "step": 3230 + }, + { + "epoch": 0.10331962116138907, + "grad_norm": 0.3706674873828888, + "learning_rate": 0.001, + "loss": 1.4259, + "step": 3240 + }, + { + "epoch": 0.103638508881023, + "grad_norm": 0.3296601474285126, + "learning_rate": 0.001, + "loss": 1.4563, + "step": 3250 + }, + { + "epoch": 0.1039573966006569, + "grad_norm": 0.3546639680862427, + "learning_rate": 0.001, + "loss": 1.4707, + "step": 3260 + }, + { + "epoch": 0.10427628432029083, + "grad_norm": 0.33120912313461304, + "learning_rate": 0.001, + "loss": 1.4324, + "step": 3270 + }, + { + "epoch": 0.10459517203992474, + "grad_norm": 0.36581435799598694, + "learning_rate": 0.001, + "loss": 1.4687, + "step": 3280 + }, + { + "epoch": 0.10491405975955866, + "grad_norm": 0.34121185541152954, + "learning_rate": 0.001, + "loss": 1.4292, + "step": 3290 + }, + { + "epoch": 0.10523294747919258, + "grad_norm": 0.3378187119960785, + "learning_rate": 0.001, + "loss": 1.4586, + "step": 3300 + }, + { + "epoch": 0.1055518351988265, + "grad_norm": 0.34159472584724426, + "learning_rate": 0.001, + "loss": 1.4506, + "step": 3310 + }, + { + "epoch": 0.1058707229184604, + "grad_norm": 0.3327382504940033, + "learning_rate": 0.001, + "loss": 1.4283, + "step": 3320 + }, + { + "epoch": 0.10618961063809433, + "grad_norm": 0.3393729329109192, + "learning_rate": 0.001, + "loss": 1.4972, + "step": 3330 + }, + { + "epoch": 0.10650849835772824, + "grad_norm": 0.3339972198009491, + "learning_rate": 0.001, + "loss": 1.4593, + "step": 3340 + }, + { + "epoch": 0.10682738607736215, + "grad_norm": 0.34833648800849915, + "learning_rate": 0.001, + "loss": 1.4697, + "step": 3350 + }, + { + "epoch": 0.10714627379699608, + "grad_norm": 0.3432925045490265, + "learning_rate": 0.001, + "loss": 1.451, + "step": 3360 + }, + { + "epoch": 0.10746516151662999, + "grad_norm": 0.3279559910297394, + "learning_rate": 0.001, + "loss": 1.4578, + "step": 3370 + }, + { + "epoch": 0.10778404923626392, + "grad_norm": 0.3407198488712311, + "learning_rate": 0.001, + "loss": 1.4476, + "step": 3380 + }, + { + "epoch": 0.10810293695589783, + "grad_norm": 0.35291847586631775, + "learning_rate": 0.001, + "loss": 1.4141, + "step": 3390 + }, + { + "epoch": 0.10842182467553174, + "grad_norm": 0.3389386832714081, + "learning_rate": 0.001, + "loss": 1.4367, + "step": 3400 + }, + { + "epoch": 0.10874071239516567, + "grad_norm": 0.3285823464393616, + "learning_rate": 0.001, + "loss": 1.4468, + "step": 3410 + }, + { + "epoch": 0.10905960011479958, + "grad_norm": 0.3264278471469879, + "learning_rate": 0.001, + "loss": 1.4572, + "step": 3420 + }, + { + "epoch": 0.10937848783443349, + "grad_norm": 0.3490174412727356, + "learning_rate": 0.001, + "loss": 1.4502, + "step": 3430 + }, + { + "epoch": 0.10969737555406742, + "grad_norm": 0.31935915350914, + "learning_rate": 0.001, + "loss": 1.4519, + "step": 3440 + }, + { + "epoch": 0.11001626327370133, + "grad_norm": 0.325456440448761, + "learning_rate": 0.001, + "loss": 1.4553, + "step": 3450 + }, + { + "epoch": 0.11033515099333524, + "grad_norm": 0.33232828974723816, + "learning_rate": 0.001, + "loss": 1.4221, + "step": 3460 + }, + { + "epoch": 0.11065403871296917, + "grad_norm": 0.3299655318260193, + "learning_rate": 0.001, + "loss": 1.4496, + "step": 3470 + }, + { + "epoch": 0.11097292643260308, + "grad_norm": 0.33184587955474854, + "learning_rate": 0.001, + "loss": 1.4256, + "step": 3480 + }, + { + "epoch": 0.111291814152237, + "grad_norm": 0.33328554034233093, + "learning_rate": 0.001, + "loss": 1.4229, + "step": 3490 + }, + { + "epoch": 0.11161070187187092, + "grad_norm": 0.33121776580810547, + "learning_rate": 0.001, + "loss": 1.4369, + "step": 3500 + }, + { + "epoch": 0.11192958959150483, + "grad_norm": 0.33411258459091187, + "learning_rate": 0.001, + "loss": 1.4346, + "step": 3510 + }, + { + "epoch": 0.11224847731113875, + "grad_norm": 0.3087165355682373, + "learning_rate": 0.001, + "loss": 1.4262, + "step": 3520 + }, + { + "epoch": 0.11256736503077266, + "grad_norm": 0.3170263469219208, + "learning_rate": 0.001, + "loss": 1.4212, + "step": 3530 + }, + { + "epoch": 0.11288625275040658, + "grad_norm": 0.33870166540145874, + "learning_rate": 0.001, + "loss": 1.4373, + "step": 3540 + }, + { + "epoch": 0.1132051404700405, + "grad_norm": 0.34458819031715393, + "learning_rate": 0.001, + "loss": 1.4329, + "step": 3550 + }, + { + "epoch": 0.11352402818967441, + "grad_norm": 0.3231976628303528, + "learning_rate": 0.001, + "loss": 1.4343, + "step": 3560 + }, + { + "epoch": 0.11384291590930833, + "grad_norm": 0.33272790908813477, + "learning_rate": 0.001, + "loss": 1.4098, + "step": 3570 + }, + { + "epoch": 0.11416180362894225, + "grad_norm": 0.31947606801986694, + "learning_rate": 0.001, + "loss": 1.4049, + "step": 3580 + }, + { + "epoch": 0.11448069134857616, + "grad_norm": 0.34905490279197693, + "learning_rate": 0.001, + "loss": 1.444, + "step": 3590 + }, + { + "epoch": 0.11479957906821009, + "grad_norm": 0.3256986737251282, + "learning_rate": 0.001, + "loss": 1.4312, + "step": 3600 + }, + { + "epoch": 0.115118466787844, + "grad_norm": 0.3320602476596832, + "learning_rate": 0.001, + "loss": 1.3994, + "step": 3610 + }, + { + "epoch": 0.11543735450747791, + "grad_norm": 0.32200315594673157, + "learning_rate": 0.001, + "loss": 1.3982, + "step": 3620 + }, + { + "epoch": 0.11575624222711184, + "grad_norm": 0.3169730007648468, + "learning_rate": 0.001, + "loss": 1.4297, + "step": 3630 + }, + { + "epoch": 0.11607512994674575, + "grad_norm": 0.33035051822662354, + "learning_rate": 0.001, + "loss": 1.4403, + "step": 3640 + }, + { + "epoch": 0.11639401766637966, + "grad_norm": 0.3387161195278168, + "learning_rate": 0.001, + "loss": 1.4163, + "step": 3650 + }, + { + "epoch": 0.11671290538601359, + "grad_norm": 0.31775781512260437, + "learning_rate": 0.001, + "loss": 1.4008, + "step": 3660 + }, + { + "epoch": 0.1170317931056475, + "grad_norm": 0.3159157931804657, + "learning_rate": 0.001, + "loss": 1.4304, + "step": 3670 + }, + { + "epoch": 0.11735068082528141, + "grad_norm": 0.3200733959674835, + "learning_rate": 0.001, + "loss": 1.4446, + "step": 3680 + }, + { + "epoch": 0.11766956854491534, + "grad_norm": 0.3100077211856842, + "learning_rate": 0.001, + "loss": 1.4135, + "step": 3690 + }, + { + "epoch": 0.11798845626454925, + "grad_norm": 0.3178308606147766, + "learning_rate": 0.001, + "loss": 1.4192, + "step": 3700 + }, + { + "epoch": 0.11830734398418317, + "grad_norm": 0.3008234202861786, + "learning_rate": 0.001, + "loss": 1.4215, + "step": 3710 + }, + { + "epoch": 0.11862623170381709, + "grad_norm": 0.31308454275131226, + "learning_rate": 0.001, + "loss": 1.4019, + "step": 3720 + }, + { + "epoch": 0.118945119423451, + "grad_norm": 0.32719430327415466, + "learning_rate": 0.001, + "loss": 1.4209, + "step": 3730 + }, + { + "epoch": 0.11926400714308492, + "grad_norm": 0.3134723901748657, + "learning_rate": 0.001, + "loss": 1.4274, + "step": 3740 + }, + { + "epoch": 0.11958289486271884, + "grad_norm": 0.31542280316352844, + "learning_rate": 0.001, + "loss": 1.401, + "step": 3750 + }, + { + "epoch": 0.11990178258235275, + "grad_norm": 0.3206769526004791, + "learning_rate": 0.001, + "loss": 1.4101, + "step": 3760 + }, + { + "epoch": 0.12022067030198667, + "grad_norm": 0.31581780314445496, + "learning_rate": 0.001, + "loss": 1.4272, + "step": 3770 + }, + { + "epoch": 0.12053955802162059, + "grad_norm": 0.31411078572273254, + "learning_rate": 0.001, + "loss": 1.44, + "step": 3780 + }, + { + "epoch": 0.12085844574125451, + "grad_norm": 0.30180829763412476, + "learning_rate": 0.001, + "loss": 1.3919, + "step": 3790 + }, + { + "epoch": 0.12117733346088842, + "grad_norm": 0.30781787633895874, + "learning_rate": 0.001, + "loss": 1.3889, + "step": 3800 + }, + { + "epoch": 0.12149622118052233, + "grad_norm": 0.32143038511276245, + "learning_rate": 0.001, + "loss": 1.3944, + "step": 3810 + }, + { + "epoch": 0.12181510890015626, + "grad_norm": 0.34171000123023987, + "learning_rate": 0.001, + "loss": 1.4061, + "step": 3820 + }, + { + "epoch": 0.12213399661979017, + "grad_norm": 0.31811872124671936, + "learning_rate": 0.001, + "loss": 1.3958, + "step": 3830 + }, + { + "epoch": 0.12245288433942408, + "grad_norm": 0.33086341619491577, + "learning_rate": 0.001, + "loss": 1.4189, + "step": 3840 + }, + { + "epoch": 0.12277177205905801, + "grad_norm": 0.30563294887542725, + "learning_rate": 0.001, + "loss": 1.4161, + "step": 3850 + }, + { + "epoch": 0.12309065977869192, + "grad_norm": 0.30680814385414124, + "learning_rate": 0.001, + "loss": 1.4156, + "step": 3860 + }, + { + "epoch": 0.12340954749832583, + "grad_norm": 0.32353097200393677, + "learning_rate": 0.001, + "loss": 1.4239, + "step": 3870 + }, + { + "epoch": 0.12372843521795976, + "grad_norm": 0.3182706832885742, + "learning_rate": 0.001, + "loss": 1.3936, + "step": 3880 + }, + { + "epoch": 0.12404732293759367, + "grad_norm": 0.309923380613327, + "learning_rate": 0.001, + "loss": 1.4099, + "step": 3890 + }, + { + "epoch": 0.1243662106572276, + "grad_norm": 0.31839415431022644, + "learning_rate": 0.001, + "loss": 1.4066, + "step": 3900 + }, + { + "epoch": 0.12468509837686151, + "grad_norm": 0.33320650458335876, + "learning_rate": 0.001, + "loss": 1.4058, + "step": 3910 + }, + { + "epoch": 0.12500398609649543, + "grad_norm": 0.31085923314094543, + "learning_rate": 0.001, + "loss": 1.4109, + "step": 3920 + }, + { + "epoch": 0.12532287381612933, + "grad_norm": 0.308795303106308, + "learning_rate": 0.001, + "loss": 1.3775, + "step": 3930 + }, + { + "epoch": 0.12564176153576326, + "grad_norm": 0.3182823657989502, + "learning_rate": 0.001, + "loss": 1.3809, + "step": 3940 + }, + { + "epoch": 0.12596064925539718, + "grad_norm": 0.2957512140274048, + "learning_rate": 0.001, + "loss": 1.3972, + "step": 3950 + }, + { + "epoch": 0.12627953697503108, + "grad_norm": 0.32786622643470764, + "learning_rate": 0.001, + "loss": 1.3852, + "step": 3960 + }, + { + "epoch": 0.126598424694665, + "grad_norm": 0.3133629560470581, + "learning_rate": 0.001, + "loss": 1.4012, + "step": 3970 + }, + { + "epoch": 0.12691731241429893, + "grad_norm": 0.3149418830871582, + "learning_rate": 0.001, + "loss": 1.384, + "step": 3980 + }, + { + "epoch": 0.12723620013393283, + "grad_norm": 0.31080493330955505, + "learning_rate": 0.001, + "loss": 1.3909, + "step": 3990 + }, + { + "epoch": 0.12755508785356676, + "grad_norm": 0.3017434775829315, + "learning_rate": 0.001, + "loss": 1.389, + "step": 4000 + }, + { + "epoch": 0.12787397557320068, + "grad_norm": 0.3071618378162384, + "learning_rate": 0.001, + "loss": 1.4003, + "step": 4010 + }, + { + "epoch": 0.12819286329283458, + "grad_norm": 0.3157201409339905, + "learning_rate": 0.001, + "loss": 1.3902, + "step": 4020 + }, + { + "epoch": 0.1285117510124685, + "grad_norm": 0.32032763957977295, + "learning_rate": 0.001, + "loss": 1.3973, + "step": 4030 + }, + { + "epoch": 0.12883063873210243, + "grad_norm": 0.3034627437591553, + "learning_rate": 0.001, + "loss": 1.3938, + "step": 4040 + }, + { + "epoch": 0.12914952645173636, + "grad_norm": 0.3229222893714905, + "learning_rate": 0.001, + "loss": 1.3955, + "step": 4050 + }, + { + "epoch": 0.12946841417137026, + "grad_norm": 0.3058593273162842, + "learning_rate": 0.001, + "loss": 1.4061, + "step": 4060 + }, + { + "epoch": 0.12978730189100418, + "grad_norm": 0.30278199911117554, + "learning_rate": 0.001, + "loss": 1.3985, + "step": 4070 + }, + { + "epoch": 0.1301061896106381, + "grad_norm": 0.3108304440975189, + "learning_rate": 0.001, + "loss": 1.379, + "step": 4080 + }, + { + "epoch": 0.130425077330272, + "grad_norm": 0.3051409125328064, + "learning_rate": 0.001, + "loss": 1.3886, + "step": 4090 + }, + { + "epoch": 0.13074396504990593, + "grad_norm": 0.32630011439323425, + "learning_rate": 0.001, + "loss": 1.3905, + "step": 4100 + }, + { + "epoch": 0.13106285276953986, + "grad_norm": 0.31419435143470764, + "learning_rate": 0.001, + "loss": 1.3871, + "step": 4110 + }, + { + "epoch": 0.13138174048917375, + "grad_norm": 0.32161471247673035, + "learning_rate": 0.001, + "loss": 1.3701, + "step": 4120 + }, + { + "epoch": 0.13170062820880768, + "grad_norm": 0.29289132356643677, + "learning_rate": 0.001, + "loss": 1.3881, + "step": 4130 + }, + { + "epoch": 0.1320195159284416, + "grad_norm": 0.3462681174278259, + "learning_rate": 0.001, + "loss": 1.3798, + "step": 4140 + }, + { + "epoch": 0.1323384036480755, + "grad_norm": 0.3089030086994171, + "learning_rate": 0.001, + "loss": 1.3721, + "step": 4150 + }, + { + "epoch": 0.13265729136770943, + "grad_norm": 0.2968080937862396, + "learning_rate": 0.001, + "loss": 1.3774, + "step": 4160 + }, + { + "epoch": 0.13297617908734335, + "grad_norm": 0.3071855306625366, + "learning_rate": 0.001, + "loss": 1.3786, + "step": 4170 + }, + { + "epoch": 0.13329506680697725, + "grad_norm": 0.3024219870567322, + "learning_rate": 0.001, + "loss": 1.3618, + "step": 4180 + }, + { + "epoch": 0.13361395452661118, + "grad_norm": 0.3031976521015167, + "learning_rate": 0.001, + "loss": 1.3712, + "step": 4190 + }, + { + "epoch": 0.1339328422462451, + "grad_norm": 0.30467966198921204, + "learning_rate": 0.001, + "loss": 1.3776, + "step": 4200 + }, + { + "epoch": 0.134251729965879, + "grad_norm": 0.3077302575111389, + "learning_rate": 0.001, + "loss": 1.3746, + "step": 4210 + }, + { + "epoch": 0.13457061768551293, + "grad_norm": 0.28658169507980347, + "learning_rate": 0.001, + "loss": 1.3679, + "step": 4220 + }, + { + "epoch": 0.13488950540514685, + "grad_norm": 0.2950628399848938, + "learning_rate": 0.001, + "loss": 1.3658, + "step": 4230 + }, + { + "epoch": 0.13520839312478075, + "grad_norm": 0.3013644516468048, + "learning_rate": 0.001, + "loss": 1.4024, + "step": 4240 + }, + { + "epoch": 0.13552728084441468, + "grad_norm": 0.30451542139053345, + "learning_rate": 0.001, + "loss": 1.3746, + "step": 4250 + }, + { + "epoch": 0.1358461685640486, + "grad_norm": 0.30308064818382263, + "learning_rate": 0.001, + "loss": 1.3883, + "step": 4260 + }, + { + "epoch": 0.13616505628368253, + "grad_norm": 0.2882744371891022, + "learning_rate": 0.001, + "loss": 1.3283, + "step": 4270 + }, + { + "epoch": 0.13648394400331643, + "grad_norm": 0.3060248792171478, + "learning_rate": 0.001, + "loss": 1.3688, + "step": 4280 + }, + { + "epoch": 0.13680283172295035, + "grad_norm": 0.2914104461669922, + "learning_rate": 0.001, + "loss": 1.3523, + "step": 4290 + }, + { + "epoch": 0.13712171944258428, + "grad_norm": 0.2941143810749054, + "learning_rate": 0.001, + "loss": 1.3662, + "step": 4300 + }, + { + "epoch": 0.13744060716221818, + "grad_norm": 0.31439968943595886, + "learning_rate": 0.001, + "loss": 1.3765, + "step": 4310 + }, + { + "epoch": 0.1377594948818521, + "grad_norm": 0.2949891686439514, + "learning_rate": 0.001, + "loss": 1.357, + "step": 4320 + }, + { + "epoch": 0.13807838260148603, + "grad_norm": 0.3023112416267395, + "learning_rate": 0.001, + "loss": 1.3667, + "step": 4330 + }, + { + "epoch": 0.13839727032111993, + "grad_norm": 0.2970675528049469, + "learning_rate": 0.001, + "loss": 1.363, + "step": 4340 + }, + { + "epoch": 0.13871615804075385, + "grad_norm": 0.28739404678344727, + "learning_rate": 0.001, + "loss": 1.3696, + "step": 4350 + }, + { + "epoch": 0.13903504576038778, + "grad_norm": 0.30378416180610657, + "learning_rate": 0.001, + "loss": 1.366, + "step": 4360 + }, + { + "epoch": 0.13935393348002167, + "grad_norm": 0.2950860559940338, + "learning_rate": 0.001, + "loss": 1.3759, + "step": 4370 + }, + { + "epoch": 0.1396728211996556, + "grad_norm": 0.2949100732803345, + "learning_rate": 0.001, + "loss": 1.357, + "step": 4380 + }, + { + "epoch": 0.13999170891928953, + "grad_norm": 0.3102058172225952, + "learning_rate": 0.001, + "loss": 1.3654, + "step": 4390 + }, + { + "epoch": 0.14031059663892342, + "grad_norm": 0.3096378445625305, + "learning_rate": 0.001, + "loss": 1.3792, + "step": 4400 + }, + { + "epoch": 0.14062948435855735, + "grad_norm": 0.29340341687202454, + "learning_rate": 0.001, + "loss": 1.3812, + "step": 4410 + }, + { + "epoch": 0.14094837207819128, + "grad_norm": 0.28675857186317444, + "learning_rate": 0.001, + "loss": 1.3428, + "step": 4420 + }, + { + "epoch": 0.14126725979782517, + "grad_norm": 0.2995110750198364, + "learning_rate": 0.001, + "loss": 1.3632, + "step": 4430 + }, + { + "epoch": 0.1415861475174591, + "grad_norm": 0.2885999381542206, + "learning_rate": 0.001, + "loss": 1.347, + "step": 4440 + }, + { + "epoch": 0.14190503523709302, + "grad_norm": 0.29834800958633423, + "learning_rate": 0.001, + "loss": 1.3329, + "step": 4450 + }, + { + "epoch": 0.14222392295672695, + "grad_norm": 0.3012324273586273, + "learning_rate": 0.001, + "loss": 1.3406, + "step": 4460 + }, + { + "epoch": 0.14254281067636085, + "grad_norm": 0.3000870943069458, + "learning_rate": 0.001, + "loss": 1.3511, + "step": 4470 + }, + { + "epoch": 0.14286169839599477, + "grad_norm": 0.28593116998672485, + "learning_rate": 0.001, + "loss": 1.3563, + "step": 4480 + }, + { + "epoch": 0.1431805861156287, + "grad_norm": 0.2958747148513794, + "learning_rate": 0.001, + "loss": 1.3523, + "step": 4490 + }, + { + "epoch": 0.1434994738352626, + "grad_norm": 0.2945686876773834, + "learning_rate": 0.001, + "loss": 1.383, + "step": 4500 + }, + { + "epoch": 0.14381836155489652, + "grad_norm": 0.2951934039592743, + "learning_rate": 0.001, + "loss": 1.3828, + "step": 4510 + }, + { + "epoch": 0.14413724927453045, + "grad_norm": 0.2921164035797119, + "learning_rate": 0.001, + "loss": 1.353, + "step": 4520 + }, + { + "epoch": 0.14445613699416435, + "grad_norm": 0.27818742394447327, + "learning_rate": 0.001, + "loss": 1.348, + "step": 4530 + }, + { + "epoch": 0.14477502471379827, + "grad_norm": 0.2907786965370178, + "learning_rate": 0.001, + "loss": 1.3581, + "step": 4540 + }, + { + "epoch": 0.1450939124334322, + "grad_norm": 0.283187597990036, + "learning_rate": 0.001, + "loss": 1.3547, + "step": 4550 + }, + { + "epoch": 0.1454128001530661, + "grad_norm": 0.2873290479183197, + "learning_rate": 0.001, + "loss": 1.3735, + "step": 4560 + }, + { + "epoch": 0.14573168787270002, + "grad_norm": 0.30278632044792175, + "learning_rate": 0.001, + "loss": 1.3545, + "step": 4570 + }, + { + "epoch": 0.14605057559233395, + "grad_norm": 0.28067076206207275, + "learning_rate": 0.001, + "loss": 1.3435, + "step": 4580 + }, + { + "epoch": 0.14636946331196785, + "grad_norm": 0.29316315054893494, + "learning_rate": 0.001, + "loss": 1.3462, + "step": 4590 + }, + { + "epoch": 0.14668835103160177, + "grad_norm": 0.31010857224464417, + "learning_rate": 0.001, + "loss": 1.3656, + "step": 4600 + }, + { + "epoch": 0.1470072387512357, + "grad_norm": 0.2965826690196991, + "learning_rate": 0.001, + "loss": 1.3534, + "step": 4610 + }, + { + "epoch": 0.1473261264708696, + "grad_norm": 0.27814781665802, + "learning_rate": 0.001, + "loss": 1.3557, + "step": 4620 + }, + { + "epoch": 0.14764501419050352, + "grad_norm": 0.28145530819892883, + "learning_rate": 0.001, + "loss": 1.3534, + "step": 4630 + }, + { + "epoch": 0.14796390191013745, + "grad_norm": 0.2858860492706299, + "learning_rate": 0.001, + "loss": 1.3653, + "step": 4640 + }, + { + "epoch": 0.14828278962977134, + "grad_norm": 0.29033321142196655, + "learning_rate": 0.001, + "loss": 1.3598, + "step": 4650 + }, + { + "epoch": 0.14860167734940527, + "grad_norm": 0.33033251762390137, + "learning_rate": 0.001, + "loss": 1.3537, + "step": 4660 + }, + { + "epoch": 0.1489205650690392, + "grad_norm": 0.2896032929420471, + "learning_rate": 0.001, + "loss": 1.3405, + "step": 4670 + }, + { + "epoch": 0.14923945278867312, + "grad_norm": 0.29234012961387634, + "learning_rate": 0.001, + "loss": 1.3518, + "step": 4680 + }, + { + "epoch": 0.14955834050830702, + "grad_norm": 0.2925928831100464, + "learning_rate": 0.001, + "loss": 1.3544, + "step": 4690 + }, + { + "epoch": 0.14987722822794095, + "grad_norm": 0.3034275472164154, + "learning_rate": 0.001, + "loss": 1.3266, + "step": 4700 + }, + { + "epoch": 0.15019611594757487, + "grad_norm": 0.29347124695777893, + "learning_rate": 0.001, + "loss": 1.3444, + "step": 4710 + }, + { + "epoch": 0.15051500366720877, + "grad_norm": 0.30910998582839966, + "learning_rate": 0.001, + "loss": 1.333, + "step": 4720 + }, + { + "epoch": 0.1508338913868427, + "grad_norm": 0.292319655418396, + "learning_rate": 0.001, + "loss": 1.3427, + "step": 4730 + }, + { + "epoch": 0.15115277910647662, + "grad_norm": 0.29981669783592224, + "learning_rate": 0.001, + "loss": 1.3413, + "step": 4740 + }, + { + "epoch": 0.15147166682611052, + "grad_norm": 0.29529082775115967, + "learning_rate": 0.001, + "loss": 1.3301, + "step": 4750 + }, + { + "epoch": 0.15179055454574444, + "grad_norm": 0.26876968145370483, + "learning_rate": 0.001, + "loss": 1.3274, + "step": 4760 + }, + { + "epoch": 0.15210944226537837, + "grad_norm": 0.2886004149913788, + "learning_rate": 0.001, + "loss": 1.3412, + "step": 4770 + }, + { + "epoch": 0.15242832998501227, + "grad_norm": 0.27672141790390015, + "learning_rate": 0.001, + "loss": 1.3268, + "step": 4780 + }, + { + "epoch": 0.1527472177046462, + "grad_norm": 0.3041819930076599, + "learning_rate": 0.001, + "loss": 1.3546, + "step": 4790 + }, + { + "epoch": 0.15306610542428012, + "grad_norm": 0.2882234454154968, + "learning_rate": 0.001, + "loss": 1.3284, + "step": 4800 + }, + { + "epoch": 0.15338499314391402, + "grad_norm": 0.2742898464202881, + "learning_rate": 0.001, + "loss": 1.3522, + "step": 4810 + }, + { + "epoch": 0.15370388086354794, + "grad_norm": 0.2767809331417084, + "learning_rate": 0.001, + "loss": 1.3441, + "step": 4820 + }, + { + "epoch": 0.15402276858318187, + "grad_norm": 0.2767125070095062, + "learning_rate": 0.001, + "loss": 1.3277, + "step": 4830 + }, + { + "epoch": 0.15434165630281577, + "grad_norm": 0.28059667348861694, + "learning_rate": 0.001, + "loss": 1.3457, + "step": 4840 + }, + { + "epoch": 0.1546605440224497, + "grad_norm": 0.28245338797569275, + "learning_rate": 0.001, + "loss": 1.3371, + "step": 4850 + }, + { + "epoch": 0.15497943174208362, + "grad_norm": 0.27916160225868225, + "learning_rate": 0.001, + "loss": 1.3402, + "step": 4860 + }, + { + "epoch": 0.15529831946171752, + "grad_norm": 0.2856876850128174, + "learning_rate": 0.001, + "loss": 1.3189, + "step": 4870 + }, + { + "epoch": 0.15561720718135144, + "grad_norm": 0.2746897339820862, + "learning_rate": 0.001, + "loss": 1.3146, + "step": 4880 + }, + { + "epoch": 0.15593609490098537, + "grad_norm": 0.2816491723060608, + "learning_rate": 0.001, + "loss": 1.3357, + "step": 4890 + }, + { + "epoch": 0.1562549826206193, + "grad_norm": 0.2790864408016205, + "learning_rate": 0.001, + "loss": 1.3435, + "step": 4900 + }, + { + "epoch": 0.1565738703402532, + "grad_norm": 0.2920440137386322, + "learning_rate": 0.001, + "loss": 1.3517, + "step": 4910 + }, + { + "epoch": 0.15689275805988712, + "grad_norm": 0.28280529379844666, + "learning_rate": 0.001, + "loss": 1.3371, + "step": 4920 + }, + { + "epoch": 0.15721164577952104, + "grad_norm": 0.27726370096206665, + "learning_rate": 0.001, + "loss": 1.3402, + "step": 4930 + }, + { + "epoch": 0.15753053349915494, + "grad_norm": 0.27465248107910156, + "learning_rate": 0.001, + "loss": 1.315, + "step": 4940 + }, + { + "epoch": 0.15784942121878887, + "grad_norm": 0.281573086977005, + "learning_rate": 0.001, + "loss": 1.313, + "step": 4950 + }, + { + "epoch": 0.1581683089384228, + "grad_norm": 0.2761354446411133, + "learning_rate": 0.001, + "loss": 1.3402, + "step": 4960 + }, + { + "epoch": 0.1584871966580567, + "grad_norm": 0.28072458505630493, + "learning_rate": 0.001, + "loss": 1.3357, + "step": 4970 + }, + { + "epoch": 0.15880608437769062, + "grad_norm": 0.2800447344779968, + "learning_rate": 0.001, + "loss": 1.3229, + "step": 4980 + }, + { + "epoch": 0.15912497209732454, + "grad_norm": 0.28519025444984436, + "learning_rate": 0.001, + "loss": 1.3226, + "step": 4990 + }, + { + "epoch": 0.15944385981695844, + "grad_norm": 0.27568909525871277, + "learning_rate": 0.001, + "loss": 1.3009, + "step": 5000 + }, + { + "epoch": 0.15976274753659236, + "grad_norm": 0.28567227721214294, + "learning_rate": 0.001, + "loss": 1.3363, + "step": 5010 + }, + { + "epoch": 0.1600816352562263, + "grad_norm": 0.2807476222515106, + "learning_rate": 0.001, + "loss": 1.3158, + "step": 5020 + }, + { + "epoch": 0.1604005229758602, + "grad_norm": 0.2860855758190155, + "learning_rate": 0.001, + "loss": 1.3546, + "step": 5030 + }, + { + "epoch": 0.1607194106954941, + "grad_norm": 0.27725300192832947, + "learning_rate": 0.001, + "loss": 1.3362, + "step": 5040 + }, + { + "epoch": 0.16103829841512804, + "grad_norm": 0.27752354741096497, + "learning_rate": 0.001, + "loss": 1.3208, + "step": 5050 + }, + { + "epoch": 0.16135718613476194, + "grad_norm": 0.27968016266822815, + "learning_rate": 0.001, + "loss": 1.3253, + "step": 5060 + }, + { + "epoch": 0.16167607385439586, + "grad_norm": 0.2815000116825104, + "learning_rate": 0.001, + "loss": 1.316, + "step": 5070 + }, + { + "epoch": 0.1619949615740298, + "grad_norm": 0.28594574332237244, + "learning_rate": 0.001, + "loss": 1.3247, + "step": 5080 + }, + { + "epoch": 0.16231384929366371, + "grad_norm": 0.2727883756160736, + "learning_rate": 0.001, + "loss": 1.3486, + "step": 5090 + }, + { + "epoch": 0.1626327370132976, + "grad_norm": 0.27950167655944824, + "learning_rate": 0.001, + "loss": 1.3368, + "step": 5100 + }, + { + "epoch": 0.16295162473293154, + "grad_norm": 0.28539350628852844, + "learning_rate": 0.001, + "loss": 1.328, + "step": 5110 + }, + { + "epoch": 0.16327051245256546, + "grad_norm": 0.280464231967926, + "learning_rate": 0.001, + "loss": 1.3091, + "step": 5120 + }, + { + "epoch": 0.16358940017219936, + "grad_norm": 0.2751573622226715, + "learning_rate": 0.001, + "loss": 1.3242, + "step": 5130 + }, + { + "epoch": 0.1639082878918333, + "grad_norm": 0.27065640687942505, + "learning_rate": 0.001, + "loss": 1.3189, + "step": 5140 + }, + { + "epoch": 0.1642271756114672, + "grad_norm": 0.27752992510795593, + "learning_rate": 0.001, + "loss": 1.325, + "step": 5150 + }, + { + "epoch": 0.1645460633311011, + "grad_norm": 0.2825421988964081, + "learning_rate": 0.001, + "loss": 1.3221, + "step": 5160 + }, + { + "epoch": 0.16486495105073504, + "grad_norm": 0.2743771970272064, + "learning_rate": 0.001, + "loss": 1.3272, + "step": 5170 + }, + { + "epoch": 0.16518383877036896, + "grad_norm": 0.2885359823703766, + "learning_rate": 0.001, + "loss": 1.3362, + "step": 5180 + }, + { + "epoch": 0.16550272649000286, + "grad_norm": 0.289696604013443, + "learning_rate": 0.001, + "loss": 1.345, + "step": 5190 + }, + { + "epoch": 0.1658216142096368, + "grad_norm": 0.26721301674842834, + "learning_rate": 0.001, + "loss": 1.3037, + "step": 5200 + }, + { + "epoch": 0.1661405019292707, + "grad_norm": 0.27076032757759094, + "learning_rate": 0.001, + "loss": 1.3044, + "step": 5210 + }, + { + "epoch": 0.1664593896489046, + "grad_norm": 0.27800723910331726, + "learning_rate": 0.001, + "loss": 1.308, + "step": 5220 + }, + { + "epoch": 0.16677827736853854, + "grad_norm": 0.28856852650642395, + "learning_rate": 0.001, + "loss": 1.3066, + "step": 5230 + }, + { + "epoch": 0.16709716508817246, + "grad_norm": 0.27235889434814453, + "learning_rate": 0.001, + "loss": 1.3236, + "step": 5240 + }, + { + "epoch": 0.16741605280780636, + "grad_norm": 0.2833673655986786, + "learning_rate": 0.001, + "loss": 1.3194, + "step": 5250 + }, + { + "epoch": 0.16773494052744028, + "grad_norm": 0.28075218200683594, + "learning_rate": 0.001, + "loss": 1.313, + "step": 5260 + }, + { + "epoch": 0.1680538282470742, + "grad_norm": 0.28121885657310486, + "learning_rate": 0.001, + "loss": 1.3013, + "step": 5270 + }, + { + "epoch": 0.1683727159667081, + "grad_norm": 0.28455910086631775, + "learning_rate": 0.001, + "loss": 1.3183, + "step": 5280 + }, + { + "epoch": 0.16869160368634203, + "grad_norm": 0.27447423338890076, + "learning_rate": 0.001, + "loss": 1.3102, + "step": 5290 + }, + { + "epoch": 0.16901049140597596, + "grad_norm": 0.2774691581726074, + "learning_rate": 0.001, + "loss": 1.3086, + "step": 5300 + }, + { + "epoch": 0.16932937912560989, + "grad_norm": 0.27635547518730164, + "learning_rate": 0.001, + "loss": 1.3152, + "step": 5310 + }, + { + "epoch": 0.16964826684524378, + "grad_norm": 0.28113260865211487, + "learning_rate": 0.001, + "loss": 1.3087, + "step": 5320 + }, + { + "epoch": 0.1699671545648777, + "grad_norm": 0.27385708689689636, + "learning_rate": 0.001, + "loss": 1.3259, + "step": 5330 + }, + { + "epoch": 0.17028604228451164, + "grad_norm": 0.2752876579761505, + "learning_rate": 0.001, + "loss": 1.3323, + "step": 5340 + }, + { + "epoch": 0.17060493000414553, + "grad_norm": 0.29049137234687805, + "learning_rate": 0.001, + "loss": 1.3128, + "step": 5350 + }, + { + "epoch": 0.17092381772377946, + "grad_norm": 0.2888701558113098, + "learning_rate": 0.001, + "loss": 1.3111, + "step": 5360 + }, + { + "epoch": 0.17124270544341338, + "grad_norm": 0.27347761392593384, + "learning_rate": 0.001, + "loss": 1.2978, + "step": 5370 + }, + { + "epoch": 0.17156159316304728, + "grad_norm": 0.26778650283813477, + "learning_rate": 0.001, + "loss": 1.2873, + "step": 5380 + }, + { + "epoch": 0.1718804808826812, + "grad_norm": 0.26770761609077454, + "learning_rate": 0.001, + "loss": 1.3033, + "step": 5390 + }, + { + "epoch": 0.17219936860231513, + "grad_norm": 0.27429458498954773, + "learning_rate": 0.001, + "loss": 1.3285, + "step": 5400 + }, + { + "epoch": 0.17251825632194903, + "grad_norm": 0.26610255241394043, + "learning_rate": 0.001, + "loss": 1.2999, + "step": 5410 + }, + { + "epoch": 0.17283714404158296, + "grad_norm": 0.2606316804885864, + "learning_rate": 0.001, + "loss": 1.3159, + "step": 5420 + }, + { + "epoch": 0.17315603176121688, + "grad_norm": 0.2717187702655792, + "learning_rate": 0.001, + "loss": 1.3083, + "step": 5430 + }, + { + "epoch": 0.17347491948085078, + "grad_norm": 0.2680508494377136, + "learning_rate": 0.001, + "loss": 1.3116, + "step": 5440 + }, + { + "epoch": 0.1737938072004847, + "grad_norm": 0.2761911153793335, + "learning_rate": 0.001, + "loss": 1.3151, + "step": 5450 + }, + { + "epoch": 0.17411269492011863, + "grad_norm": 0.28334981203079224, + "learning_rate": 0.001, + "loss": 1.2956, + "step": 5460 + }, + { + "epoch": 0.17443158263975253, + "grad_norm": 0.2760418653488159, + "learning_rate": 0.001, + "loss": 1.3197, + "step": 5470 + }, + { + "epoch": 0.17475047035938646, + "grad_norm": 0.2959541380405426, + "learning_rate": 0.001, + "loss": 1.3219, + "step": 5480 + }, + { + "epoch": 0.17506935807902038, + "grad_norm": 0.27441099286079407, + "learning_rate": 0.001, + "loss": 1.2796, + "step": 5490 + }, + { + "epoch": 0.1753882457986543, + "grad_norm": 0.27085989713668823, + "learning_rate": 0.001, + "loss": 1.285, + "step": 5500 + }, + { + "epoch": 0.1757071335182882, + "grad_norm": 0.27273258566856384, + "learning_rate": 0.001, + "loss": 1.3222, + "step": 5510 + }, + { + "epoch": 0.17602602123792213, + "grad_norm": 0.2756020724773407, + "learning_rate": 0.001, + "loss": 1.318, + "step": 5520 + }, + { + "epoch": 0.17634490895755606, + "grad_norm": 0.276136577129364, + "learning_rate": 0.001, + "loss": 1.2945, + "step": 5530 + }, + { + "epoch": 0.17666379667718995, + "grad_norm": 0.27136778831481934, + "learning_rate": 0.001, + "loss": 1.3012, + "step": 5540 + }, + { + "epoch": 0.17698268439682388, + "grad_norm": 0.2842538058757782, + "learning_rate": 0.001, + "loss": 1.315, + "step": 5550 + }, + { + "epoch": 0.1773015721164578, + "grad_norm": 0.27779433131217957, + "learning_rate": 0.001, + "loss": 1.3073, + "step": 5560 + }, + { + "epoch": 0.1776204598360917, + "grad_norm": 0.27324429154396057, + "learning_rate": 0.001, + "loss": 1.2957, + "step": 5570 + }, + { + "epoch": 0.17793934755572563, + "grad_norm": 0.272482305765152, + "learning_rate": 0.001, + "loss": 1.2816, + "step": 5580 + }, + { + "epoch": 0.17825823527535956, + "grad_norm": 0.2856181561946869, + "learning_rate": 0.001, + "loss": 1.322, + "step": 5590 + }, + { + "epoch": 0.17857712299499345, + "grad_norm": 0.27352333068847656, + "learning_rate": 0.001, + "loss": 1.3114, + "step": 5600 + }, + { + "epoch": 0.17889601071462738, + "grad_norm": 0.27149438858032227, + "learning_rate": 0.001, + "loss": 1.3057, + "step": 5610 + }, + { + "epoch": 0.1792148984342613, + "grad_norm": 0.2803710997104645, + "learning_rate": 0.001, + "loss": 1.3219, + "step": 5620 + }, + { + "epoch": 0.1795337861538952, + "grad_norm": 0.26612791419029236, + "learning_rate": 0.001, + "loss": 1.3059, + "step": 5630 + }, + { + "epoch": 0.17985267387352913, + "grad_norm": 0.2668563425540924, + "learning_rate": 0.001, + "loss": 1.2957, + "step": 5640 + }, + { + "epoch": 0.18017156159316305, + "grad_norm": 0.2617603540420532, + "learning_rate": 0.001, + "loss": 1.3134, + "step": 5650 + }, + { + "epoch": 0.18049044931279695, + "grad_norm": 0.26398953795433044, + "learning_rate": 0.001, + "loss": 1.3042, + "step": 5660 + }, + { + "epoch": 0.18080933703243088, + "grad_norm": 0.2688938081264496, + "learning_rate": 0.001, + "loss": 1.3085, + "step": 5670 + }, + { + "epoch": 0.1811282247520648, + "grad_norm": 0.2666449248790741, + "learning_rate": 0.001, + "loss": 1.2839, + "step": 5680 + }, + { + "epoch": 0.1814471124716987, + "grad_norm": 0.2589948773384094, + "learning_rate": 0.001, + "loss": 1.2923, + "step": 5690 + }, + { + "epoch": 0.18176600019133263, + "grad_norm": 0.2649987041950226, + "learning_rate": 0.001, + "loss": 1.2997, + "step": 5700 + }, + { + "epoch": 0.18208488791096655, + "grad_norm": 0.26647627353668213, + "learning_rate": 0.001, + "loss": 1.307, + "step": 5710 + }, + { + "epoch": 0.18240377563060048, + "grad_norm": 0.2552083432674408, + "learning_rate": 0.001, + "loss": 1.3178, + "step": 5720 + }, + { + "epoch": 0.18272266335023438, + "grad_norm": 0.28030291199684143, + "learning_rate": 0.001, + "loss": 1.2808, + "step": 5730 + }, + { + "epoch": 0.1830415510698683, + "grad_norm": 0.2664260268211365, + "learning_rate": 0.001, + "loss": 1.2746, + "step": 5740 + }, + { + "epoch": 0.18336043878950223, + "grad_norm": 0.258434921503067, + "learning_rate": 0.001, + "loss": 1.3166, + "step": 5750 + }, + { + "epoch": 0.18367932650913613, + "grad_norm": 0.27148333191871643, + "learning_rate": 0.001, + "loss": 1.3001, + "step": 5760 + }, + { + "epoch": 0.18399821422877005, + "grad_norm": 0.2850393056869507, + "learning_rate": 0.001, + "loss": 1.296, + "step": 5770 + }, + { + "epoch": 0.18431710194840398, + "grad_norm": 0.2537064254283905, + "learning_rate": 0.001, + "loss": 1.2861, + "step": 5780 + }, + { + "epoch": 0.18463598966803788, + "grad_norm": 0.26095375418663025, + "learning_rate": 0.001, + "loss": 1.2824, + "step": 5790 + }, + { + "epoch": 0.1849548773876718, + "grad_norm": 0.2742947041988373, + "learning_rate": 0.001, + "loss": 1.294, + "step": 5800 + }, + { + "epoch": 0.18527376510730573, + "grad_norm": 0.2614261209964752, + "learning_rate": 0.001, + "loss": 1.2955, + "step": 5810 + }, + { + "epoch": 0.18559265282693962, + "grad_norm": 0.2708014249801636, + "learning_rate": 0.001, + "loss": 1.2847, + "step": 5820 + }, + { + "epoch": 0.18591154054657355, + "grad_norm": 0.25221359729766846, + "learning_rate": 0.001, + "loss": 1.2916, + "step": 5830 + }, + { + "epoch": 0.18623042826620748, + "grad_norm": 0.2672080397605896, + "learning_rate": 0.001, + "loss": 1.2905, + "step": 5840 + }, + { + "epoch": 0.18654931598584137, + "grad_norm": 0.27737823128700256, + "learning_rate": 0.001, + "loss": 1.2926, + "step": 5850 + }, + { + "epoch": 0.1868682037054753, + "grad_norm": 0.2654053270816803, + "learning_rate": 0.001, + "loss": 1.2916, + "step": 5860 + }, + { + "epoch": 0.18718709142510923, + "grad_norm": 0.25915852189064026, + "learning_rate": 0.001, + "loss": 1.2842, + "step": 5870 + }, + { + "epoch": 0.18750597914474312, + "grad_norm": 0.26840394735336304, + "learning_rate": 0.001, + "loss": 1.3053, + "step": 5880 + }, + { + "epoch": 0.18782486686437705, + "grad_norm": 0.2692258954048157, + "learning_rate": 0.001, + "loss": 1.2909, + "step": 5890 + }, + { + "epoch": 0.18814375458401097, + "grad_norm": 0.2546669840812683, + "learning_rate": 0.001, + "loss": 1.2853, + "step": 5900 + }, + { + "epoch": 0.18846264230364487, + "grad_norm": 0.27450940012931824, + "learning_rate": 0.001, + "loss": 1.2816, + "step": 5910 + }, + { + "epoch": 0.1887815300232788, + "grad_norm": 0.2487523853778839, + "learning_rate": 0.001, + "loss": 1.2828, + "step": 5920 + }, + { + "epoch": 0.18910041774291272, + "grad_norm": 0.2829253375530243, + "learning_rate": 0.001, + "loss": 1.2841, + "step": 5930 + }, + { + "epoch": 0.18941930546254665, + "grad_norm": 0.26952558755874634, + "learning_rate": 0.001, + "loss": 1.2958, + "step": 5940 + }, + { + "epoch": 0.18973819318218055, + "grad_norm": 0.25571727752685547, + "learning_rate": 0.001, + "loss": 1.2796, + "step": 5950 + }, + { + "epoch": 0.19005708090181447, + "grad_norm": 0.26292943954467773, + "learning_rate": 0.001, + "loss": 1.3109, + "step": 5960 + }, + { + "epoch": 0.1903759686214484, + "grad_norm": 0.2683652937412262, + "learning_rate": 0.001, + "loss": 1.2942, + "step": 5970 + }, + { + "epoch": 0.1906948563410823, + "grad_norm": 0.2673659324645996, + "learning_rate": 0.001, + "loss": 1.2799, + "step": 5980 + }, + { + "epoch": 0.19101374406071622, + "grad_norm": 0.2598000764846802, + "learning_rate": 0.001, + "loss": 1.2904, + "step": 5990 + }, + { + "epoch": 0.19133263178035015, + "grad_norm": 0.2635900676250458, + "learning_rate": 0.001, + "loss": 1.2705, + "step": 6000 + }, + { + "epoch": 0.19165151949998405, + "grad_norm": 0.25515443086624146, + "learning_rate": 0.001, + "loss": 1.2951, + "step": 6010 + }, + { + "epoch": 0.19197040721961797, + "grad_norm": 0.2600856423377991, + "learning_rate": 0.001, + "loss": 1.2733, + "step": 6020 + }, + { + "epoch": 0.1922892949392519, + "grad_norm": 0.2564580738544464, + "learning_rate": 0.001, + "loss": 1.2617, + "step": 6030 + }, + { + "epoch": 0.1926081826588858, + "grad_norm": 0.2747453451156616, + "learning_rate": 0.001, + "loss": 1.2833, + "step": 6040 + }, + { + "epoch": 0.19292707037851972, + "grad_norm": 0.26666703820228577, + "learning_rate": 0.001, + "loss": 1.2765, + "step": 6050 + }, + { + "epoch": 0.19324595809815365, + "grad_norm": 0.27212536334991455, + "learning_rate": 0.001, + "loss": 1.2701, + "step": 6060 + }, + { + "epoch": 0.19356484581778755, + "grad_norm": 0.2613239288330078, + "learning_rate": 0.001, + "loss": 1.278, + "step": 6070 + }, + { + "epoch": 0.19388373353742147, + "grad_norm": 0.27437734603881836, + "learning_rate": 0.001, + "loss": 1.275, + "step": 6080 + }, + { + "epoch": 0.1942026212570554, + "grad_norm": 0.26033124327659607, + "learning_rate": 0.001, + "loss": 1.2795, + "step": 6090 + }, + { + "epoch": 0.1945215089766893, + "grad_norm": 0.25559207797050476, + "learning_rate": 0.001, + "loss": 1.2808, + "step": 6100 + }, + { + "epoch": 0.19484039669632322, + "grad_norm": 0.26534515619277954, + "learning_rate": 0.001, + "loss": 1.2914, + "step": 6110 + }, + { + "epoch": 0.19515928441595715, + "grad_norm": 0.27692893147468567, + "learning_rate": 0.001, + "loss": 1.2815, + "step": 6120 + }, + { + "epoch": 0.19547817213559107, + "grad_norm": 0.27069565653800964, + "learning_rate": 0.001, + "loss": 1.2594, + "step": 6130 + }, + { + "epoch": 0.19579705985522497, + "grad_norm": 0.2529548108577728, + "learning_rate": 0.001, + "loss": 1.2881, + "step": 6140 + }, + { + "epoch": 0.1961159475748589, + "grad_norm": 0.25786975026130676, + "learning_rate": 0.001, + "loss": 1.2543, + "step": 6150 + }, + { + "epoch": 0.19643483529449282, + "grad_norm": 0.25865936279296875, + "learning_rate": 0.001, + "loss": 1.2623, + "step": 6160 + }, + { + "epoch": 0.19675372301412672, + "grad_norm": 0.2520538568496704, + "learning_rate": 0.001, + "loss": 1.2958, + "step": 6170 + }, + { + "epoch": 0.19707261073376064, + "grad_norm": 0.26968657970428467, + "learning_rate": 0.001, + "loss": 1.2757, + "step": 6180 + }, + { + "epoch": 0.19739149845339457, + "grad_norm": 0.2608780264854431, + "learning_rate": 0.001, + "loss": 1.2955, + "step": 6190 + }, + { + "epoch": 0.19771038617302847, + "grad_norm": 0.2636774182319641, + "learning_rate": 0.001, + "loss": 1.281, + "step": 6200 + }, + { + "epoch": 0.1980292738926624, + "grad_norm": 0.250747948884964, + "learning_rate": 0.001, + "loss": 1.286, + "step": 6210 + }, + { + "epoch": 0.19834816161229632, + "grad_norm": 0.2567255198955536, + "learning_rate": 0.001, + "loss": 1.258, + "step": 6220 + }, + { + "epoch": 0.19866704933193022, + "grad_norm": 0.2622133195400238, + "learning_rate": 0.001, + "loss": 1.2623, + "step": 6230 + }, + { + "epoch": 0.19898593705156414, + "grad_norm": 0.2595137655735016, + "learning_rate": 0.001, + "loss": 1.2697, + "step": 6240 + }, + { + "epoch": 0.19930482477119807, + "grad_norm": 0.25806212425231934, + "learning_rate": 0.001, + "loss": 1.2863, + "step": 6250 + }, + { + "epoch": 0.19962371249083197, + "grad_norm": 0.2540823817253113, + "learning_rate": 0.001, + "loss": 1.2602, + "step": 6260 + }, + { + "epoch": 0.1999426002104659, + "grad_norm": 0.26109036803245544, + "learning_rate": 0.001, + "loss": 1.277, + "step": 6270 + }, + { + "epoch": 0.20026148793009982, + "grad_norm": 0.25082194805145264, + "learning_rate": 0.001, + "loss": 1.2713, + "step": 6280 + }, + { + "epoch": 0.20058037564973372, + "grad_norm": 0.26120105385780334, + "learning_rate": 0.001, + "loss": 1.2857, + "step": 6290 + }, + { + "epoch": 0.20089926336936764, + "grad_norm": 0.2576233446598053, + "learning_rate": 0.001, + "loss": 1.2767, + "step": 6300 + }, + { + "epoch": 0.20121815108900157, + "grad_norm": 0.2628646194934845, + "learning_rate": 0.001, + "loss": 1.272, + "step": 6310 + }, + { + "epoch": 0.20153703880863547, + "grad_norm": 0.2471473067998886, + "learning_rate": 0.001, + "loss": 1.2702, + "step": 6320 + }, + { + "epoch": 0.2018559265282694, + "grad_norm": 0.2571963369846344, + "learning_rate": 0.001, + "loss": 1.2662, + "step": 6330 + }, + { + "epoch": 0.20217481424790332, + "grad_norm": 0.25734174251556396, + "learning_rate": 0.001, + "loss": 1.2718, + "step": 6340 + }, + { + "epoch": 0.20249370196753724, + "grad_norm": 0.25071901082992554, + "learning_rate": 0.001, + "loss": 1.2618, + "step": 6350 + }, + { + "epoch": 0.20281258968717114, + "grad_norm": 0.25671815872192383, + "learning_rate": 0.001, + "loss": 1.2696, + "step": 6360 + }, + { + "epoch": 0.20313147740680507, + "grad_norm": 0.2598605453968048, + "learning_rate": 0.001, + "loss": 1.2859, + "step": 6370 + }, + { + "epoch": 0.203450365126439, + "grad_norm": 0.26527634263038635, + "learning_rate": 0.001, + "loss": 1.2794, + "step": 6380 + }, + { + "epoch": 0.2037692528460729, + "grad_norm": 0.26094022393226624, + "learning_rate": 0.001, + "loss": 1.2636, + "step": 6390 + }, + { + "epoch": 0.20408814056570682, + "grad_norm": 0.2590426802635193, + "learning_rate": 0.001, + "loss": 1.2553, + "step": 6400 + }, + { + "epoch": 0.20440702828534074, + "grad_norm": 0.25953012704849243, + "learning_rate": 0.001, + "loss": 1.2787, + "step": 6410 + }, + { + "epoch": 0.20472591600497464, + "grad_norm": 0.25189971923828125, + "learning_rate": 0.001, + "loss": 1.2597, + "step": 6420 + }, + { + "epoch": 0.20504480372460857, + "grad_norm": 0.2543141841888428, + "learning_rate": 0.001, + "loss": 1.2896, + "step": 6430 + }, + { + "epoch": 0.2053636914442425, + "grad_norm": 0.24777406454086304, + "learning_rate": 0.001, + "loss": 1.2656, + "step": 6440 + }, + { + "epoch": 0.2056825791638764, + "grad_norm": 0.25355079770088196, + "learning_rate": 0.001, + "loss": 1.2605, + "step": 6450 + }, + { + "epoch": 0.20600146688351031, + "grad_norm": 0.2523370683193207, + "learning_rate": 0.001, + "loss": 1.2617, + "step": 6460 + }, + { + "epoch": 0.20632035460314424, + "grad_norm": 0.2629269063472748, + "learning_rate": 0.001, + "loss": 1.2741, + "step": 6470 + }, + { + "epoch": 0.20663924232277814, + "grad_norm": 0.2504100799560547, + "learning_rate": 0.001, + "loss": 1.2715, + "step": 6480 + }, + { + "epoch": 0.20695813004241206, + "grad_norm": 0.2450273036956787, + "learning_rate": 0.001, + "loss": 1.2705, + "step": 6490 + }, + { + "epoch": 0.207277017762046, + "grad_norm": 0.2537860870361328, + "learning_rate": 0.001, + "loss": 1.2565, + "step": 6500 + }, + { + "epoch": 0.2075959054816799, + "grad_norm": 0.24981524050235748, + "learning_rate": 0.001, + "loss": 1.2587, + "step": 6510 + }, + { + "epoch": 0.2079147932013138, + "grad_norm": 0.24786381423473358, + "learning_rate": 0.001, + "loss": 1.2628, + "step": 6520 + }, + { + "epoch": 0.20823368092094774, + "grad_norm": 0.25516125559806824, + "learning_rate": 0.001, + "loss": 1.2695, + "step": 6530 + }, + { + "epoch": 0.20855256864058166, + "grad_norm": 0.24738313257694244, + "learning_rate": 0.001, + "loss": 1.2857, + "step": 6540 + }, + { + "epoch": 0.20887145636021556, + "grad_norm": 0.25449132919311523, + "learning_rate": 0.001, + "loss": 1.2663, + "step": 6550 + }, + { + "epoch": 0.2091903440798495, + "grad_norm": 0.24622990190982819, + "learning_rate": 0.001, + "loss": 1.2639, + "step": 6560 + }, + { + "epoch": 0.20950923179948341, + "grad_norm": 0.24932384490966797, + "learning_rate": 0.001, + "loss": 1.2662, + "step": 6570 + }, + { + "epoch": 0.2098281195191173, + "grad_norm": 0.24670833349227905, + "learning_rate": 0.001, + "loss": 1.2637, + "step": 6580 + }, + { + "epoch": 0.21014700723875124, + "grad_norm": 0.24999317526817322, + "learning_rate": 0.001, + "loss": 1.2673, + "step": 6590 + }, + { + "epoch": 0.21046589495838516, + "grad_norm": 0.2563895881175995, + "learning_rate": 0.001, + "loss": 1.249, + "step": 6600 + }, + { + "epoch": 0.21078478267801906, + "grad_norm": 0.2394222617149353, + "learning_rate": 0.001, + "loss": 1.2653, + "step": 6610 + }, + { + "epoch": 0.211103670397653, + "grad_norm": 0.24972309172153473, + "learning_rate": 0.001, + "loss": 1.2585, + "step": 6620 + }, + { + "epoch": 0.2114225581172869, + "grad_norm": 0.2598929703235626, + "learning_rate": 0.001, + "loss": 1.2682, + "step": 6630 + }, + { + "epoch": 0.2117414458369208, + "grad_norm": 0.2623753249645233, + "learning_rate": 0.001, + "loss": 1.2789, + "step": 6640 + }, + { + "epoch": 0.21206033355655474, + "grad_norm": 0.2436663657426834, + "learning_rate": 0.001, + "loss": 1.2495, + "step": 6650 + }, + { + "epoch": 0.21237922127618866, + "grad_norm": 0.25046613812446594, + "learning_rate": 0.001, + "loss": 1.2666, + "step": 6660 + }, + { + "epoch": 0.21269810899582256, + "grad_norm": 0.24722261726856232, + "learning_rate": 0.001, + "loss": 1.2628, + "step": 6670 + }, + { + "epoch": 0.21301699671545649, + "grad_norm": 0.26566001772880554, + "learning_rate": 0.001, + "loss": 1.2589, + "step": 6680 + }, + { + "epoch": 0.2133358844350904, + "grad_norm": 0.259066104888916, + "learning_rate": 0.001, + "loss": 1.2713, + "step": 6690 + }, + { + "epoch": 0.2136547721547243, + "grad_norm": 0.25356295704841614, + "learning_rate": 0.001, + "loss": 1.2642, + "step": 6700 + }, + { + "epoch": 0.21397365987435824, + "grad_norm": 0.24706615507602692, + "learning_rate": 0.001, + "loss": 1.2488, + "step": 6710 + }, + { + "epoch": 0.21429254759399216, + "grad_norm": 0.2540133595466614, + "learning_rate": 0.001, + "loss": 1.2659, + "step": 6720 + }, + { + "epoch": 0.21461143531362606, + "grad_norm": 0.2536522448062897, + "learning_rate": 0.001, + "loss": 1.2542, + "step": 6730 + }, + { + "epoch": 0.21493032303325998, + "grad_norm": 0.24283340573310852, + "learning_rate": 0.001, + "loss": 1.2612, + "step": 6740 + }, + { + "epoch": 0.2152492107528939, + "grad_norm": 0.2563481330871582, + "learning_rate": 0.001, + "loss": 1.2744, + "step": 6750 + }, + { + "epoch": 0.21556809847252784, + "grad_norm": 0.24372535943984985, + "learning_rate": 0.001, + "loss": 1.2694, + "step": 6760 + }, + { + "epoch": 0.21588698619216173, + "grad_norm": 0.24880190193653107, + "learning_rate": 0.001, + "loss": 1.2522, + "step": 6770 + }, + { + "epoch": 0.21620587391179566, + "grad_norm": 0.25291043519973755, + "learning_rate": 0.001, + "loss": 1.2463, + "step": 6780 + }, + { + "epoch": 0.21652476163142959, + "grad_norm": 0.24317456781864166, + "learning_rate": 0.001, + "loss": 1.2431, + "step": 6790 + }, + { + "epoch": 0.21684364935106348, + "grad_norm": 0.2563396990299225, + "learning_rate": 0.001, + "loss": 1.2359, + "step": 6800 + }, + { + "epoch": 0.2171625370706974, + "grad_norm": 0.24072889983654022, + "learning_rate": 0.001, + "loss": 1.2708, + "step": 6810 + }, + { + "epoch": 0.21748142479033133, + "grad_norm": 0.25073298811912537, + "learning_rate": 0.001, + "loss": 1.2582, + "step": 6820 + }, + { + "epoch": 0.21780031250996523, + "grad_norm": 0.25314459204673767, + "learning_rate": 0.001, + "loss": 1.2544, + "step": 6830 + }, + { + "epoch": 0.21811920022959916, + "grad_norm": 0.2519735097885132, + "learning_rate": 0.001, + "loss": 1.2522, + "step": 6840 + }, + { + "epoch": 0.21843808794923308, + "grad_norm": 0.24390500783920288, + "learning_rate": 0.001, + "loss": 1.2366, + "step": 6850 + }, + { + "epoch": 0.21875697566886698, + "grad_norm": 0.2493201494216919, + "learning_rate": 0.001, + "loss": 1.2402, + "step": 6860 + }, + { + "epoch": 0.2190758633885009, + "grad_norm": 0.24233953654766083, + "learning_rate": 0.001, + "loss": 1.2553, + "step": 6870 + }, + { + "epoch": 0.21939475110813483, + "grad_norm": 0.2534473240375519, + "learning_rate": 0.001, + "loss": 1.2575, + "step": 6880 + }, + { + "epoch": 0.21971363882776873, + "grad_norm": 0.24528414011001587, + "learning_rate": 0.001, + "loss": 1.2536, + "step": 6890 + }, + { + "epoch": 0.22003252654740266, + "grad_norm": 0.24577198922634125, + "learning_rate": 0.001, + "loss": 1.262, + "step": 6900 + }, + { + "epoch": 0.22035141426703658, + "grad_norm": 0.25635457038879395, + "learning_rate": 0.001, + "loss": 1.2592, + "step": 6910 + }, + { + "epoch": 0.22067030198667048, + "grad_norm": 0.24953170120716095, + "learning_rate": 0.001, + "loss": 1.2712, + "step": 6920 + }, + { + "epoch": 0.2209891897063044, + "grad_norm": 0.24969792366027832, + "learning_rate": 0.001, + "loss": 1.261, + "step": 6930 + }, + { + "epoch": 0.22130807742593833, + "grad_norm": 0.2447703331708908, + "learning_rate": 0.001, + "loss": 1.2449, + "step": 6940 + }, + { + "epoch": 0.22162696514557226, + "grad_norm": 0.25282493233680725, + "learning_rate": 0.001, + "loss": 1.2434, + "step": 6950 + }, + { + "epoch": 0.22194585286520616, + "grad_norm": 0.24152617156505585, + "learning_rate": 0.001, + "loss": 1.2362, + "step": 6960 + }, + { + "epoch": 0.22226474058484008, + "grad_norm": 0.2417408972978592, + "learning_rate": 0.001, + "loss": 1.2521, + "step": 6970 + }, + { + "epoch": 0.222583628304474, + "grad_norm": 0.2589294910430908, + "learning_rate": 0.001, + "loss": 1.2462, + "step": 6980 + }, + { + "epoch": 0.2229025160241079, + "grad_norm": 0.24300368130207062, + "learning_rate": 0.001, + "loss": 1.2606, + "step": 6990 + }, + { + "epoch": 0.22322140374374183, + "grad_norm": 0.2448713481426239, + "learning_rate": 0.001, + "loss": 1.2741, + "step": 7000 + }, + { + "epoch": 0.22354029146337576, + "grad_norm": 0.2419728934764862, + "learning_rate": 0.001, + "loss": 1.2359, + "step": 7010 + }, + { + "epoch": 0.22385917918300965, + "grad_norm": 0.2582404911518097, + "learning_rate": 0.001, + "loss": 1.2584, + "step": 7020 + }, + { + "epoch": 0.22417806690264358, + "grad_norm": 0.2443564236164093, + "learning_rate": 0.001, + "loss": 1.2251, + "step": 7030 + }, + { + "epoch": 0.2244969546222775, + "grad_norm": 0.2606917917728424, + "learning_rate": 0.001, + "loss": 1.2312, + "step": 7040 + }, + { + "epoch": 0.2248158423419114, + "grad_norm": 0.24423828721046448, + "learning_rate": 0.001, + "loss": 1.2456, + "step": 7050 + }, + { + "epoch": 0.22513473006154533, + "grad_norm": 0.23391813039779663, + "learning_rate": 0.001, + "loss": 1.2435, + "step": 7060 + }, + { + "epoch": 0.22545361778117926, + "grad_norm": 0.2510395050048828, + "learning_rate": 0.001, + "loss": 1.243, + "step": 7070 + }, + { + "epoch": 0.22577250550081315, + "grad_norm": 0.24144716560840607, + "learning_rate": 0.001, + "loss": 1.2645, + "step": 7080 + }, + { + "epoch": 0.22609139322044708, + "grad_norm": 0.2472696304321289, + "learning_rate": 0.001, + "loss": 1.2681, + "step": 7090 + }, + { + "epoch": 0.226410280940081, + "grad_norm": 0.2584279179573059, + "learning_rate": 0.001, + "loss": 1.2481, + "step": 7100 + }, + { + "epoch": 0.2267291686597149, + "grad_norm": 0.24365928769111633, + "learning_rate": 0.001, + "loss": 1.2629, + "step": 7110 + }, + { + "epoch": 0.22704805637934883, + "grad_norm": 0.23801958560943604, + "learning_rate": 0.001, + "loss": 1.24, + "step": 7120 + }, + { + "epoch": 0.22736694409898275, + "grad_norm": 0.2477579265832901, + "learning_rate": 0.001, + "loss": 1.2501, + "step": 7130 + }, + { + "epoch": 0.22768583181861665, + "grad_norm": 0.244456946849823, + "learning_rate": 0.001, + "loss": 1.2524, + "step": 7140 + }, + { + "epoch": 0.22800471953825058, + "grad_norm": 0.24829816818237305, + "learning_rate": 0.001, + "loss": 1.2458, + "step": 7150 + }, + { + "epoch": 0.2283236072578845, + "grad_norm": 0.2522318661212921, + "learning_rate": 0.001, + "loss": 1.2503, + "step": 7160 + }, + { + "epoch": 0.22864249497751843, + "grad_norm": 0.24825812876224518, + "learning_rate": 0.001, + "loss": 1.2666, + "step": 7170 + }, + { + "epoch": 0.22896138269715233, + "grad_norm": 0.2433803528547287, + "learning_rate": 0.001, + "loss": 1.2335, + "step": 7180 + }, + { + "epoch": 0.22928027041678625, + "grad_norm": 0.24022041261196136, + "learning_rate": 0.001, + "loss": 1.2387, + "step": 7190 + }, + { + "epoch": 0.22959915813642018, + "grad_norm": 0.2453920543193817, + "learning_rate": 0.001, + "loss": 1.2411, + "step": 7200 + }, + { + "epoch": 0.22991804585605408, + "grad_norm": 0.25992751121520996, + "learning_rate": 0.001, + "loss": 1.2367, + "step": 7210 + }, + { + "epoch": 0.230236933575688, + "grad_norm": 0.24138158559799194, + "learning_rate": 0.001, + "loss": 1.2157, + "step": 7220 + }, + { + "epoch": 0.23055582129532193, + "grad_norm": 0.23895490169525146, + "learning_rate": 0.001, + "loss": 1.2484, + "step": 7230 + }, + { + "epoch": 0.23087470901495583, + "grad_norm": 0.24053625762462616, + "learning_rate": 0.001, + "loss": 1.2554, + "step": 7240 + }, + { + "epoch": 0.23119359673458975, + "grad_norm": 0.24910859763622284, + "learning_rate": 0.001, + "loss": 1.2431, + "step": 7250 + }, + { + "epoch": 0.23151248445422368, + "grad_norm": 0.24892625212669373, + "learning_rate": 0.001, + "loss": 1.2313, + "step": 7260 + }, + { + "epoch": 0.23183137217385758, + "grad_norm": 0.24436773359775543, + "learning_rate": 0.001, + "loss": 1.2304, + "step": 7270 + }, + { + "epoch": 0.2321502598934915, + "grad_norm": 0.2511202096939087, + "learning_rate": 0.001, + "loss": 1.2574, + "step": 7280 + }, + { + "epoch": 0.23246914761312543, + "grad_norm": 0.24174362421035767, + "learning_rate": 0.001, + "loss": 1.2455, + "step": 7290 + }, + { + "epoch": 0.23278803533275932, + "grad_norm": 0.240260049700737, + "learning_rate": 0.001, + "loss": 1.2362, + "step": 7300 + }, + { + "epoch": 0.23310692305239325, + "grad_norm": 0.23553872108459473, + "learning_rate": 0.001, + "loss": 1.2563, + "step": 7310 + }, + { + "epoch": 0.23342581077202718, + "grad_norm": 0.24824745953083038, + "learning_rate": 0.001, + "loss": 1.2357, + "step": 7320 + }, + { + "epoch": 0.23374469849166107, + "grad_norm": 0.24948884546756744, + "learning_rate": 0.001, + "loss": 1.2236, + "step": 7330 + }, + { + "epoch": 0.234063586211295, + "grad_norm": 0.23782293498516083, + "learning_rate": 0.001, + "loss": 1.2443, + "step": 7340 + }, + { + "epoch": 0.23438247393092893, + "grad_norm": 0.23776574432849884, + "learning_rate": 0.001, + "loss": 1.2445, + "step": 7350 + }, + { + "epoch": 0.23470136165056282, + "grad_norm": 0.24230794608592987, + "learning_rate": 0.001, + "loss": 1.2339, + "step": 7360 + }, + { + "epoch": 0.23502024937019675, + "grad_norm": 0.23560328781604767, + "learning_rate": 0.001, + "loss": 1.2359, + "step": 7370 + }, + { + "epoch": 0.23533913708983067, + "grad_norm": 0.23826272785663605, + "learning_rate": 0.001, + "loss": 1.2416, + "step": 7380 + }, + { + "epoch": 0.2356580248094646, + "grad_norm": 0.2352159023284912, + "learning_rate": 0.001, + "loss": 1.2457, + "step": 7390 + }, + { + "epoch": 0.2359769125290985, + "grad_norm": 0.24641361832618713, + "learning_rate": 0.001, + "loss": 1.2282, + "step": 7400 + }, + { + "epoch": 0.23629580024873242, + "grad_norm": 0.23941314220428467, + "learning_rate": 0.001, + "loss": 1.2317, + "step": 7410 + }, + { + "epoch": 0.23661468796836635, + "grad_norm": 0.25254225730895996, + "learning_rate": 0.001, + "loss": 1.2565, + "step": 7420 + }, + { + "epoch": 0.23693357568800025, + "grad_norm": 0.2551426291465759, + "learning_rate": 0.001, + "loss": 1.2174, + "step": 7430 + }, + { + "epoch": 0.23725246340763417, + "grad_norm": 0.2395494133234024, + "learning_rate": 0.001, + "loss": 1.2315, + "step": 7440 + }, + { + "epoch": 0.2375713511272681, + "grad_norm": 0.24731141328811646, + "learning_rate": 0.001, + "loss": 1.2533, + "step": 7450 + }, + { + "epoch": 0.237890238846902, + "grad_norm": 0.24077807366847992, + "learning_rate": 0.001, + "loss": 1.2508, + "step": 7460 + }, + { + "epoch": 0.23820912656653592, + "grad_norm": 0.23398423194885254, + "learning_rate": 0.001, + "loss": 1.2408, + "step": 7470 + }, + { + "epoch": 0.23852801428616985, + "grad_norm": 0.25103890895843506, + "learning_rate": 0.001, + "loss": 1.2552, + "step": 7480 + }, + { + "epoch": 0.23884690200580375, + "grad_norm": 0.24333910644054413, + "learning_rate": 0.001, + "loss": 1.2284, + "step": 7490 + }, + { + "epoch": 0.23916578972543767, + "grad_norm": 0.24267978966236115, + "learning_rate": 0.001, + "loss": 1.2541, + "step": 7500 + }, + { + "epoch": 0.2394846774450716, + "grad_norm": 0.24442200362682343, + "learning_rate": 0.001, + "loss": 1.2373, + "step": 7510 + }, + { + "epoch": 0.2398035651647055, + "grad_norm": 0.23441925644874573, + "learning_rate": 0.001, + "loss": 1.2411, + "step": 7520 + }, + { + "epoch": 0.24012245288433942, + "grad_norm": 0.2286960482597351, + "learning_rate": 0.001, + "loss": 1.2428, + "step": 7530 + }, + { + "epoch": 0.24044134060397335, + "grad_norm": 0.23532751202583313, + "learning_rate": 0.001, + "loss": 1.2352, + "step": 7540 + }, + { + "epoch": 0.24076022832360724, + "grad_norm": 0.24091851711273193, + "learning_rate": 0.001, + "loss": 1.2205, + "step": 7550 + }, + { + "epoch": 0.24107911604324117, + "grad_norm": 0.23746101558208466, + "learning_rate": 0.001, + "loss": 1.2307, + "step": 7560 + }, + { + "epoch": 0.2413980037628751, + "grad_norm": 0.25066760182380676, + "learning_rate": 0.001, + "loss": 1.2517, + "step": 7570 + }, + { + "epoch": 0.24171689148250902, + "grad_norm": 0.2366878241300583, + "learning_rate": 0.001, + "loss": 1.2313, + "step": 7580 + }, + { + "epoch": 0.24203577920214292, + "grad_norm": 0.2423858940601349, + "learning_rate": 0.001, + "loss": 1.2344, + "step": 7590 + }, + { + "epoch": 0.24235466692177685, + "grad_norm": 0.2289690375328064, + "learning_rate": 0.001, + "loss": 1.2471, + "step": 7600 + }, + { + "epoch": 0.24267355464141077, + "grad_norm": 0.23971182107925415, + "learning_rate": 0.001, + "loss": 1.2137, + "step": 7610 + }, + { + "epoch": 0.24299244236104467, + "grad_norm": 0.252573162317276, + "learning_rate": 0.001, + "loss": 1.2214, + "step": 7620 + }, + { + "epoch": 0.2433113300806786, + "grad_norm": 0.23575535416603088, + "learning_rate": 0.001, + "loss": 1.2391, + "step": 7630 + }, + { + "epoch": 0.24363021780031252, + "grad_norm": 0.23862920701503754, + "learning_rate": 0.001, + "loss": 1.2582, + "step": 7640 + }, + { + "epoch": 0.24394910551994642, + "grad_norm": 0.23157523572444916, + "learning_rate": 0.001, + "loss": 1.2299, + "step": 7650 + }, + { + "epoch": 0.24426799323958034, + "grad_norm": 0.24095961451530457, + "learning_rate": 0.001, + "loss": 1.2256, + "step": 7660 + }, + { + "epoch": 0.24458688095921427, + "grad_norm": 0.23619233071804047, + "learning_rate": 0.001, + "loss": 1.2291, + "step": 7670 + }, + { + "epoch": 0.24490576867884817, + "grad_norm": 0.23273909091949463, + "learning_rate": 0.001, + "loss": 1.2175, + "step": 7680 + }, + { + "epoch": 0.2452246563984821, + "grad_norm": 0.24084524810314178, + "learning_rate": 0.001, + "loss": 1.2414, + "step": 7690 + }, + { + "epoch": 0.24554354411811602, + "grad_norm": 0.25185689330101013, + "learning_rate": 0.001, + "loss": 1.2254, + "step": 7700 + }, + { + "epoch": 0.24586243183774992, + "grad_norm": 0.22970926761627197, + "learning_rate": 0.001, + "loss": 1.2073, + "step": 7710 + }, + { + "epoch": 0.24618131955738384, + "grad_norm": 0.23903733491897583, + "learning_rate": 0.001, + "loss": 1.2254, + "step": 7720 + }, + { + "epoch": 0.24650020727701777, + "grad_norm": 0.23516680300235748, + "learning_rate": 0.001, + "loss": 1.2347, + "step": 7730 + }, + { + "epoch": 0.24681909499665167, + "grad_norm": 0.24322205781936646, + "learning_rate": 0.001, + "loss": 1.2315, + "step": 7740 + }, + { + "epoch": 0.2471379827162856, + "grad_norm": 0.2339169830083847, + "learning_rate": 0.001, + "loss": 1.2511, + "step": 7750 + }, + { + "epoch": 0.24745687043591952, + "grad_norm": 0.23135200142860413, + "learning_rate": 0.001, + "loss": 1.2234, + "step": 7760 + }, + { + "epoch": 0.24777575815555342, + "grad_norm": 0.23176640272140503, + "learning_rate": 0.001, + "loss": 1.2359, + "step": 7770 + }, + { + "epoch": 0.24809464587518734, + "grad_norm": 0.24246467649936676, + "learning_rate": 0.001, + "loss": 1.211, + "step": 7780 + }, + { + "epoch": 0.24841353359482127, + "grad_norm": 0.23513196408748627, + "learning_rate": 0.001, + "loss": 1.2461, + "step": 7790 + }, + { + "epoch": 0.2487324213144552, + "grad_norm": 0.2445318102836609, + "learning_rate": 0.001, + "loss": 1.2237, + "step": 7800 + }, + { + "epoch": 0.2490513090340891, + "grad_norm": 0.2518177628517151, + "learning_rate": 0.001, + "loss": 1.2365, + "step": 7810 + }, + { + "epoch": 0.24937019675372302, + "grad_norm": 0.2446645200252533, + "learning_rate": 0.001, + "loss": 1.2237, + "step": 7820 + }, + { + "epoch": 0.24968908447335694, + "grad_norm": 0.229459747672081, + "learning_rate": 0.001, + "loss": 1.2074, + "step": 7830 + }, + { + "epoch": 0.25000797219299087, + "grad_norm": 0.2381288856267929, + "learning_rate": 0.001, + "loss": 1.214, + "step": 7840 + }, + { + "epoch": 0.25032685991262477, + "grad_norm": 0.2357947826385498, + "learning_rate": 0.001, + "loss": 1.2024, + "step": 7850 + }, + { + "epoch": 0.25064574763225866, + "grad_norm": 0.22776131331920624, + "learning_rate": 0.001, + "loss": 1.2011, + "step": 7860 + }, + { + "epoch": 0.2509646353518926, + "grad_norm": 0.23940008878707886, + "learning_rate": 0.001, + "loss": 1.2098, + "step": 7870 + }, + { + "epoch": 0.2512835230715265, + "grad_norm": 0.23641014099121094, + "learning_rate": 0.001, + "loss": 1.2112, + "step": 7880 + }, + { + "epoch": 0.2516024107911604, + "grad_norm": 0.2425474226474762, + "learning_rate": 0.001, + "loss": 1.2518, + "step": 7890 + }, + { + "epoch": 0.25192129851079437, + "grad_norm": 0.23014706373214722, + "learning_rate": 0.001, + "loss": 1.2273, + "step": 7900 + }, + { + "epoch": 0.25224018623042826, + "grad_norm": 0.24386554956436157, + "learning_rate": 0.001, + "loss": 1.2195, + "step": 7910 + }, + { + "epoch": 0.25255907395006216, + "grad_norm": 0.2404145896434784, + "learning_rate": 0.001, + "loss": 1.2307, + "step": 7920 + }, + { + "epoch": 0.2528779616696961, + "grad_norm": 0.25573667883872986, + "learning_rate": 0.001, + "loss": 1.2186, + "step": 7930 + }, + { + "epoch": 0.25319684938933, + "grad_norm": 0.24742534756660461, + "learning_rate": 0.001, + "loss": 1.2188, + "step": 7940 + }, + { + "epoch": 0.2535157371089639, + "grad_norm": 0.22783447802066803, + "learning_rate": 0.001, + "loss": 1.2203, + "step": 7950 + }, + { + "epoch": 0.25383462482859787, + "grad_norm": 0.24916143715381622, + "learning_rate": 0.001, + "loss": 1.2348, + "step": 7960 + }, + { + "epoch": 0.25415351254823176, + "grad_norm": 0.23791836202144623, + "learning_rate": 0.001, + "loss": 1.2223, + "step": 7970 + }, + { + "epoch": 0.25447240026786566, + "grad_norm": 0.2505311667919159, + "learning_rate": 0.001, + "loss": 1.2459, + "step": 7980 + }, + { + "epoch": 0.2547912879874996, + "grad_norm": 0.23071864247322083, + "learning_rate": 0.001, + "loss": 1.2412, + "step": 7990 + }, + { + "epoch": 0.2551101757071335, + "grad_norm": 0.23231177031993866, + "learning_rate": 0.001, + "loss": 1.2135, + "step": 8000 + }, + { + "epoch": 0.2554290634267674, + "grad_norm": 0.2333109825849533, + "learning_rate": 0.001, + "loss": 1.208, + "step": 8010 + }, + { + "epoch": 0.25574795114640136, + "grad_norm": 0.22724169492721558, + "learning_rate": 0.001, + "loss": 1.224, + "step": 8020 + }, + { + "epoch": 0.25606683886603526, + "grad_norm": 0.21973343193531036, + "learning_rate": 0.001, + "loss": 1.2057, + "step": 8030 + }, + { + "epoch": 0.25638572658566916, + "grad_norm": 0.23393845558166504, + "learning_rate": 0.001, + "loss": 1.2247, + "step": 8040 + }, + { + "epoch": 0.2567046143053031, + "grad_norm": 0.23509255051612854, + "learning_rate": 0.001, + "loss": 1.2381, + "step": 8050 + }, + { + "epoch": 0.257023502024937, + "grad_norm": 0.2345188558101654, + "learning_rate": 0.001, + "loss": 1.2221, + "step": 8060 + }, + { + "epoch": 0.2573423897445709, + "grad_norm": 0.2355467826128006, + "learning_rate": 0.001, + "loss": 1.2338, + "step": 8070 + }, + { + "epoch": 0.25766127746420486, + "grad_norm": 0.2355579137802124, + "learning_rate": 0.001, + "loss": 1.2197, + "step": 8080 + }, + { + "epoch": 0.25798016518383876, + "grad_norm": 0.23406566679477692, + "learning_rate": 0.001, + "loss": 1.2519, + "step": 8090 + }, + { + "epoch": 0.2582990529034727, + "grad_norm": 0.22977030277252197, + "learning_rate": 0.001, + "loss": 1.2381, + "step": 8100 + }, + { + "epoch": 0.2586179406231066, + "grad_norm": 0.23623408377170563, + "learning_rate": 0.001, + "loss": 1.2299, + "step": 8110 + }, + { + "epoch": 0.2589368283427405, + "grad_norm": 0.22804781794548035, + "learning_rate": 0.001, + "loss": 1.2062, + "step": 8120 + }, + { + "epoch": 0.25925571606237446, + "grad_norm": 0.23730701208114624, + "learning_rate": 0.001, + "loss": 1.2091, + "step": 8130 + }, + { + "epoch": 0.25957460378200836, + "grad_norm": 0.23314902186393738, + "learning_rate": 0.001, + "loss": 1.1943, + "step": 8140 + }, + { + "epoch": 0.25989349150164226, + "grad_norm": 0.23350052535533905, + "learning_rate": 0.001, + "loss": 1.2201, + "step": 8150 + }, + { + "epoch": 0.2602123792212762, + "grad_norm": 0.2341201901435852, + "learning_rate": 0.001, + "loss": 1.2133, + "step": 8160 + }, + { + "epoch": 0.2605312669409101, + "grad_norm": 0.2353760153055191, + "learning_rate": 0.001, + "loss": 1.2034, + "step": 8170 + }, + { + "epoch": 0.260850154660544, + "grad_norm": 0.23472733795642853, + "learning_rate": 0.001, + "loss": 1.2234, + "step": 8180 + }, + { + "epoch": 0.26116904238017796, + "grad_norm": 0.23768886923789978, + "learning_rate": 0.001, + "loss": 1.2087, + "step": 8190 + }, + { + "epoch": 0.26148793009981186, + "grad_norm": 0.2352430671453476, + "learning_rate": 0.001, + "loss": 1.2135, + "step": 8200 + }, + { + "epoch": 0.26180681781944576, + "grad_norm": 0.22320342063903809, + "learning_rate": 0.001, + "loss": 1.2301, + "step": 8210 + }, + { + "epoch": 0.2621257055390797, + "grad_norm": 0.2318148910999298, + "learning_rate": 0.001, + "loss": 1.2281, + "step": 8220 + }, + { + "epoch": 0.2624445932587136, + "grad_norm": 0.22970159351825714, + "learning_rate": 0.001, + "loss": 1.2129, + "step": 8230 + }, + { + "epoch": 0.2627634809783475, + "grad_norm": 0.2396457940340042, + "learning_rate": 0.001, + "loss": 1.2104, + "step": 8240 + }, + { + "epoch": 0.26308236869798146, + "grad_norm": 0.22637683153152466, + "learning_rate": 0.001, + "loss": 1.2003, + "step": 8250 + }, + { + "epoch": 0.26340125641761536, + "grad_norm": 0.23237474262714386, + "learning_rate": 0.001, + "loss": 1.2245, + "step": 8260 + }, + { + "epoch": 0.26372014413724926, + "grad_norm": 0.22723113000392914, + "learning_rate": 0.001, + "loss": 1.2201, + "step": 8270 + }, + { + "epoch": 0.2640390318568832, + "grad_norm": 0.23084807395935059, + "learning_rate": 0.001, + "loss": 1.2184, + "step": 8280 + }, + { + "epoch": 0.2643579195765171, + "grad_norm": 0.2344745695590973, + "learning_rate": 0.001, + "loss": 1.22, + "step": 8290 + }, + { + "epoch": 0.264676807296151, + "grad_norm": 0.23803043365478516, + "learning_rate": 0.001, + "loss": 1.2093, + "step": 8300 + }, + { + "epoch": 0.26499569501578496, + "grad_norm": 0.2341003566980362, + "learning_rate": 0.001, + "loss": 1.2055, + "step": 8310 + }, + { + "epoch": 0.26531458273541886, + "grad_norm": 0.21944069862365723, + "learning_rate": 0.001, + "loss": 1.202, + "step": 8320 + }, + { + "epoch": 0.26563347045505276, + "grad_norm": 0.2393493801355362, + "learning_rate": 0.001, + "loss": 1.1969, + "step": 8330 + }, + { + "epoch": 0.2659523581746867, + "grad_norm": 0.23989616334438324, + "learning_rate": 0.001, + "loss": 1.2248, + "step": 8340 + }, + { + "epoch": 0.2662712458943206, + "grad_norm": 0.24278509616851807, + "learning_rate": 0.001, + "loss": 1.2123, + "step": 8350 + }, + { + "epoch": 0.2665901336139545, + "grad_norm": 0.23201477527618408, + "learning_rate": 0.001, + "loss": 1.2448, + "step": 8360 + }, + { + "epoch": 0.26690902133358846, + "grad_norm": 0.23349370062351227, + "learning_rate": 0.001, + "loss": 1.2317, + "step": 8370 + }, + { + "epoch": 0.26722790905322236, + "grad_norm": 0.2284669727087021, + "learning_rate": 0.001, + "loss": 1.2232, + "step": 8380 + }, + { + "epoch": 0.26754679677285625, + "grad_norm": 0.23704218864440918, + "learning_rate": 0.001, + "loss": 1.2192, + "step": 8390 + }, + { + "epoch": 0.2678656844924902, + "grad_norm": 0.2318849116563797, + "learning_rate": 0.001, + "loss": 1.2119, + "step": 8400 + }, + { + "epoch": 0.2681845722121241, + "grad_norm": 0.2331661731004715, + "learning_rate": 0.001, + "loss": 1.2088, + "step": 8410 + }, + { + "epoch": 0.268503459931758, + "grad_norm": 0.22507762908935547, + "learning_rate": 0.001, + "loss": 1.2048, + "step": 8420 + }, + { + "epoch": 0.26882234765139196, + "grad_norm": 0.2239762246608734, + "learning_rate": 0.001, + "loss": 1.2033, + "step": 8430 + }, + { + "epoch": 0.26914123537102586, + "grad_norm": 0.22471489012241364, + "learning_rate": 0.001, + "loss": 1.2004, + "step": 8440 + }, + { + "epoch": 0.26946012309065975, + "grad_norm": 0.22484731674194336, + "learning_rate": 0.001, + "loss": 1.2154, + "step": 8450 + }, + { + "epoch": 0.2697790108102937, + "grad_norm": 0.2240588217973709, + "learning_rate": 0.001, + "loss": 1.2067, + "step": 8460 + }, + { + "epoch": 0.2700978985299276, + "grad_norm": 0.2298148274421692, + "learning_rate": 0.001, + "loss": 1.2215, + "step": 8470 + }, + { + "epoch": 0.2704167862495615, + "grad_norm": 0.22547484934329987, + "learning_rate": 0.001, + "loss": 1.2017, + "step": 8480 + }, + { + "epoch": 0.27073567396919546, + "grad_norm": 0.23824118077754974, + "learning_rate": 0.001, + "loss": 1.2013, + "step": 8490 + }, + { + "epoch": 0.27105456168882935, + "grad_norm": 0.22975823283195496, + "learning_rate": 0.001, + "loss": 1.2293, + "step": 8500 + }, + { + "epoch": 0.2713734494084633, + "grad_norm": 0.22520454227924347, + "learning_rate": 0.001, + "loss": 1.2064, + "step": 8510 + }, + { + "epoch": 0.2716923371280972, + "grad_norm": 0.22511954605579376, + "learning_rate": 0.001, + "loss": 1.1975, + "step": 8520 + }, + { + "epoch": 0.2720112248477311, + "grad_norm": 0.22746147215366364, + "learning_rate": 0.001, + "loss": 1.1967, + "step": 8530 + }, + { + "epoch": 0.27233011256736506, + "grad_norm": 0.2236718386411667, + "learning_rate": 0.001, + "loss": 1.2091, + "step": 8540 + }, + { + "epoch": 0.27264900028699895, + "grad_norm": 0.23237386345863342, + "learning_rate": 0.001, + "loss": 1.2299, + "step": 8550 + }, + { + "epoch": 0.27296788800663285, + "grad_norm": 0.22995509207248688, + "learning_rate": 0.001, + "loss": 1.2193, + "step": 8560 + }, + { + "epoch": 0.2732867757262668, + "grad_norm": 0.231089785695076, + "learning_rate": 0.001, + "loss": 1.2297, + "step": 8570 + }, + { + "epoch": 0.2736056634459007, + "grad_norm": 0.2250330150127411, + "learning_rate": 0.001, + "loss": 1.1958, + "step": 8580 + }, + { + "epoch": 0.2739245511655346, + "grad_norm": 0.2263891100883484, + "learning_rate": 0.001, + "loss": 1.2029, + "step": 8590 + }, + { + "epoch": 0.27424343888516856, + "grad_norm": 0.2355814427137375, + "learning_rate": 0.001, + "loss": 1.2225, + "step": 8600 + }, + { + "epoch": 0.27456232660480245, + "grad_norm": 0.23153933882713318, + "learning_rate": 0.001, + "loss": 1.2279, + "step": 8610 + }, + { + "epoch": 0.27488121432443635, + "grad_norm": 0.23251959681510925, + "learning_rate": 0.001, + "loss": 1.1949, + "step": 8620 + }, + { + "epoch": 0.2752001020440703, + "grad_norm": 0.22222910821437836, + "learning_rate": 0.001, + "loss": 1.1986, + "step": 8630 + }, + { + "epoch": 0.2755189897637042, + "grad_norm": 0.22941821813583374, + "learning_rate": 0.001, + "loss": 1.1969, + "step": 8640 + }, + { + "epoch": 0.2758378774833381, + "grad_norm": 0.2258928269147873, + "learning_rate": 0.001, + "loss": 1.2217, + "step": 8650 + }, + { + "epoch": 0.27615676520297205, + "grad_norm": 0.22354252636432648, + "learning_rate": 0.001, + "loss": 1.2045, + "step": 8660 + }, + { + "epoch": 0.27647565292260595, + "grad_norm": 0.22870154678821564, + "learning_rate": 0.001, + "loss": 1.2092, + "step": 8670 + }, + { + "epoch": 0.27679454064223985, + "grad_norm": 0.2278323620557785, + "learning_rate": 0.001, + "loss": 1.1935, + "step": 8680 + }, + { + "epoch": 0.2771134283618738, + "grad_norm": 0.2331470400094986, + "learning_rate": 0.001, + "loss": 1.1874, + "step": 8690 + }, + { + "epoch": 0.2774323160815077, + "grad_norm": 0.23780789971351624, + "learning_rate": 0.001, + "loss": 1.2081, + "step": 8700 + }, + { + "epoch": 0.2777512038011416, + "grad_norm": 0.22317449748516083, + "learning_rate": 0.001, + "loss": 1.2126, + "step": 8710 + }, + { + "epoch": 0.27807009152077555, + "grad_norm": 0.22962188720703125, + "learning_rate": 0.001, + "loss": 1.209, + "step": 8720 + }, + { + "epoch": 0.27838897924040945, + "grad_norm": 0.2307327538728714, + "learning_rate": 0.001, + "loss": 1.1841, + "step": 8730 + }, + { + "epoch": 0.27870786696004335, + "grad_norm": 0.23068435490131378, + "learning_rate": 0.001, + "loss": 1.2017, + "step": 8740 + }, + { + "epoch": 0.2790267546796773, + "grad_norm": 0.24138770997524261, + "learning_rate": 0.001, + "loss": 1.2083, + "step": 8750 + }, + { + "epoch": 0.2793456423993112, + "grad_norm": 0.23563891649246216, + "learning_rate": 0.001, + "loss": 1.2027, + "step": 8760 + }, + { + "epoch": 0.2796645301189451, + "grad_norm": 0.22962354123592377, + "learning_rate": 0.001, + "loss": 1.2111, + "step": 8770 + }, + { + "epoch": 0.27998341783857905, + "grad_norm": 0.2287108451128006, + "learning_rate": 0.001, + "loss": 1.2177, + "step": 8780 + }, + { + "epoch": 0.28030230555821295, + "grad_norm": 0.22435401380062103, + "learning_rate": 0.001, + "loss": 1.2016, + "step": 8790 + }, + { + "epoch": 0.28062119327784685, + "grad_norm": 0.23022402822971344, + "learning_rate": 0.001, + "loss": 1.1926, + "step": 8800 + }, + { + "epoch": 0.2809400809974808, + "grad_norm": 0.2214146852493286, + "learning_rate": 0.001, + "loss": 1.2209, + "step": 8810 + }, + { + "epoch": 0.2812589687171147, + "grad_norm": 0.22423280775547028, + "learning_rate": 0.001, + "loss": 1.205, + "step": 8820 + }, + { + "epoch": 0.2815778564367486, + "grad_norm": 0.2278013676404953, + "learning_rate": 0.001, + "loss": 1.2274, + "step": 8830 + }, + { + "epoch": 0.28189674415638255, + "grad_norm": 0.22766144573688507, + "learning_rate": 0.001, + "loss": 1.2044, + "step": 8840 + }, + { + "epoch": 0.28221563187601645, + "grad_norm": 0.2270435094833374, + "learning_rate": 0.001, + "loss": 1.2072, + "step": 8850 + }, + { + "epoch": 0.28253451959565035, + "grad_norm": 0.219182550907135, + "learning_rate": 0.001, + "loss": 1.2071, + "step": 8860 + }, + { + "epoch": 0.2828534073152843, + "grad_norm": 0.22934895753860474, + "learning_rate": 0.001, + "loss": 1.192, + "step": 8870 + }, + { + "epoch": 0.2831722950349182, + "grad_norm": 0.22396302223205566, + "learning_rate": 0.001, + "loss": 1.2112, + "step": 8880 + }, + { + "epoch": 0.2834911827545521, + "grad_norm": 0.22344917058944702, + "learning_rate": 0.001, + "loss": 1.1915, + "step": 8890 + }, + { + "epoch": 0.28381007047418605, + "grad_norm": 0.23081816732883453, + "learning_rate": 0.001, + "loss": 1.2115, + "step": 8900 + }, + { + "epoch": 0.28412895819381995, + "grad_norm": 0.22885209321975708, + "learning_rate": 0.001, + "loss": 1.1975, + "step": 8910 + }, + { + "epoch": 0.2844478459134539, + "grad_norm": 0.2284027636051178, + "learning_rate": 0.001, + "loss": 1.1961, + "step": 8920 + }, + { + "epoch": 0.2847667336330878, + "grad_norm": 0.21693913638591766, + "learning_rate": 0.001, + "loss": 1.2041, + "step": 8930 + }, + { + "epoch": 0.2850856213527217, + "grad_norm": 0.22245806455612183, + "learning_rate": 0.001, + "loss": 1.2045, + "step": 8940 + }, + { + "epoch": 0.28540450907235565, + "grad_norm": 0.22184446454048157, + "learning_rate": 0.001, + "loss": 1.2137, + "step": 8950 + }, + { + "epoch": 0.28572339679198955, + "grad_norm": 0.23200933635234833, + "learning_rate": 0.001, + "loss": 1.1944, + "step": 8960 + }, + { + "epoch": 0.28604228451162345, + "grad_norm": 0.22749820351600647, + "learning_rate": 0.001, + "loss": 1.1989, + "step": 8970 + }, + { + "epoch": 0.2863611722312574, + "grad_norm": 0.2261044979095459, + "learning_rate": 0.001, + "loss": 1.2006, + "step": 8980 + }, + { + "epoch": 0.2866800599508913, + "grad_norm": 0.23911482095718384, + "learning_rate": 0.001, + "loss": 1.1997, + "step": 8990 + }, + { + "epoch": 0.2869989476705252, + "grad_norm": 0.2479439377784729, + "learning_rate": 0.001, + "loss": 1.2261, + "step": 9000 + }, + { + "epoch": 0.28731783539015915, + "grad_norm": 0.22959910333156586, + "learning_rate": 0.001, + "loss": 1.182, + "step": 9010 + }, + { + "epoch": 0.28763672310979305, + "grad_norm": 0.2242193967103958, + "learning_rate": 0.001, + "loss": 1.2011, + "step": 9020 + }, + { + "epoch": 0.28795561082942694, + "grad_norm": 0.22244490683078766, + "learning_rate": 0.001, + "loss": 1.1991, + "step": 9030 + }, + { + "epoch": 0.2882744985490609, + "grad_norm": 0.22404681146144867, + "learning_rate": 0.001, + "loss": 1.2082, + "step": 9040 + }, + { + "epoch": 0.2885933862686948, + "grad_norm": 0.2188567817211151, + "learning_rate": 0.001, + "loss": 1.1923, + "step": 9050 + }, + { + "epoch": 0.2889122739883287, + "grad_norm": 0.22456391155719757, + "learning_rate": 0.001, + "loss": 1.1907, + "step": 9060 + }, + { + "epoch": 0.28923116170796265, + "grad_norm": 0.22750291228294373, + "learning_rate": 0.001, + "loss": 1.196, + "step": 9070 + }, + { + "epoch": 0.28955004942759655, + "grad_norm": 0.22974838316440582, + "learning_rate": 0.001, + "loss": 1.2025, + "step": 9080 + }, + { + "epoch": 0.28986893714723044, + "grad_norm": 0.23487216234207153, + "learning_rate": 0.001, + "loss": 1.2249, + "step": 9090 + }, + { + "epoch": 0.2901878248668644, + "grad_norm": 0.2250378280878067, + "learning_rate": 0.001, + "loss": 1.1928, + "step": 9100 + }, + { + "epoch": 0.2905067125864983, + "grad_norm": 0.22203022241592407, + "learning_rate": 0.001, + "loss": 1.1971, + "step": 9110 + }, + { + "epoch": 0.2908256003061322, + "grad_norm": 0.23717299103736877, + "learning_rate": 0.001, + "loss": 1.2061, + "step": 9120 + }, + { + "epoch": 0.29114448802576615, + "grad_norm": 0.22907336056232452, + "learning_rate": 0.001, + "loss": 1.2061, + "step": 9130 + }, + { + "epoch": 0.29146337574540004, + "grad_norm": 0.22980521619319916, + "learning_rate": 0.001, + "loss": 1.1919, + "step": 9140 + }, + { + "epoch": 0.29178226346503394, + "grad_norm": 0.2266918420791626, + "learning_rate": 0.001, + "loss": 1.1871, + "step": 9150 + }, + { + "epoch": 0.2921011511846679, + "grad_norm": 0.2265349179506302, + "learning_rate": 0.001, + "loss": 1.2098, + "step": 9160 + }, + { + "epoch": 0.2924200389043018, + "grad_norm": 0.23188531398773193, + "learning_rate": 0.001, + "loss": 1.2114, + "step": 9170 + }, + { + "epoch": 0.2927389266239357, + "grad_norm": 0.21577180922031403, + "learning_rate": 0.001, + "loss": 1.2019, + "step": 9180 + }, + { + "epoch": 0.29305781434356964, + "grad_norm": 0.2256147712469101, + "learning_rate": 0.001, + "loss": 1.1884, + "step": 9190 + }, + { + "epoch": 0.29337670206320354, + "grad_norm": 0.22333773970603943, + "learning_rate": 0.001, + "loss": 1.1911, + "step": 9200 + }, + { + "epoch": 0.29369558978283744, + "grad_norm": 0.2238871157169342, + "learning_rate": 0.001, + "loss": 1.1881, + "step": 9210 + }, + { + "epoch": 0.2940144775024714, + "grad_norm": 0.22535620629787445, + "learning_rate": 0.001, + "loss": 1.1906, + "step": 9220 + }, + { + "epoch": 0.2943333652221053, + "grad_norm": 0.22432921826839447, + "learning_rate": 0.001, + "loss": 1.2011, + "step": 9230 + }, + { + "epoch": 0.2946522529417392, + "grad_norm": 0.22004643082618713, + "learning_rate": 0.001, + "loss": 1.1829, + "step": 9240 + }, + { + "epoch": 0.29497114066137314, + "grad_norm": 0.223861962556839, + "learning_rate": 0.001, + "loss": 1.1977, + "step": 9250 + }, + { + "epoch": 0.29529002838100704, + "grad_norm": 0.22514422237873077, + "learning_rate": 0.001, + "loss": 1.1897, + "step": 9260 + }, + { + "epoch": 0.29560891610064094, + "grad_norm": 0.214486762881279, + "learning_rate": 0.001, + "loss": 1.1842, + "step": 9270 + }, + { + "epoch": 0.2959278038202749, + "grad_norm": 0.22228612005710602, + "learning_rate": 0.001, + "loss": 1.193, + "step": 9280 + }, + { + "epoch": 0.2962466915399088, + "grad_norm": 0.2186896950006485, + "learning_rate": 0.001, + "loss": 1.1751, + "step": 9290 + }, + { + "epoch": 0.2965655792595427, + "grad_norm": 0.22547854483127594, + "learning_rate": 0.001, + "loss": 1.1931, + "step": 9300 + }, + { + "epoch": 0.29688446697917664, + "grad_norm": 0.22223107516765594, + "learning_rate": 0.001, + "loss": 1.2041, + "step": 9310 + }, + { + "epoch": 0.29720335469881054, + "grad_norm": 0.21878202259540558, + "learning_rate": 0.001, + "loss": 1.1737, + "step": 9320 + }, + { + "epoch": 0.29752224241844444, + "grad_norm": 0.22275276482105255, + "learning_rate": 0.001, + "loss": 1.2027, + "step": 9330 + }, + { + "epoch": 0.2978411301380784, + "grad_norm": 0.21805565059185028, + "learning_rate": 0.001, + "loss": 1.1769, + "step": 9340 + }, + { + "epoch": 0.2981600178577123, + "grad_norm": 0.21685688197612762, + "learning_rate": 0.001, + "loss": 1.1857, + "step": 9350 + }, + { + "epoch": 0.29847890557734624, + "grad_norm": 0.22610515356063843, + "learning_rate": 0.001, + "loss": 1.1919, + "step": 9360 + }, + { + "epoch": 0.29879779329698014, + "grad_norm": 0.23180913925170898, + "learning_rate": 0.001, + "loss": 1.1976, + "step": 9370 + }, + { + "epoch": 0.29911668101661404, + "grad_norm": 0.2190287858247757, + "learning_rate": 0.001, + "loss": 1.1931, + "step": 9380 + }, + { + "epoch": 0.299435568736248, + "grad_norm": 0.22509071230888367, + "learning_rate": 0.001, + "loss": 1.1801, + "step": 9390 + }, + { + "epoch": 0.2997544564558819, + "grad_norm": 0.23787586390972137, + "learning_rate": 0.001, + "loss": 1.1912, + "step": 9400 + }, + { + "epoch": 0.3000733441755158, + "grad_norm": 0.22887369990348816, + "learning_rate": 0.001, + "loss": 1.1961, + "step": 9410 + }, + { + "epoch": 0.30039223189514974, + "grad_norm": 0.2291015237569809, + "learning_rate": 0.001, + "loss": 1.1952, + "step": 9420 + }, + { + "epoch": 0.30071111961478364, + "grad_norm": 0.22769349813461304, + "learning_rate": 0.001, + "loss": 1.1844, + "step": 9430 + }, + { + "epoch": 0.30103000733441754, + "grad_norm": 0.22478654980659485, + "learning_rate": 0.001, + "loss": 1.1955, + "step": 9440 + }, + { + "epoch": 0.3013488950540515, + "grad_norm": 0.2176402360200882, + "learning_rate": 0.001, + "loss": 1.1987, + "step": 9450 + }, + { + "epoch": 0.3016677827736854, + "grad_norm": 0.22715549170970917, + "learning_rate": 0.001, + "loss": 1.1986, + "step": 9460 + }, + { + "epoch": 0.3019866704933193, + "grad_norm": 0.22686997056007385, + "learning_rate": 0.001, + "loss": 1.2123, + "step": 9470 + }, + { + "epoch": 0.30230555821295324, + "grad_norm": 0.22678443789482117, + "learning_rate": 0.001, + "loss": 1.192, + "step": 9480 + }, + { + "epoch": 0.30262444593258714, + "grad_norm": 0.22301578521728516, + "learning_rate": 0.001, + "loss": 1.192, + "step": 9490 + }, + { + "epoch": 0.30294333365222104, + "grad_norm": 0.23071426153182983, + "learning_rate": 0.001, + "loss": 1.194, + "step": 9500 + }, + { + "epoch": 0.303262221371855, + "grad_norm": 0.21674400568008423, + "learning_rate": 0.001, + "loss": 1.1759, + "step": 9510 + }, + { + "epoch": 0.3035811090914889, + "grad_norm": 0.23387682437896729, + "learning_rate": 0.001, + "loss": 1.1884, + "step": 9520 + }, + { + "epoch": 0.3038999968111228, + "grad_norm": 0.21519413590431213, + "learning_rate": 0.001, + "loss": 1.173, + "step": 9530 + }, + { + "epoch": 0.30421888453075674, + "grad_norm": 0.2206396609544754, + "learning_rate": 0.001, + "loss": 1.1847, + "step": 9540 + }, + { + "epoch": 0.30453777225039064, + "grad_norm": 0.2154570370912552, + "learning_rate": 0.001, + "loss": 1.1651, + "step": 9550 + }, + { + "epoch": 0.30485665997002453, + "grad_norm": 0.21689386665821075, + "learning_rate": 0.001, + "loss": 1.185, + "step": 9560 + }, + { + "epoch": 0.3051755476896585, + "grad_norm": 0.21970142424106598, + "learning_rate": 0.001, + "loss": 1.1835, + "step": 9570 + }, + { + "epoch": 0.3054944354092924, + "grad_norm": 0.2152683436870575, + "learning_rate": 0.001, + "loss": 1.201, + "step": 9580 + }, + { + "epoch": 0.3058133231289263, + "grad_norm": 0.21406981348991394, + "learning_rate": 0.001, + "loss": 1.1853, + "step": 9590 + }, + { + "epoch": 0.30613221084856024, + "grad_norm": 0.22537820041179657, + "learning_rate": 0.001, + "loss": 1.1782, + "step": 9600 + }, + { + "epoch": 0.30645109856819414, + "grad_norm": 0.2190530151128769, + "learning_rate": 0.001, + "loss": 1.1859, + "step": 9610 + }, + { + "epoch": 0.30676998628782803, + "grad_norm": 0.22278915345668793, + "learning_rate": 0.001, + "loss": 1.1933, + "step": 9620 + }, + { + "epoch": 0.307088874007462, + "grad_norm": 0.22467225790023804, + "learning_rate": 0.001, + "loss": 1.1582, + "step": 9630 + }, + { + "epoch": 0.3074077617270959, + "grad_norm": 0.21523691713809967, + "learning_rate": 0.001, + "loss": 1.1914, + "step": 9640 + }, + { + "epoch": 0.3077266494467298, + "grad_norm": 0.21753321588039398, + "learning_rate": 0.001, + "loss": 1.1819, + "step": 9650 + }, + { + "epoch": 0.30804553716636374, + "grad_norm": 0.21796949207782745, + "learning_rate": 0.001, + "loss": 1.1754, + "step": 9660 + }, + { + "epoch": 0.30836442488599763, + "grad_norm": 0.23503394424915314, + "learning_rate": 0.001, + "loss": 1.1749, + "step": 9670 + }, + { + "epoch": 0.30868331260563153, + "grad_norm": 0.2093040943145752, + "learning_rate": 0.001, + "loss": 1.1836, + "step": 9680 + }, + { + "epoch": 0.3090022003252655, + "grad_norm": 0.22570084035396576, + "learning_rate": 0.001, + "loss": 1.1994, + "step": 9690 + }, + { + "epoch": 0.3093210880448994, + "grad_norm": 0.2191409170627594, + "learning_rate": 0.001, + "loss": 1.1873, + "step": 9700 + }, + { + "epoch": 0.3096399757645333, + "grad_norm": 0.21672411262989044, + "learning_rate": 0.001, + "loss": 1.1996, + "step": 9710 + }, + { + "epoch": 0.30995886348416724, + "grad_norm": 0.22737610340118408, + "learning_rate": 0.001, + "loss": 1.1935, + "step": 9720 + }, + { + "epoch": 0.31027775120380113, + "grad_norm": 0.22928710281848907, + "learning_rate": 0.001, + "loss": 1.1962, + "step": 9730 + }, + { + "epoch": 0.31059663892343503, + "grad_norm": 0.22526556253433228, + "learning_rate": 0.001, + "loss": 1.1886, + "step": 9740 + }, + { + "epoch": 0.310915526643069, + "grad_norm": 0.2220076620578766, + "learning_rate": 0.001, + "loss": 1.1757, + "step": 9750 + }, + { + "epoch": 0.3112344143627029, + "grad_norm": 0.210331529378891, + "learning_rate": 0.001, + "loss": 1.1849, + "step": 9760 + }, + { + "epoch": 0.31155330208233684, + "grad_norm": 0.22174914181232452, + "learning_rate": 0.001, + "loss": 1.2025, + "step": 9770 + }, + { + "epoch": 0.31187218980197073, + "grad_norm": 0.21560117602348328, + "learning_rate": 0.001, + "loss": 1.1687, + "step": 9780 + }, + { + "epoch": 0.31219107752160463, + "grad_norm": 0.21781526505947113, + "learning_rate": 0.001, + "loss": 1.1778, + "step": 9790 + }, + { + "epoch": 0.3125099652412386, + "grad_norm": 0.21552933752536774, + "learning_rate": 0.001, + "loss": 1.1908, + "step": 9800 + }, + { + "epoch": 0.3128288529608725, + "grad_norm": 0.2246668040752411, + "learning_rate": 0.001, + "loss": 1.1889, + "step": 9810 + }, + { + "epoch": 0.3131477406805064, + "grad_norm": 0.2357940971851349, + "learning_rate": 0.001, + "loss": 1.1943, + "step": 9820 + }, + { + "epoch": 0.31346662840014033, + "grad_norm": 0.22036802768707275, + "learning_rate": 0.001, + "loss": 1.1787, + "step": 9830 + }, + { + "epoch": 0.31378551611977423, + "grad_norm": 0.21087811887264252, + "learning_rate": 0.001, + "loss": 1.1806, + "step": 9840 + }, + { + "epoch": 0.31410440383940813, + "grad_norm": 0.22128775715827942, + "learning_rate": 0.001, + "loss": 1.1631, + "step": 9850 + }, + { + "epoch": 0.3144232915590421, + "grad_norm": 0.21533682942390442, + "learning_rate": 0.001, + "loss": 1.1952, + "step": 9860 + }, + { + "epoch": 0.314742179278676, + "grad_norm": 0.21292875707149506, + "learning_rate": 0.001, + "loss": 1.1896, + "step": 9870 + }, + { + "epoch": 0.3150610669983099, + "grad_norm": 0.2134082019329071, + "learning_rate": 0.001, + "loss": 1.1911, + "step": 9880 + }, + { + "epoch": 0.31537995471794383, + "grad_norm": 0.2236315906047821, + "learning_rate": 0.001, + "loss": 1.1733, + "step": 9890 + }, + { + "epoch": 0.31569884243757773, + "grad_norm": 0.2129340022802353, + "learning_rate": 0.001, + "loss": 1.178, + "step": 9900 + }, + { + "epoch": 0.31601773015721163, + "grad_norm": 0.21715772151947021, + "learning_rate": 0.001, + "loss": 1.1675, + "step": 9910 + }, + { + "epoch": 0.3163366178768456, + "grad_norm": 0.22146131098270416, + "learning_rate": 0.001, + "loss": 1.1753, + "step": 9920 + }, + { + "epoch": 0.3166555055964795, + "grad_norm": 0.21491622924804688, + "learning_rate": 0.001, + "loss": 1.1903, + "step": 9930 + }, + { + "epoch": 0.3169743933161134, + "grad_norm": 0.22050689160823822, + "learning_rate": 0.001, + "loss": 1.1906, + "step": 9940 + }, + { + "epoch": 0.31729328103574733, + "grad_norm": 0.2244400829076767, + "learning_rate": 0.001, + "loss": 1.1912, + "step": 9950 + }, + { + "epoch": 0.31761216875538123, + "grad_norm": 0.21547940373420715, + "learning_rate": 0.001, + "loss": 1.1796, + "step": 9960 + }, + { + "epoch": 0.31793105647501513, + "grad_norm": 0.21386213600635529, + "learning_rate": 0.001, + "loss": 1.2017, + "step": 9970 + }, + { + "epoch": 0.3182499441946491, + "grad_norm": 0.21595734357833862, + "learning_rate": 0.001, + "loss": 1.1788, + "step": 9980 + }, + { + "epoch": 0.318568831914283, + "grad_norm": 0.22140143811702728, + "learning_rate": 0.001, + "loss": 1.1862, + "step": 9990 + }, + { + "epoch": 0.3188877196339169, + "grad_norm": 0.2175026386976242, + "learning_rate": 0.001, + "loss": 1.1879, + "step": 10000 + }, + { + "epoch": 0.31920660735355083, + "grad_norm": 0.22484996914863586, + "learning_rate": 0.001, + "loss": 1.1876, + "step": 10010 + }, + { + "epoch": 0.31952549507318473, + "grad_norm": 0.21960212290287018, + "learning_rate": 0.001, + "loss": 1.174, + "step": 10020 + }, + { + "epoch": 0.3198443827928186, + "grad_norm": 0.21397638320922852, + "learning_rate": 0.001, + "loss": 1.181, + "step": 10030 + }, + { + "epoch": 0.3201632705124526, + "grad_norm": 0.22058583796024323, + "learning_rate": 0.001, + "loss": 1.1784, + "step": 10040 + }, + { + "epoch": 0.3204821582320865, + "grad_norm": 0.21691805124282837, + "learning_rate": 0.001, + "loss": 1.1997, + "step": 10050 + }, + { + "epoch": 0.3208010459517204, + "grad_norm": 0.21837332844734192, + "learning_rate": 0.001, + "loss": 1.1817, + "step": 10060 + }, + { + "epoch": 0.32111993367135433, + "grad_norm": 0.21407966315746307, + "learning_rate": 0.001, + "loss": 1.1682, + "step": 10070 + }, + { + "epoch": 0.3214388213909882, + "grad_norm": 0.22078247368335724, + "learning_rate": 0.001, + "loss": 1.1843, + "step": 10080 + }, + { + "epoch": 0.3217577091106221, + "grad_norm": 0.22645726799964905, + "learning_rate": 0.001, + "loss": 1.1804, + "step": 10090 + }, + { + "epoch": 0.3220765968302561, + "grad_norm": 0.23695030808448792, + "learning_rate": 0.001, + "loss": 1.1888, + "step": 10100 + }, + { + "epoch": 0.32239548454989, + "grad_norm": 0.21678394079208374, + "learning_rate": 0.001, + "loss": 1.1851, + "step": 10110 + }, + { + "epoch": 0.3227143722695239, + "grad_norm": 0.2129441499710083, + "learning_rate": 0.001, + "loss": 1.1582, + "step": 10120 + }, + { + "epoch": 0.32303325998915783, + "grad_norm": 0.23205362260341644, + "learning_rate": 0.001, + "loss": 1.1673, + "step": 10130 + }, + { + "epoch": 0.3233521477087917, + "grad_norm": 0.21547579765319824, + "learning_rate": 0.001, + "loss": 1.1743, + "step": 10140 + }, + { + "epoch": 0.3236710354284256, + "grad_norm": 0.2152055948972702, + "learning_rate": 0.001, + "loss": 1.1855, + "step": 10150 + }, + { + "epoch": 0.3239899231480596, + "grad_norm": 0.21947774291038513, + "learning_rate": 0.001, + "loss": 1.1733, + "step": 10160 + }, + { + "epoch": 0.3243088108676935, + "grad_norm": 0.22177624702453613, + "learning_rate": 0.001, + "loss": 1.172, + "step": 10170 + }, + { + "epoch": 0.32462769858732743, + "grad_norm": 0.21848328411579132, + "learning_rate": 0.001, + "loss": 1.1694, + "step": 10180 + }, + { + "epoch": 0.3249465863069613, + "grad_norm": 0.2304374724626541, + "learning_rate": 0.001, + "loss": 1.1636, + "step": 10190 + }, + { + "epoch": 0.3252654740265952, + "grad_norm": 0.21279925107955933, + "learning_rate": 0.001, + "loss": 1.1744, + "step": 10200 + }, + { + "epoch": 0.3255843617462292, + "grad_norm": 0.22444838285446167, + "learning_rate": 0.001, + "loss": 1.1749, + "step": 10210 + }, + { + "epoch": 0.3259032494658631, + "grad_norm": 0.22544382512569427, + "learning_rate": 0.001, + "loss": 1.1733, + "step": 10220 + }, + { + "epoch": 0.326222137185497, + "grad_norm": 0.20965062081813812, + "learning_rate": 0.001, + "loss": 1.2065, + "step": 10230 + }, + { + "epoch": 0.3265410249051309, + "grad_norm": 0.2140328288078308, + "learning_rate": 0.001, + "loss": 1.1949, + "step": 10240 + }, + { + "epoch": 0.3268599126247648, + "grad_norm": 0.2196279913187027, + "learning_rate": 0.001, + "loss": 1.154, + "step": 10250 + }, + { + "epoch": 0.3271788003443987, + "grad_norm": 0.22404052317142487, + "learning_rate": 0.001, + "loss": 1.1803, + "step": 10260 + }, + { + "epoch": 0.3274976880640327, + "grad_norm": 0.21107755601406097, + "learning_rate": 0.001, + "loss": 1.1745, + "step": 10270 + }, + { + "epoch": 0.3278165757836666, + "grad_norm": 0.22213582694530487, + "learning_rate": 0.001, + "loss": 1.1809, + "step": 10280 + }, + { + "epoch": 0.3281354635033005, + "grad_norm": 0.2129000425338745, + "learning_rate": 0.001, + "loss": 1.1832, + "step": 10290 + }, + { + "epoch": 0.3284543512229344, + "grad_norm": 0.21625253558158875, + "learning_rate": 0.001, + "loss": 1.1723, + "step": 10300 + }, + { + "epoch": 0.3287732389425683, + "grad_norm": 0.2259497046470642, + "learning_rate": 0.001, + "loss": 1.199, + "step": 10310 + }, + { + "epoch": 0.3290921266622022, + "grad_norm": 0.22692418098449707, + "learning_rate": 0.001, + "loss": 1.1668, + "step": 10320 + }, + { + "epoch": 0.3294110143818362, + "grad_norm": 0.2245110273361206, + "learning_rate": 0.001, + "loss": 1.199, + "step": 10330 + }, + { + "epoch": 0.3297299021014701, + "grad_norm": 0.22499682009220123, + "learning_rate": 0.001, + "loss": 1.1874, + "step": 10340 + }, + { + "epoch": 0.33004878982110397, + "grad_norm": 0.2121761441230774, + "learning_rate": 0.001, + "loss": 1.1635, + "step": 10350 + }, + { + "epoch": 0.3303676775407379, + "grad_norm": 0.22067177295684814, + "learning_rate": 0.001, + "loss": 1.1735, + "step": 10360 + }, + { + "epoch": 0.3306865652603718, + "grad_norm": 0.21458233892917633, + "learning_rate": 0.001, + "loss": 1.1928, + "step": 10370 + }, + { + "epoch": 0.3310054529800057, + "grad_norm": 0.21691793203353882, + "learning_rate": 0.001, + "loss": 1.1694, + "step": 10380 + }, + { + "epoch": 0.3313243406996397, + "grad_norm": 0.21943393349647522, + "learning_rate": 0.001, + "loss": 1.1591, + "step": 10390 + }, + { + "epoch": 0.3316432284192736, + "grad_norm": 0.20909449458122253, + "learning_rate": 0.001, + "loss": 1.1801, + "step": 10400 + }, + { + "epoch": 0.33196211613890747, + "grad_norm": 0.20836131274700165, + "learning_rate": 0.001, + "loss": 1.1636, + "step": 10410 + }, + { + "epoch": 0.3322810038585414, + "grad_norm": 0.21603058278560638, + "learning_rate": 0.001, + "loss": 1.1708, + "step": 10420 + }, + { + "epoch": 0.3325998915781753, + "grad_norm": 0.20854921638965607, + "learning_rate": 0.001, + "loss": 1.1604, + "step": 10430 + }, + { + "epoch": 0.3329187792978092, + "grad_norm": 0.21955880522727966, + "learning_rate": 0.001, + "loss": 1.1963, + "step": 10440 + }, + { + "epoch": 0.3332376670174432, + "grad_norm": 0.21727994084358215, + "learning_rate": 0.001, + "loss": 1.1648, + "step": 10450 + }, + { + "epoch": 0.33355655473707707, + "grad_norm": 0.21586079895496368, + "learning_rate": 0.001, + "loss": 1.1633, + "step": 10460 + }, + { + "epoch": 0.33387544245671097, + "grad_norm": 0.22296805679798126, + "learning_rate": 0.001, + "loss": 1.1799, + "step": 10470 + }, + { + "epoch": 0.3341943301763449, + "grad_norm": 0.21028365194797516, + "learning_rate": 0.001, + "loss": 1.1718, + "step": 10480 + }, + { + "epoch": 0.3345132178959788, + "grad_norm": 0.21386286616325378, + "learning_rate": 0.001, + "loss": 1.1644, + "step": 10490 + }, + { + "epoch": 0.3348321056156127, + "grad_norm": 0.22596792876720428, + "learning_rate": 0.001, + "loss": 1.1676, + "step": 10500 + }, + { + "epoch": 0.33515099333524667, + "grad_norm": 0.21902908384799957, + "learning_rate": 0.001, + "loss": 1.178, + "step": 10510 + }, + { + "epoch": 0.33546988105488057, + "grad_norm": 0.22614213824272156, + "learning_rate": 0.001, + "loss": 1.1866, + "step": 10520 + }, + { + "epoch": 0.33578876877451447, + "grad_norm": 0.21819354593753815, + "learning_rate": 0.001, + "loss": 1.1793, + "step": 10530 + }, + { + "epoch": 0.3361076564941484, + "grad_norm": 0.20999178290367126, + "learning_rate": 0.001, + "loss": 1.1555, + "step": 10540 + }, + { + "epoch": 0.3364265442137823, + "grad_norm": 0.2194094955921173, + "learning_rate": 0.001, + "loss": 1.1641, + "step": 10550 + }, + { + "epoch": 0.3367454319334162, + "grad_norm": 0.21941301226615906, + "learning_rate": 0.001, + "loss": 1.1692, + "step": 10560 + }, + { + "epoch": 0.33706431965305017, + "grad_norm": 0.2173202931880951, + "learning_rate": 0.001, + "loss": 1.1612, + "step": 10570 + }, + { + "epoch": 0.33738320737268407, + "grad_norm": 0.21929022669792175, + "learning_rate": 0.001, + "loss": 1.168, + "step": 10580 + }, + { + "epoch": 0.337702095092318, + "grad_norm": 0.22014977037906647, + "learning_rate": 0.001, + "loss": 1.1395, + "step": 10590 + }, + { + "epoch": 0.3380209828119519, + "grad_norm": 0.22155609726905823, + "learning_rate": 0.001, + "loss": 1.1624, + "step": 10600 + }, + { + "epoch": 0.3383398705315858, + "grad_norm": 0.23078712821006775, + "learning_rate": 0.001, + "loss": 1.1666, + "step": 10610 + }, + { + "epoch": 0.33865875825121977, + "grad_norm": 0.2146841287612915, + "learning_rate": 0.001, + "loss": 1.1943, + "step": 10620 + }, + { + "epoch": 0.33897764597085367, + "grad_norm": 0.21783071756362915, + "learning_rate": 0.001, + "loss": 1.1676, + "step": 10630 + }, + { + "epoch": 0.33929653369048757, + "grad_norm": 0.21608036756515503, + "learning_rate": 0.001, + "loss": 1.1781, + "step": 10640 + }, + { + "epoch": 0.3396154214101215, + "grad_norm": 0.22186727821826935, + "learning_rate": 0.001, + "loss": 1.1702, + "step": 10650 + }, + { + "epoch": 0.3399343091297554, + "grad_norm": 0.2069188356399536, + "learning_rate": 0.001, + "loss": 1.1704, + "step": 10660 + }, + { + "epoch": 0.3402531968493893, + "grad_norm": 0.20236782729625702, + "learning_rate": 0.001, + "loss": 1.1769, + "step": 10670 + }, + { + "epoch": 0.34057208456902327, + "grad_norm": 0.21130560338497162, + "learning_rate": 0.001, + "loss": 1.1815, + "step": 10680 + }, + { + "epoch": 0.34089097228865717, + "grad_norm": 0.2142263948917389, + "learning_rate": 0.001, + "loss": 1.1666, + "step": 10690 + }, + { + "epoch": 0.34120986000829107, + "grad_norm": 0.21785111725330353, + "learning_rate": 0.001, + "loss": 1.1745, + "step": 10700 + }, + { + "epoch": 0.341528747727925, + "grad_norm": 0.20937016606330872, + "learning_rate": 0.001, + "loss": 1.1686, + "step": 10710 + }, + { + "epoch": 0.3418476354475589, + "grad_norm": 0.21203775703907013, + "learning_rate": 0.001, + "loss": 1.1754, + "step": 10720 + }, + { + "epoch": 0.3421665231671928, + "grad_norm": 0.21170243620872498, + "learning_rate": 0.001, + "loss": 1.1505, + "step": 10730 + }, + { + "epoch": 0.34248541088682677, + "grad_norm": 0.21524657309055328, + "learning_rate": 0.001, + "loss": 1.1787, + "step": 10740 + }, + { + "epoch": 0.34280429860646067, + "grad_norm": 0.2171153426170349, + "learning_rate": 0.001, + "loss": 1.1622, + "step": 10750 + }, + { + "epoch": 0.34312318632609456, + "grad_norm": 0.21716740727424622, + "learning_rate": 0.001, + "loss": 1.1576, + "step": 10760 + }, + { + "epoch": 0.3434420740457285, + "grad_norm": 0.21167676150798798, + "learning_rate": 0.001, + "loss": 1.1459, + "step": 10770 + }, + { + "epoch": 0.3437609617653624, + "grad_norm": 0.2166099101305008, + "learning_rate": 0.001, + "loss": 1.1741, + "step": 10780 + }, + { + "epoch": 0.3440798494849963, + "grad_norm": 0.22174325585365295, + "learning_rate": 0.001, + "loss": 1.1635, + "step": 10790 + }, + { + "epoch": 0.34439873720463027, + "grad_norm": 0.217443585395813, + "learning_rate": 0.001, + "loss": 1.1738, + "step": 10800 + }, + { + "epoch": 0.34471762492426417, + "grad_norm": 0.2111525535583496, + "learning_rate": 0.001, + "loss": 1.1894, + "step": 10810 + }, + { + "epoch": 0.34503651264389806, + "grad_norm": 0.22243352234363556, + "learning_rate": 0.001, + "loss": 1.178, + "step": 10820 + }, + { + "epoch": 0.345355400363532, + "grad_norm": 0.21252469718456268, + "learning_rate": 0.001, + "loss": 1.1657, + "step": 10830 + }, + { + "epoch": 0.3456742880831659, + "grad_norm": 0.20956046879291534, + "learning_rate": 0.001, + "loss": 1.1531, + "step": 10840 + }, + { + "epoch": 0.3459931758027998, + "grad_norm": 0.20752333104610443, + "learning_rate": 0.001, + "loss": 1.1582, + "step": 10850 + }, + { + "epoch": 0.34631206352243377, + "grad_norm": 0.2089800238609314, + "learning_rate": 0.001, + "loss": 1.1547, + "step": 10860 + }, + { + "epoch": 0.34663095124206766, + "grad_norm": 0.21103443205356598, + "learning_rate": 0.001, + "loss": 1.1723, + "step": 10870 + }, + { + "epoch": 0.34694983896170156, + "grad_norm": 0.21871396899223328, + "learning_rate": 0.001, + "loss": 1.1838, + "step": 10880 + }, + { + "epoch": 0.3472687266813355, + "grad_norm": 0.21302813291549683, + "learning_rate": 0.001, + "loss": 1.1713, + "step": 10890 + }, + { + "epoch": 0.3475876144009694, + "grad_norm": 0.22152137756347656, + "learning_rate": 0.001, + "loss": 1.1773, + "step": 10900 + }, + { + "epoch": 0.3479065021206033, + "grad_norm": 0.2116517275571823, + "learning_rate": 0.001, + "loss": 1.1505, + "step": 10910 + }, + { + "epoch": 0.34822538984023726, + "grad_norm": 0.21990253031253815, + "learning_rate": 0.001, + "loss": 1.1738, + "step": 10920 + }, + { + "epoch": 0.34854427755987116, + "grad_norm": 0.22551077604293823, + "learning_rate": 0.001, + "loss": 1.1788, + "step": 10930 + }, + { + "epoch": 0.34886316527950506, + "grad_norm": 0.21925295889377594, + "learning_rate": 0.001, + "loss": 1.1585, + "step": 10940 + }, + { + "epoch": 0.349182052999139, + "grad_norm": 0.2133491337299347, + "learning_rate": 0.001, + "loss": 1.1832, + "step": 10950 + }, + { + "epoch": 0.3495009407187729, + "grad_norm": 0.20826612412929535, + "learning_rate": 0.001, + "loss": 1.1685, + "step": 10960 + }, + { + "epoch": 0.3498198284384068, + "grad_norm": 0.2133883684873581, + "learning_rate": 0.001, + "loss": 1.1692, + "step": 10970 + }, + { + "epoch": 0.35013871615804076, + "grad_norm": 0.21480365097522736, + "learning_rate": 0.001, + "loss": 1.1698, + "step": 10980 + }, + { + "epoch": 0.35045760387767466, + "grad_norm": 0.21561360359191895, + "learning_rate": 0.001, + "loss": 1.1494, + "step": 10990 + }, + { + "epoch": 0.3507764915973086, + "grad_norm": 0.21999944746494293, + "learning_rate": 0.001, + "loss": 1.161, + "step": 11000 + }, + { + "epoch": 0.3510953793169425, + "grad_norm": 0.21331818401813507, + "learning_rate": 0.001, + "loss": 1.1743, + "step": 11010 + }, + { + "epoch": 0.3514142670365764, + "grad_norm": 0.2172483205795288, + "learning_rate": 0.001, + "loss": 1.1575, + "step": 11020 + }, + { + "epoch": 0.35173315475621036, + "grad_norm": 0.20713630318641663, + "learning_rate": 0.001, + "loss": 1.139, + "step": 11030 + }, + { + "epoch": 0.35205204247584426, + "grad_norm": 0.21412107348442078, + "learning_rate": 0.001, + "loss": 1.1533, + "step": 11040 + }, + { + "epoch": 0.35237093019547816, + "grad_norm": 0.21454772353172302, + "learning_rate": 0.001, + "loss": 1.1401, + "step": 11050 + }, + { + "epoch": 0.3526898179151121, + "grad_norm": 0.20491410791873932, + "learning_rate": 0.001, + "loss": 1.1654, + "step": 11060 + }, + { + "epoch": 0.353008705634746, + "grad_norm": 0.20974987745285034, + "learning_rate": 0.001, + "loss": 1.156, + "step": 11070 + }, + { + "epoch": 0.3533275933543799, + "grad_norm": 0.2226075381040573, + "learning_rate": 0.001, + "loss": 1.1584, + "step": 11080 + }, + { + "epoch": 0.35364648107401386, + "grad_norm": 0.21573099493980408, + "learning_rate": 0.001, + "loss": 1.1468, + "step": 11090 + }, + { + "epoch": 0.35396536879364776, + "grad_norm": 0.2131635844707489, + "learning_rate": 0.001, + "loss": 1.1439, + "step": 11100 + }, + { + "epoch": 0.35428425651328166, + "grad_norm": 0.2120184600353241, + "learning_rate": 0.001, + "loss": 1.1671, + "step": 11110 + }, + { + "epoch": 0.3546031442329156, + "grad_norm": 0.21845276653766632, + "learning_rate": 0.001, + "loss": 1.1712, + "step": 11120 + }, + { + "epoch": 0.3549220319525495, + "grad_norm": 0.2118763029575348, + "learning_rate": 0.001, + "loss": 1.1679, + "step": 11130 + }, + { + "epoch": 0.3552409196721834, + "grad_norm": 0.21182695031166077, + "learning_rate": 0.001, + "loss": 1.1633, + "step": 11140 + }, + { + "epoch": 0.35555980739181736, + "grad_norm": 0.2156038135290146, + "learning_rate": 0.001, + "loss": 1.1644, + "step": 11150 + }, + { + "epoch": 0.35587869511145126, + "grad_norm": 0.20957325398921967, + "learning_rate": 0.001, + "loss": 1.1658, + "step": 11160 + }, + { + "epoch": 0.35619758283108516, + "grad_norm": 0.20978455245494843, + "learning_rate": 0.001, + "loss": 1.1679, + "step": 11170 + }, + { + "epoch": 0.3565164705507191, + "grad_norm": 0.21704687178134918, + "learning_rate": 0.001, + "loss": 1.1588, + "step": 11180 + }, + { + "epoch": 0.356835358270353, + "grad_norm": 0.2168053388595581, + "learning_rate": 0.001, + "loss": 1.1589, + "step": 11190 + }, + { + "epoch": 0.3571542459899869, + "grad_norm": 0.2075323611497879, + "learning_rate": 0.001, + "loss": 1.1368, + "step": 11200 + }, + { + "epoch": 0.35747313370962086, + "grad_norm": 0.21764330565929413, + "learning_rate": 0.001, + "loss": 1.1713, + "step": 11210 + }, + { + "epoch": 0.35779202142925476, + "grad_norm": 0.2073378711938858, + "learning_rate": 0.001, + "loss": 1.1598, + "step": 11220 + }, + { + "epoch": 0.35811090914888866, + "grad_norm": 0.21896976232528687, + "learning_rate": 0.001, + "loss": 1.1781, + "step": 11230 + }, + { + "epoch": 0.3584297968685226, + "grad_norm": 0.21934987604618073, + "learning_rate": 0.001, + "loss": 1.1788, + "step": 11240 + }, + { + "epoch": 0.3587486845881565, + "grad_norm": 0.2167501598596573, + "learning_rate": 0.001, + "loss": 1.155, + "step": 11250 + }, + { + "epoch": 0.3590675723077904, + "grad_norm": 0.20548115670681, + "learning_rate": 0.001, + "loss": 1.1457, + "step": 11260 + }, + { + "epoch": 0.35938646002742436, + "grad_norm": 0.2144881784915924, + "learning_rate": 0.001, + "loss": 1.1708, + "step": 11270 + }, + { + "epoch": 0.35970534774705826, + "grad_norm": 0.20482467114925385, + "learning_rate": 0.001, + "loss": 1.141, + "step": 11280 + }, + { + "epoch": 0.36002423546669216, + "grad_norm": 0.21546229720115662, + "learning_rate": 0.001, + "loss": 1.1507, + "step": 11290 + }, + { + "epoch": 0.3603431231863261, + "grad_norm": 0.2183115929365158, + "learning_rate": 0.001, + "loss": 1.1648, + "step": 11300 + }, + { + "epoch": 0.36066201090596, + "grad_norm": 0.20953315496444702, + "learning_rate": 0.001, + "loss": 1.1527, + "step": 11310 + }, + { + "epoch": 0.3609808986255939, + "grad_norm": 0.21324065327644348, + "learning_rate": 0.001, + "loss": 1.1472, + "step": 11320 + }, + { + "epoch": 0.36129978634522786, + "grad_norm": 0.20822182297706604, + "learning_rate": 0.001, + "loss": 1.145, + "step": 11330 + }, + { + "epoch": 0.36161867406486176, + "grad_norm": 0.21162369847297668, + "learning_rate": 0.001, + "loss": 1.1699, + "step": 11340 + }, + { + "epoch": 0.36193756178449565, + "grad_norm": 0.21036869287490845, + "learning_rate": 0.001, + "loss": 1.1592, + "step": 11350 + }, + { + "epoch": 0.3622564495041296, + "grad_norm": 0.21742284297943115, + "learning_rate": 0.001, + "loss": 1.1698, + "step": 11360 + }, + { + "epoch": 0.3625753372237635, + "grad_norm": 0.20884467661380768, + "learning_rate": 0.001, + "loss": 1.155, + "step": 11370 + }, + { + "epoch": 0.3628942249433974, + "grad_norm": 0.21736779808998108, + "learning_rate": 0.001, + "loss": 1.1549, + "step": 11380 + }, + { + "epoch": 0.36321311266303136, + "grad_norm": 0.2081272453069687, + "learning_rate": 0.001, + "loss": 1.1599, + "step": 11390 + }, + { + "epoch": 0.36353200038266525, + "grad_norm": 0.21277934312820435, + "learning_rate": 0.001, + "loss": 1.1681, + "step": 11400 + }, + { + "epoch": 0.3638508881022992, + "grad_norm": 0.21342188119888306, + "learning_rate": 0.001, + "loss": 1.1478, + "step": 11410 + }, + { + "epoch": 0.3641697758219331, + "grad_norm": 0.21795684099197388, + "learning_rate": 0.001, + "loss": 1.1564, + "step": 11420 + }, + { + "epoch": 0.364488663541567, + "grad_norm": 0.2076573669910431, + "learning_rate": 0.001, + "loss": 1.1331, + "step": 11430 + }, + { + "epoch": 0.36480755126120096, + "grad_norm": 0.22240781784057617, + "learning_rate": 0.001, + "loss": 1.1609, + "step": 11440 + }, + { + "epoch": 0.36512643898083486, + "grad_norm": 0.208001971244812, + "learning_rate": 0.001, + "loss": 1.1581, + "step": 11450 + }, + { + "epoch": 0.36544532670046875, + "grad_norm": 0.20453792810440063, + "learning_rate": 0.001, + "loss": 1.1437, + "step": 11460 + }, + { + "epoch": 0.3657642144201027, + "grad_norm": 0.2149641364812851, + "learning_rate": 0.001, + "loss": 1.1582, + "step": 11470 + }, + { + "epoch": 0.3660831021397366, + "grad_norm": 0.21504372358322144, + "learning_rate": 0.001, + "loss": 1.1506, + "step": 11480 + }, + { + "epoch": 0.3664019898593705, + "grad_norm": 0.21626321971416473, + "learning_rate": 0.001, + "loss": 1.157, + "step": 11490 + }, + { + "epoch": 0.36672087757900446, + "grad_norm": 0.22062943875789642, + "learning_rate": 0.001, + "loss": 1.1718, + "step": 11500 + }, + { + "epoch": 0.36703976529863835, + "grad_norm": 0.20778238773345947, + "learning_rate": 0.001, + "loss": 1.1441, + "step": 11510 + }, + { + "epoch": 0.36735865301827225, + "grad_norm": 0.21755580604076385, + "learning_rate": 0.001, + "loss": 1.1711, + "step": 11520 + }, + { + "epoch": 0.3676775407379062, + "grad_norm": 0.21479518711566925, + "learning_rate": 0.001, + "loss": 1.1441, + "step": 11530 + }, + { + "epoch": 0.3679964284575401, + "grad_norm": 0.2130654752254486, + "learning_rate": 0.001, + "loss": 1.1366, + "step": 11540 + }, + { + "epoch": 0.368315316177174, + "grad_norm": 0.21198046207427979, + "learning_rate": 0.001, + "loss": 1.1578, + "step": 11550 + }, + { + "epoch": 0.36863420389680795, + "grad_norm": 0.2044605165719986, + "learning_rate": 0.001, + "loss": 1.1484, + "step": 11560 + }, + { + "epoch": 0.36895309161644185, + "grad_norm": 0.21563421189785004, + "learning_rate": 0.001, + "loss": 1.1645, + "step": 11570 + }, + { + "epoch": 0.36927197933607575, + "grad_norm": 0.20926934480667114, + "learning_rate": 0.001, + "loss": 1.131, + "step": 11580 + }, + { + "epoch": 0.3695908670557097, + "grad_norm": 0.21198894083499908, + "learning_rate": 0.001, + "loss": 1.1424, + "step": 11590 + }, + { + "epoch": 0.3699097547753436, + "grad_norm": 0.20869193971157074, + "learning_rate": 0.001, + "loss": 1.1747, + "step": 11600 + }, + { + "epoch": 0.3702286424949775, + "grad_norm": 0.20894703269004822, + "learning_rate": 0.001, + "loss": 1.1578, + "step": 11610 + }, + { + "epoch": 0.37054753021461145, + "grad_norm": 0.2092287838459015, + "learning_rate": 0.001, + "loss": 1.1677, + "step": 11620 + }, + { + "epoch": 0.37086641793424535, + "grad_norm": 0.21486634016036987, + "learning_rate": 0.001, + "loss": 1.1626, + "step": 11630 + }, + { + "epoch": 0.37118530565387925, + "grad_norm": 0.2121204286813736, + "learning_rate": 0.001, + "loss": 1.1816, + "step": 11640 + }, + { + "epoch": 0.3715041933735132, + "grad_norm": 0.20252400636672974, + "learning_rate": 0.001, + "loss": 1.1521, + "step": 11650 + }, + { + "epoch": 0.3718230810931471, + "grad_norm": 0.22328728437423706, + "learning_rate": 0.001, + "loss": 1.1693, + "step": 11660 + }, + { + "epoch": 0.372141968812781, + "grad_norm": 0.2148735076189041, + "learning_rate": 0.001, + "loss": 1.1723, + "step": 11670 + }, + { + "epoch": 0.37246085653241495, + "grad_norm": 0.20686081051826477, + "learning_rate": 0.001, + "loss": 1.1521, + "step": 11680 + }, + { + "epoch": 0.37277974425204885, + "grad_norm": 0.2087356299161911, + "learning_rate": 0.001, + "loss": 1.1516, + "step": 11690 + }, + { + "epoch": 0.37309863197168275, + "grad_norm": 0.20420542359352112, + "learning_rate": 0.001, + "loss": 1.1394, + "step": 11700 + }, + { + "epoch": 0.3734175196913167, + "grad_norm": 0.20632268488407135, + "learning_rate": 0.001, + "loss": 1.152, + "step": 11710 + }, + { + "epoch": 0.3737364074109506, + "grad_norm": 0.20878000557422638, + "learning_rate": 0.001, + "loss": 1.1492, + "step": 11720 + }, + { + "epoch": 0.3740552951305845, + "grad_norm": 0.21679173409938812, + "learning_rate": 0.001, + "loss": 1.1608, + "step": 11730 + }, + { + "epoch": 0.37437418285021845, + "grad_norm": 0.2118385136127472, + "learning_rate": 0.001, + "loss": 1.1604, + "step": 11740 + }, + { + "epoch": 0.37469307056985235, + "grad_norm": 0.2076609879732132, + "learning_rate": 0.001, + "loss": 1.1706, + "step": 11750 + }, + { + "epoch": 0.37501195828948625, + "grad_norm": 0.2135295271873474, + "learning_rate": 0.001, + "loss": 1.1611, + "step": 11760 + }, + { + "epoch": 0.3753308460091202, + "grad_norm": 0.21652574837207794, + "learning_rate": 0.001, + "loss": 1.1626, + "step": 11770 + }, + { + "epoch": 0.3756497337287541, + "grad_norm": 0.20713207125663757, + "learning_rate": 0.001, + "loss": 1.1533, + "step": 11780 + }, + { + "epoch": 0.375968621448388, + "grad_norm": 0.21347831189632416, + "learning_rate": 0.001, + "loss": 1.1775, + "step": 11790 + }, + { + "epoch": 0.37628750916802195, + "grad_norm": 0.2028295397758484, + "learning_rate": 0.001, + "loss": 1.1549, + "step": 11800 + }, + { + "epoch": 0.37660639688765585, + "grad_norm": 0.20557062327861786, + "learning_rate": 0.001, + "loss": 1.159, + "step": 11810 + }, + { + "epoch": 0.37692528460728975, + "grad_norm": 0.20521752536296844, + "learning_rate": 0.001, + "loss": 1.1624, + "step": 11820 + }, + { + "epoch": 0.3772441723269237, + "grad_norm": 0.20790110528469086, + "learning_rate": 0.001, + "loss": 1.1359, + "step": 11830 + }, + { + "epoch": 0.3775630600465576, + "grad_norm": 0.20268124341964722, + "learning_rate": 0.001, + "loss": 1.1597, + "step": 11840 + }, + { + "epoch": 0.37788194776619155, + "grad_norm": 0.20956410467624664, + "learning_rate": 0.001, + "loss": 1.1579, + "step": 11850 + }, + { + "epoch": 0.37820083548582545, + "grad_norm": 0.20633342862129211, + "learning_rate": 0.001, + "loss": 1.165, + "step": 11860 + }, + { + "epoch": 0.37851972320545935, + "grad_norm": 0.20639200508594513, + "learning_rate": 0.001, + "loss": 1.1598, + "step": 11870 + }, + { + "epoch": 0.3788386109250933, + "grad_norm": 0.2101225107908249, + "learning_rate": 0.001, + "loss": 1.1587, + "step": 11880 + }, + { + "epoch": 0.3791574986447272, + "grad_norm": 0.2012529969215393, + "learning_rate": 0.001, + "loss": 1.1504, + "step": 11890 + }, + { + "epoch": 0.3794763863643611, + "grad_norm": 0.2067694514989853, + "learning_rate": 0.001, + "loss": 1.1442, + "step": 11900 + }, + { + "epoch": 0.37979527408399505, + "grad_norm": 0.20460672676563263, + "learning_rate": 0.001, + "loss": 1.1604, + "step": 11910 + }, + { + "epoch": 0.38011416180362895, + "grad_norm": 0.20649559795856476, + "learning_rate": 0.001, + "loss": 1.164, + "step": 11920 + }, + { + "epoch": 0.38043304952326285, + "grad_norm": 0.20930494368076324, + "learning_rate": 0.001, + "loss": 1.1323, + "step": 11930 + }, + { + "epoch": 0.3807519372428968, + "grad_norm": 0.20886340737342834, + "learning_rate": 0.001, + "loss": 1.1271, + "step": 11940 + }, + { + "epoch": 0.3810708249625307, + "grad_norm": 0.21348540484905243, + "learning_rate": 0.001, + "loss": 1.1522, + "step": 11950 + }, + { + "epoch": 0.3813897126821646, + "grad_norm": 0.21220609545707703, + "learning_rate": 0.001, + "loss": 1.1615, + "step": 11960 + }, + { + "epoch": 0.38170860040179855, + "grad_norm": 0.2051163762807846, + "learning_rate": 0.001, + "loss": 1.1568, + "step": 11970 + }, + { + "epoch": 0.38202748812143245, + "grad_norm": 0.20502525568008423, + "learning_rate": 0.001, + "loss": 1.1374, + "step": 11980 + }, + { + "epoch": 0.38234637584106634, + "grad_norm": 0.21495133638381958, + "learning_rate": 0.001, + "loss": 1.1492, + "step": 11990 + }, + { + "epoch": 0.3826652635607003, + "grad_norm": 0.2174481600522995, + "learning_rate": 0.001, + "loss": 1.1474, + "step": 12000 + }, + { + "epoch": 0.3829841512803342, + "grad_norm": 0.21735884249210358, + "learning_rate": 0.001, + "loss": 1.1598, + "step": 12010 + }, + { + "epoch": 0.3833030389999681, + "grad_norm": 0.21421964466571808, + "learning_rate": 0.001, + "loss": 1.1757, + "step": 12020 + }, + { + "epoch": 0.38362192671960205, + "grad_norm": 0.2076413929462433, + "learning_rate": 0.001, + "loss": 1.1523, + "step": 12030 + }, + { + "epoch": 0.38394081443923594, + "grad_norm": 0.2062806487083435, + "learning_rate": 0.001, + "loss": 1.1421, + "step": 12040 + }, + { + "epoch": 0.38425970215886984, + "grad_norm": 0.2101665884256363, + "learning_rate": 0.001, + "loss": 1.1509, + "step": 12050 + }, + { + "epoch": 0.3845785898785038, + "grad_norm": 0.2052222043275833, + "learning_rate": 0.001, + "loss": 1.1387, + "step": 12060 + }, + { + "epoch": 0.3848974775981377, + "grad_norm": 0.20605748891830444, + "learning_rate": 0.001, + "loss": 1.1564, + "step": 12070 + }, + { + "epoch": 0.3852163653177716, + "grad_norm": 0.2109629064798355, + "learning_rate": 0.001, + "loss": 1.1539, + "step": 12080 + }, + { + "epoch": 0.38553525303740555, + "grad_norm": 0.21306735277175903, + "learning_rate": 0.001, + "loss": 1.1447, + "step": 12090 + }, + { + "epoch": 0.38585414075703944, + "grad_norm": 0.2046939879655838, + "learning_rate": 0.001, + "loss": 1.1549, + "step": 12100 + }, + { + "epoch": 0.38617302847667334, + "grad_norm": 0.20633463561534882, + "learning_rate": 0.001, + "loss": 1.1547, + "step": 12110 + }, + { + "epoch": 0.3864919161963073, + "grad_norm": 0.20908667147159576, + "learning_rate": 0.001, + "loss": 1.1626, + "step": 12120 + }, + { + "epoch": 0.3868108039159412, + "grad_norm": 0.2130550891160965, + "learning_rate": 0.001, + "loss": 1.1633, + "step": 12130 + }, + { + "epoch": 0.3871296916355751, + "grad_norm": 0.2054852545261383, + "learning_rate": 0.001, + "loss": 1.1264, + "step": 12140 + }, + { + "epoch": 0.38744857935520904, + "grad_norm": 0.2040906846523285, + "learning_rate": 0.001, + "loss": 1.1707, + "step": 12150 + }, + { + "epoch": 0.38776746707484294, + "grad_norm": 0.20435848832130432, + "learning_rate": 0.001, + "loss": 1.165, + "step": 12160 + }, + { + "epoch": 0.38808635479447684, + "grad_norm": 0.2084130495786667, + "learning_rate": 0.001, + "loss": 1.1264, + "step": 12170 + }, + { + "epoch": 0.3884052425141108, + "grad_norm": 0.2129623293876648, + "learning_rate": 0.001, + "loss": 1.1697, + "step": 12180 + }, + { + "epoch": 0.3887241302337447, + "grad_norm": 0.21030132472515106, + "learning_rate": 0.001, + "loss": 1.1461, + "step": 12190 + }, + { + "epoch": 0.3890430179533786, + "grad_norm": 0.20669963955879211, + "learning_rate": 0.001, + "loss": 1.1445, + "step": 12200 + }, + { + "epoch": 0.38936190567301254, + "grad_norm": 0.2081628292798996, + "learning_rate": 0.001, + "loss": 1.1528, + "step": 12210 + }, + { + "epoch": 0.38968079339264644, + "grad_norm": 0.20809587836265564, + "learning_rate": 0.001, + "loss": 1.1561, + "step": 12220 + }, + { + "epoch": 0.38999968111228034, + "grad_norm": 0.20185744762420654, + "learning_rate": 0.001, + "loss": 1.1413, + "step": 12230 + }, + { + "epoch": 0.3903185688319143, + "grad_norm": 0.20260155200958252, + "learning_rate": 0.001, + "loss": 1.1399, + "step": 12240 + }, + { + "epoch": 0.3906374565515482, + "grad_norm": 0.21670649945735931, + "learning_rate": 0.001, + "loss": 1.1275, + "step": 12250 + }, + { + "epoch": 0.39095634427118214, + "grad_norm": 0.21641764044761658, + "learning_rate": 0.001, + "loss": 1.1474, + "step": 12260 + }, + { + "epoch": 0.39127523199081604, + "grad_norm": 0.20669713616371155, + "learning_rate": 0.001, + "loss": 1.1597, + "step": 12270 + }, + { + "epoch": 0.39159411971044994, + "grad_norm": 0.21705591678619385, + "learning_rate": 0.001, + "loss": 1.1473, + "step": 12280 + }, + { + "epoch": 0.3919130074300839, + "grad_norm": 0.2091154009103775, + "learning_rate": 0.001, + "loss": 1.1463, + "step": 12290 + }, + { + "epoch": 0.3922318951497178, + "grad_norm": 0.20971107482910156, + "learning_rate": 0.001, + "loss": 1.1617, + "step": 12300 + }, + { + "epoch": 0.3925507828693517, + "grad_norm": 0.2061934471130371, + "learning_rate": 0.001, + "loss": 1.1493, + "step": 12310 + }, + { + "epoch": 0.39286967058898564, + "grad_norm": 0.20891344547271729, + "learning_rate": 0.001, + "loss": 1.1559, + "step": 12320 + }, + { + "epoch": 0.39318855830861954, + "grad_norm": 0.21326862275600433, + "learning_rate": 0.001, + "loss": 1.1546, + "step": 12330 + }, + { + "epoch": 0.39350744602825344, + "grad_norm": 0.2107415497303009, + "learning_rate": 0.001, + "loss": 1.1388, + "step": 12340 + }, + { + "epoch": 0.3938263337478874, + "grad_norm": 0.20264574885368347, + "learning_rate": 0.001, + "loss": 1.1338, + "step": 12350 + }, + { + "epoch": 0.3941452214675213, + "grad_norm": 0.20951254665851593, + "learning_rate": 0.001, + "loss": 1.1702, + "step": 12360 + }, + { + "epoch": 0.3944641091871552, + "grad_norm": 0.2009592056274414, + "learning_rate": 0.001, + "loss": 1.1421, + "step": 12370 + }, + { + "epoch": 0.39478299690678914, + "grad_norm": 0.2120770961046219, + "learning_rate": 0.001, + "loss": 1.1384, + "step": 12380 + }, + { + "epoch": 0.39510188462642304, + "grad_norm": 0.22238101065158844, + "learning_rate": 0.001, + "loss": 1.1391, + "step": 12390 + }, + { + "epoch": 0.39542077234605694, + "grad_norm": 0.20835790038108826, + "learning_rate": 0.001, + "loss": 1.154, + "step": 12400 + }, + { + "epoch": 0.3957396600656909, + "grad_norm": 0.20659910142421722, + "learning_rate": 0.001, + "loss": 1.1403, + "step": 12410 + }, + { + "epoch": 0.3960585477853248, + "grad_norm": 0.21030953526496887, + "learning_rate": 0.001, + "loss": 1.1501, + "step": 12420 + }, + { + "epoch": 0.3963774355049587, + "grad_norm": 0.20341965556144714, + "learning_rate": 0.001, + "loss": 1.1516, + "step": 12430 + }, + { + "epoch": 0.39669632322459264, + "grad_norm": 0.20903363823890686, + "learning_rate": 0.001, + "loss": 1.129, + "step": 12440 + }, + { + "epoch": 0.39701521094422654, + "grad_norm": 0.210405170917511, + "learning_rate": 0.001, + "loss": 1.1531, + "step": 12450 + }, + { + "epoch": 0.39733409866386044, + "grad_norm": 0.20492255687713623, + "learning_rate": 0.001, + "loss": 1.1362, + "step": 12460 + }, + { + "epoch": 0.3976529863834944, + "grad_norm": 0.21225130558013916, + "learning_rate": 0.001, + "loss": 1.1609, + "step": 12470 + }, + { + "epoch": 0.3979718741031283, + "grad_norm": 0.2099357396364212, + "learning_rate": 0.001, + "loss": 1.158, + "step": 12480 + }, + { + "epoch": 0.3982907618227622, + "grad_norm": 0.2044241577386856, + "learning_rate": 0.001, + "loss": 1.1476, + "step": 12490 + }, + { + "epoch": 0.39860964954239614, + "grad_norm": 0.20494481921195984, + "learning_rate": 0.001, + "loss": 1.1621, + "step": 12500 + }, + { + "epoch": 0.39892853726203004, + "grad_norm": 0.2142401486635208, + "learning_rate": 0.001, + "loss": 1.1352, + "step": 12510 + }, + { + "epoch": 0.39924742498166393, + "grad_norm": 0.2000083476305008, + "learning_rate": 0.001, + "loss": 1.1388, + "step": 12520 + }, + { + "epoch": 0.3995663127012979, + "grad_norm": 0.2018578201532364, + "learning_rate": 0.001, + "loss": 1.1563, + "step": 12530 + }, + { + "epoch": 0.3998852004209318, + "grad_norm": 0.21176375448703766, + "learning_rate": 0.001, + "loss": 1.1445, + "step": 12540 + }, + { + "epoch": 0.4002040881405657, + "grad_norm": 0.21311414241790771, + "learning_rate": 0.001, + "loss": 1.1257, + "step": 12550 + }, + { + "epoch": 0.40052297586019964, + "grad_norm": 0.19823896884918213, + "learning_rate": 0.001, + "loss": 1.1407, + "step": 12560 + }, + { + "epoch": 0.40084186357983353, + "grad_norm": 0.2098669707775116, + "learning_rate": 0.001, + "loss": 1.1684, + "step": 12570 + }, + { + "epoch": 0.40116075129946743, + "grad_norm": 0.20935393869876862, + "learning_rate": 0.001, + "loss": 1.1175, + "step": 12580 + }, + { + "epoch": 0.4014796390191014, + "grad_norm": 0.20373448729515076, + "learning_rate": 0.001, + "loss": 1.1666, + "step": 12590 + }, + { + "epoch": 0.4017985267387353, + "grad_norm": 0.2081720381975174, + "learning_rate": 0.001, + "loss": 1.1402, + "step": 12600 + }, + { + "epoch": 0.4021174144583692, + "grad_norm": 0.20864781737327576, + "learning_rate": 0.001, + "loss": 1.1447, + "step": 12610 + }, + { + "epoch": 0.40243630217800314, + "grad_norm": 0.20449693500995636, + "learning_rate": 0.001, + "loss": 1.1525, + "step": 12620 + }, + { + "epoch": 0.40275518989763703, + "grad_norm": 0.20613113045692444, + "learning_rate": 0.001, + "loss": 1.1263, + "step": 12630 + }, + { + "epoch": 0.40307407761727093, + "grad_norm": 0.20619328320026398, + "learning_rate": 0.001, + "loss": 1.127, + "step": 12640 + }, + { + "epoch": 0.4033929653369049, + "grad_norm": 0.20716503262519836, + "learning_rate": 0.001, + "loss": 1.1516, + "step": 12650 + }, + { + "epoch": 0.4037118530565388, + "grad_norm": 0.20824389159679413, + "learning_rate": 0.001, + "loss": 1.141, + "step": 12660 + }, + { + "epoch": 0.40403074077617274, + "grad_norm": 0.2075486034154892, + "learning_rate": 0.001, + "loss": 1.138, + "step": 12670 + }, + { + "epoch": 0.40434962849580663, + "grad_norm": 0.21800272166728973, + "learning_rate": 0.001, + "loss": 1.1466, + "step": 12680 + }, + { + "epoch": 0.40466851621544053, + "grad_norm": 0.2054111361503601, + "learning_rate": 0.001, + "loss": 1.1323, + "step": 12690 + }, + { + "epoch": 0.4049874039350745, + "grad_norm": 0.20511585474014282, + "learning_rate": 0.001, + "loss": 1.1468, + "step": 12700 + }, + { + "epoch": 0.4053062916547084, + "grad_norm": 0.21086783707141876, + "learning_rate": 0.001, + "loss": 1.1305, + "step": 12710 + }, + { + "epoch": 0.4056251793743423, + "grad_norm": 0.2127915620803833, + "learning_rate": 0.001, + "loss": 1.1521, + "step": 12720 + }, + { + "epoch": 0.40594406709397624, + "grad_norm": 0.2181047797203064, + "learning_rate": 0.001, + "loss": 1.1347, + "step": 12730 + }, + { + "epoch": 0.40626295481361013, + "grad_norm": 0.21023984253406525, + "learning_rate": 0.001, + "loss": 1.1454, + "step": 12740 + }, + { + "epoch": 0.40658184253324403, + "grad_norm": 0.1992928385734558, + "learning_rate": 0.001, + "loss": 1.1357, + "step": 12750 + }, + { + "epoch": 0.406900730252878, + "grad_norm": 0.20112110674381256, + "learning_rate": 0.001, + "loss": 1.141, + "step": 12760 + }, + { + "epoch": 0.4072196179725119, + "grad_norm": 0.2041330635547638, + "learning_rate": 0.001, + "loss": 1.1541, + "step": 12770 + }, + { + "epoch": 0.4075385056921458, + "grad_norm": 0.20971736311912537, + "learning_rate": 0.001, + "loss": 1.1343, + "step": 12780 + }, + { + "epoch": 0.40785739341177973, + "grad_norm": 0.20947255194187164, + "learning_rate": 0.001, + "loss": 1.1403, + "step": 12790 + }, + { + "epoch": 0.40817628113141363, + "grad_norm": 0.2103559523820877, + "learning_rate": 0.001, + "loss": 1.1574, + "step": 12800 + }, + { + "epoch": 0.40849516885104753, + "grad_norm": 0.20097647607326508, + "learning_rate": 0.001, + "loss": 1.1358, + "step": 12810 + }, + { + "epoch": 0.4088140565706815, + "grad_norm": 0.20684273540973663, + "learning_rate": 0.001, + "loss": 1.1376, + "step": 12820 + }, + { + "epoch": 0.4091329442903154, + "grad_norm": 0.2013324797153473, + "learning_rate": 0.001, + "loss": 1.1315, + "step": 12830 + }, + { + "epoch": 0.4094518320099493, + "grad_norm": 0.20758014917373657, + "learning_rate": 0.001, + "loss": 1.1097, + "step": 12840 + }, + { + "epoch": 0.40977071972958323, + "grad_norm": 0.21744020283222198, + "learning_rate": 0.001, + "loss": 1.1434, + "step": 12850 + }, + { + "epoch": 0.41008960744921713, + "grad_norm": 0.19597770273685455, + "learning_rate": 0.001, + "loss": 1.1233, + "step": 12860 + }, + { + "epoch": 0.41040849516885103, + "grad_norm": 0.19693255424499512, + "learning_rate": 0.001, + "loss": 1.1297, + "step": 12870 + }, + { + "epoch": 0.410727382888485, + "grad_norm": 0.2022457867860794, + "learning_rate": 0.001, + "loss": 1.144, + "step": 12880 + }, + { + "epoch": 0.4110462706081189, + "grad_norm": 0.20683251321315765, + "learning_rate": 0.001, + "loss": 1.149, + "step": 12890 + }, + { + "epoch": 0.4113651583277528, + "grad_norm": 0.202144593000412, + "learning_rate": 0.001, + "loss": 1.1299, + "step": 12900 + }, + { + "epoch": 0.41168404604738673, + "grad_norm": 0.1991073489189148, + "learning_rate": 0.001, + "loss": 1.1262, + "step": 12910 + }, + { + "epoch": 0.41200293376702063, + "grad_norm": 0.20432505011558533, + "learning_rate": 0.001, + "loss": 1.137, + "step": 12920 + }, + { + "epoch": 0.4123218214866545, + "grad_norm": 0.2065206915140152, + "learning_rate": 0.001, + "loss": 1.1301, + "step": 12930 + }, + { + "epoch": 0.4126407092062885, + "grad_norm": 0.21112370491027832, + "learning_rate": 0.001, + "loss": 1.1481, + "step": 12940 + }, + { + "epoch": 0.4129595969259224, + "grad_norm": 0.215112566947937, + "learning_rate": 0.001, + "loss": 1.1468, + "step": 12950 + }, + { + "epoch": 0.4132784846455563, + "grad_norm": 0.20078018307685852, + "learning_rate": 0.001, + "loss": 1.1475, + "step": 12960 + }, + { + "epoch": 0.41359737236519023, + "grad_norm": 0.21014413237571716, + "learning_rate": 0.001, + "loss": 1.1507, + "step": 12970 + }, + { + "epoch": 0.41391626008482413, + "grad_norm": 0.2088528871536255, + "learning_rate": 0.001, + "loss": 1.1526, + "step": 12980 + }, + { + "epoch": 0.414235147804458, + "grad_norm": 0.2205217182636261, + "learning_rate": 0.001, + "loss": 1.1377, + "step": 12990 + }, + { + "epoch": 0.414554035524092, + "grad_norm": 0.20459334552288055, + "learning_rate": 0.001, + "loss": 1.1384, + "step": 13000 + }, + { + "epoch": 0.4148729232437259, + "grad_norm": 0.21253494918346405, + "learning_rate": 0.001, + "loss": 1.1215, + "step": 13010 + }, + { + "epoch": 0.4151918109633598, + "grad_norm": 0.20328834652900696, + "learning_rate": 0.001, + "loss": 1.1553, + "step": 13020 + }, + { + "epoch": 0.41551069868299373, + "grad_norm": 0.2183370292186737, + "learning_rate": 0.001, + "loss": 1.1477, + "step": 13030 + }, + { + "epoch": 0.4158295864026276, + "grad_norm": 0.2050125151872635, + "learning_rate": 0.001, + "loss": 1.1318, + "step": 13040 + }, + { + "epoch": 0.4161484741222615, + "grad_norm": 0.20545805990695953, + "learning_rate": 0.001, + "loss": 1.1359, + "step": 13050 + }, + { + "epoch": 0.4164673618418955, + "grad_norm": 0.2132645845413208, + "learning_rate": 0.001, + "loss": 1.156, + "step": 13060 + }, + { + "epoch": 0.4167862495615294, + "grad_norm": 0.2059139609336853, + "learning_rate": 0.001, + "loss": 1.1395, + "step": 13070 + }, + { + "epoch": 0.41710513728116333, + "grad_norm": 0.2027749866247177, + "learning_rate": 0.001, + "loss": 1.1437, + "step": 13080 + }, + { + "epoch": 0.4174240250007972, + "grad_norm": 0.20797421038150787, + "learning_rate": 0.001, + "loss": 1.1403, + "step": 13090 + }, + { + "epoch": 0.4177429127204311, + "grad_norm": 0.20062734186649323, + "learning_rate": 0.001, + "loss": 1.1325, + "step": 13100 + }, + { + "epoch": 0.4180618004400651, + "grad_norm": 0.21262063086032867, + "learning_rate": 0.001, + "loss": 1.1396, + "step": 13110 + }, + { + "epoch": 0.418380688159699, + "grad_norm": 0.2075001448392868, + "learning_rate": 0.001, + "loss": 1.149, + "step": 13120 + }, + { + "epoch": 0.4186995758793329, + "grad_norm": 0.20594367384910583, + "learning_rate": 0.001, + "loss": 1.1424, + "step": 13130 + }, + { + "epoch": 0.41901846359896683, + "grad_norm": 0.20854982733726501, + "learning_rate": 0.001, + "loss": 1.1376, + "step": 13140 + }, + { + "epoch": 0.4193373513186007, + "grad_norm": 0.20273441076278687, + "learning_rate": 0.001, + "loss": 1.148, + "step": 13150 + }, + { + "epoch": 0.4196562390382346, + "grad_norm": 0.20512281358242035, + "learning_rate": 0.001, + "loss": 1.1313, + "step": 13160 + }, + { + "epoch": 0.4199751267578686, + "grad_norm": 0.20326954126358032, + "learning_rate": 0.001, + "loss": 1.1415, + "step": 13170 + }, + { + "epoch": 0.4202940144775025, + "grad_norm": 0.21317848563194275, + "learning_rate": 0.001, + "loss": 1.1229, + "step": 13180 + }, + { + "epoch": 0.4206129021971364, + "grad_norm": 0.2062990963459015, + "learning_rate": 0.001, + "loss": 1.1389, + "step": 13190 + }, + { + "epoch": 0.4209317899167703, + "grad_norm": 0.2079552263021469, + "learning_rate": 0.001, + "loss": 1.1538, + "step": 13200 + }, + { + "epoch": 0.4212506776364042, + "grad_norm": 0.20866861939430237, + "learning_rate": 0.001, + "loss": 1.1403, + "step": 13210 + }, + { + "epoch": 0.4215695653560381, + "grad_norm": 0.2059970647096634, + "learning_rate": 0.001, + "loss": 1.1388, + "step": 13220 + }, + { + "epoch": 0.4218884530756721, + "grad_norm": 0.20208317041397095, + "learning_rate": 0.001, + "loss": 1.1436, + "step": 13230 + }, + { + "epoch": 0.422207340795306, + "grad_norm": 0.20684318244457245, + "learning_rate": 0.001, + "loss": 1.1254, + "step": 13240 + }, + { + "epoch": 0.42252622851493987, + "grad_norm": 0.2068798840045929, + "learning_rate": 0.001, + "loss": 1.1176, + "step": 13250 + }, + { + "epoch": 0.4228451162345738, + "grad_norm": 0.21011073887348175, + "learning_rate": 0.001, + "loss": 1.1267, + "step": 13260 + }, + { + "epoch": 0.4231640039542077, + "grad_norm": 0.20945926010608673, + "learning_rate": 0.001, + "loss": 1.1438, + "step": 13270 + }, + { + "epoch": 0.4234828916738416, + "grad_norm": 0.20559492707252502, + "learning_rate": 0.001, + "loss": 1.1207, + "step": 13280 + }, + { + "epoch": 0.4238017793934756, + "grad_norm": 0.19994908571243286, + "learning_rate": 0.001, + "loss": 1.1322, + "step": 13290 + }, + { + "epoch": 0.4241206671131095, + "grad_norm": 0.2100694179534912, + "learning_rate": 0.001, + "loss": 1.1563, + "step": 13300 + }, + { + "epoch": 0.42443955483274337, + "grad_norm": 0.2056904286146164, + "learning_rate": 0.001, + "loss": 1.1523, + "step": 13310 + }, + { + "epoch": 0.4247584425523773, + "grad_norm": 0.20013177394866943, + "learning_rate": 0.001, + "loss": 1.1321, + "step": 13320 + }, + { + "epoch": 0.4250773302720112, + "grad_norm": 0.20355407893657684, + "learning_rate": 0.001, + "loss": 1.1393, + "step": 13330 + }, + { + "epoch": 0.4253962179916451, + "grad_norm": 0.2022884041070938, + "learning_rate": 0.001, + "loss": 1.1307, + "step": 13340 + }, + { + "epoch": 0.4257151057112791, + "grad_norm": 0.2134474664926529, + "learning_rate": 0.001, + "loss": 1.1358, + "step": 13350 + }, + { + "epoch": 0.42603399343091297, + "grad_norm": 0.21307939291000366, + "learning_rate": 0.001, + "loss": 1.1268, + "step": 13360 + }, + { + "epoch": 0.42635288115054687, + "grad_norm": 0.21968849003314972, + "learning_rate": 0.001, + "loss": 1.1401, + "step": 13370 + }, + { + "epoch": 0.4266717688701808, + "grad_norm": 0.2057107537984848, + "learning_rate": 0.001, + "loss": 1.1221, + "step": 13380 + }, + { + "epoch": 0.4269906565898147, + "grad_norm": 0.20316410064697266, + "learning_rate": 0.001, + "loss": 1.1284, + "step": 13390 + }, + { + "epoch": 0.4273095443094486, + "grad_norm": 0.201827734708786, + "learning_rate": 0.001, + "loss": 1.1229, + "step": 13400 + }, + { + "epoch": 0.4276284320290826, + "grad_norm": 0.1960659623146057, + "learning_rate": 0.001, + "loss": 1.1161, + "step": 13410 + }, + { + "epoch": 0.42794731974871647, + "grad_norm": 0.209204763174057, + "learning_rate": 0.001, + "loss": 1.1459, + "step": 13420 + }, + { + "epoch": 0.42826620746835037, + "grad_norm": 0.2040780484676361, + "learning_rate": 0.001, + "loss": 1.1405, + "step": 13430 + }, + { + "epoch": 0.4285850951879843, + "grad_norm": 0.2102985382080078, + "learning_rate": 0.001, + "loss": 1.1436, + "step": 13440 + }, + { + "epoch": 0.4289039829076182, + "grad_norm": 0.2021404355764389, + "learning_rate": 0.001, + "loss": 1.1329, + "step": 13450 + }, + { + "epoch": 0.4292228706272521, + "grad_norm": 0.1990777999162674, + "learning_rate": 0.001, + "loss": 1.134, + "step": 13460 + }, + { + "epoch": 0.42954175834688607, + "grad_norm": 0.20001259446144104, + "learning_rate": 0.001, + "loss": 1.1359, + "step": 13470 + }, + { + "epoch": 0.42986064606651997, + "grad_norm": 0.2053236961364746, + "learning_rate": 0.001, + "loss": 1.1493, + "step": 13480 + }, + { + "epoch": 0.4301795337861539, + "grad_norm": 0.19334106147289276, + "learning_rate": 0.001, + "loss": 1.129, + "step": 13490 + }, + { + "epoch": 0.4304984215057878, + "grad_norm": 0.20170816779136658, + "learning_rate": 0.001, + "loss": 1.1302, + "step": 13500 + }, + { + "epoch": 0.4308173092254217, + "grad_norm": 0.2028067409992218, + "learning_rate": 0.001, + "loss": 1.1287, + "step": 13510 + }, + { + "epoch": 0.43113619694505567, + "grad_norm": 0.20177865028381348, + "learning_rate": 0.001, + "loss": 1.1547, + "step": 13520 + }, + { + "epoch": 0.43145508466468957, + "grad_norm": 0.20576360821723938, + "learning_rate": 0.001, + "loss": 1.1246, + "step": 13530 + }, + { + "epoch": 0.43177397238432347, + "grad_norm": 0.2029898315668106, + "learning_rate": 0.001, + "loss": 1.1383, + "step": 13540 + }, + { + "epoch": 0.4320928601039574, + "grad_norm": 0.2129756659269333, + "learning_rate": 0.001, + "loss": 1.1465, + "step": 13550 + }, + { + "epoch": 0.4324117478235913, + "grad_norm": 0.20829495787620544, + "learning_rate": 0.001, + "loss": 1.1352, + "step": 13560 + }, + { + "epoch": 0.4327306355432252, + "grad_norm": 0.20578406751155853, + "learning_rate": 0.001, + "loss": 1.1344, + "step": 13570 + }, + { + "epoch": 0.43304952326285917, + "grad_norm": 0.20792439579963684, + "learning_rate": 0.001, + "loss": 1.144, + "step": 13580 + }, + { + "epoch": 0.43336841098249307, + "grad_norm": 0.207744300365448, + "learning_rate": 0.001, + "loss": 1.1303, + "step": 13590 + }, + { + "epoch": 0.43368729870212697, + "grad_norm": 0.2021283656358719, + "learning_rate": 0.001, + "loss": 1.1297, + "step": 13600 + }, + { + "epoch": 0.4340061864217609, + "grad_norm": 0.2113650143146515, + "learning_rate": 0.001, + "loss": 1.1322, + "step": 13610 + }, + { + "epoch": 0.4343250741413948, + "grad_norm": 0.20138460397720337, + "learning_rate": 0.001, + "loss": 1.1487, + "step": 13620 + }, + { + "epoch": 0.4346439618610287, + "grad_norm": 0.19764484465122223, + "learning_rate": 0.001, + "loss": 1.1179, + "step": 13630 + }, + { + "epoch": 0.43496284958066267, + "grad_norm": 0.20085632801055908, + "learning_rate": 0.001, + "loss": 1.1391, + "step": 13640 + }, + { + "epoch": 0.43528173730029657, + "grad_norm": 0.1991196572780609, + "learning_rate": 0.001, + "loss": 1.1327, + "step": 13650 + }, + { + "epoch": 0.43560062501993047, + "grad_norm": 0.1970689445734024, + "learning_rate": 0.001, + "loss": 1.1204, + "step": 13660 + }, + { + "epoch": 0.4359195127395644, + "grad_norm": 0.2032547891139984, + "learning_rate": 0.001, + "loss": 1.1384, + "step": 13670 + }, + { + "epoch": 0.4362384004591983, + "grad_norm": 0.20708051323890686, + "learning_rate": 0.001, + "loss": 1.1258, + "step": 13680 + }, + { + "epoch": 0.4365572881788322, + "grad_norm": 0.21240440011024475, + "learning_rate": 0.001, + "loss": 1.129, + "step": 13690 + }, + { + "epoch": 0.43687617589846617, + "grad_norm": 0.21483121812343597, + "learning_rate": 0.001, + "loss": 1.1412, + "step": 13700 + }, + { + "epoch": 0.43719506361810007, + "grad_norm": 0.20378629863262177, + "learning_rate": 0.001, + "loss": 1.1283, + "step": 13710 + }, + { + "epoch": 0.43751395133773396, + "grad_norm": 0.19939197599887848, + "learning_rate": 0.001, + "loss": 1.1122, + "step": 13720 + }, + { + "epoch": 0.4378328390573679, + "grad_norm": 0.19610774517059326, + "learning_rate": 0.001, + "loss": 1.1482, + "step": 13730 + }, + { + "epoch": 0.4381517267770018, + "grad_norm": 0.20175138115882874, + "learning_rate": 0.001, + "loss": 1.1367, + "step": 13740 + }, + { + "epoch": 0.4384706144966357, + "grad_norm": 0.20364269614219666, + "learning_rate": 0.001, + "loss": 1.1276, + "step": 13750 + }, + { + "epoch": 0.43878950221626967, + "grad_norm": 0.20994988083839417, + "learning_rate": 0.001, + "loss": 1.126, + "step": 13760 + }, + { + "epoch": 0.43910838993590356, + "grad_norm": 0.2069726586341858, + "learning_rate": 0.001, + "loss": 1.1354, + "step": 13770 + }, + { + "epoch": 0.43942727765553746, + "grad_norm": 0.2140015810728073, + "learning_rate": 0.001, + "loss": 1.1374, + "step": 13780 + }, + { + "epoch": 0.4397461653751714, + "grad_norm": 0.20899558067321777, + "learning_rate": 0.001, + "loss": 1.1374, + "step": 13790 + }, + { + "epoch": 0.4400650530948053, + "grad_norm": 0.20549681782722473, + "learning_rate": 0.001, + "loss": 1.1326, + "step": 13800 + }, + { + "epoch": 0.4403839408144392, + "grad_norm": 0.20833928883075714, + "learning_rate": 0.001, + "loss": 1.1228, + "step": 13810 + }, + { + "epoch": 0.44070282853407317, + "grad_norm": 0.20258395373821259, + "learning_rate": 0.001, + "loss": 1.1241, + "step": 13820 + }, + { + "epoch": 0.44102171625370706, + "grad_norm": 0.20737484097480774, + "learning_rate": 0.001, + "loss": 1.1264, + "step": 13830 + }, + { + "epoch": 0.44134060397334096, + "grad_norm": 0.21442458033561707, + "learning_rate": 0.001, + "loss": 1.1135, + "step": 13840 + }, + { + "epoch": 0.4416594916929749, + "grad_norm": 0.20314937829971313, + "learning_rate": 0.001, + "loss": 1.1266, + "step": 13850 + }, + { + "epoch": 0.4419783794126088, + "grad_norm": 0.2008056789636612, + "learning_rate": 0.001, + "loss": 1.1277, + "step": 13860 + }, + { + "epoch": 0.4422972671322427, + "grad_norm": 0.20103736221790314, + "learning_rate": 0.001, + "loss": 1.118, + "step": 13870 + }, + { + "epoch": 0.44261615485187666, + "grad_norm": 0.20233802497386932, + "learning_rate": 0.001, + "loss": 1.109, + "step": 13880 + }, + { + "epoch": 0.44293504257151056, + "grad_norm": 0.19475015997886658, + "learning_rate": 0.001, + "loss": 1.1169, + "step": 13890 + }, + { + "epoch": 0.4432539302911445, + "grad_norm": 0.20203594863414764, + "learning_rate": 0.001, + "loss": 1.1366, + "step": 13900 + }, + { + "epoch": 0.4435728180107784, + "grad_norm": 0.19808048009872437, + "learning_rate": 0.001, + "loss": 1.1152, + "step": 13910 + }, + { + "epoch": 0.4438917057304123, + "grad_norm": 0.19753964245319366, + "learning_rate": 0.001, + "loss": 1.1312, + "step": 13920 + }, + { + "epoch": 0.44421059345004626, + "grad_norm": 0.20471017062664032, + "learning_rate": 0.001, + "loss": 1.1402, + "step": 13930 + }, + { + "epoch": 0.44452948116968016, + "grad_norm": 0.2034641057252884, + "learning_rate": 0.001, + "loss": 1.1154, + "step": 13940 + }, + { + "epoch": 0.44484836888931406, + "grad_norm": 0.20656068623065948, + "learning_rate": 0.001, + "loss": 1.1271, + "step": 13950 + }, + { + "epoch": 0.445167256608948, + "grad_norm": 0.19708023965358734, + "learning_rate": 0.001, + "loss": 1.1462, + "step": 13960 + }, + { + "epoch": 0.4454861443285819, + "grad_norm": 0.20702509582042694, + "learning_rate": 0.001, + "loss": 1.1216, + "step": 13970 + }, + { + "epoch": 0.4458050320482158, + "grad_norm": 0.20129531621932983, + "learning_rate": 0.001, + "loss": 1.127, + "step": 13980 + }, + { + "epoch": 0.44612391976784976, + "grad_norm": 0.19751425087451935, + "learning_rate": 0.001, + "loss": 1.1247, + "step": 13990 + }, + { + "epoch": 0.44644280748748366, + "grad_norm": 0.20703715085983276, + "learning_rate": 0.001, + "loss": 1.121, + "step": 14000 + }, + { + "epoch": 0.44676169520711756, + "grad_norm": 0.20253314077854156, + "learning_rate": 0.001, + "loss": 1.1366, + "step": 14010 + }, + { + "epoch": 0.4470805829267515, + "grad_norm": 0.21088387072086334, + "learning_rate": 0.001, + "loss": 1.1354, + "step": 14020 + }, + { + "epoch": 0.4473994706463854, + "grad_norm": 0.212107852101326, + "learning_rate": 0.001, + "loss": 1.1329, + "step": 14030 + }, + { + "epoch": 0.4477183583660193, + "grad_norm": 0.19477473199367523, + "learning_rate": 0.001, + "loss": 1.1186, + "step": 14040 + }, + { + "epoch": 0.44803724608565326, + "grad_norm": 0.20723167061805725, + "learning_rate": 0.001, + "loss": 1.1134, + "step": 14050 + }, + { + "epoch": 0.44835613380528716, + "grad_norm": 0.20093318819999695, + "learning_rate": 0.001, + "loss": 1.1169, + "step": 14060 + }, + { + "epoch": 0.44867502152492106, + "grad_norm": 0.2053205668926239, + "learning_rate": 0.001, + "loss": 1.1109, + "step": 14070 + }, + { + "epoch": 0.448993909244555, + "grad_norm": 0.20331956446170807, + "learning_rate": 0.001, + "loss": 1.125, + "step": 14080 + }, + { + "epoch": 0.4493127969641889, + "grad_norm": 0.2000836879014969, + "learning_rate": 0.001, + "loss": 1.1215, + "step": 14090 + }, + { + "epoch": 0.4496316846838228, + "grad_norm": 0.1976880431175232, + "learning_rate": 0.001, + "loss": 1.1243, + "step": 14100 + }, + { + "epoch": 0.44995057240345676, + "grad_norm": 0.20531907677650452, + "learning_rate": 0.001, + "loss": 1.1278, + "step": 14110 + }, + { + "epoch": 0.45026946012309066, + "grad_norm": 0.2220134139060974, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 14120 + }, + { + "epoch": 0.45058834784272456, + "grad_norm": 0.19986401498317719, + "learning_rate": 0.001, + "loss": 1.1199, + "step": 14130 + }, + { + "epoch": 0.4509072355623585, + "grad_norm": 0.20359410345554352, + "learning_rate": 0.001, + "loss": 1.1143, + "step": 14140 + }, + { + "epoch": 0.4512261232819924, + "grad_norm": 0.19630028307437897, + "learning_rate": 0.001, + "loss": 1.147, + "step": 14150 + }, + { + "epoch": 0.4515450110016263, + "grad_norm": 0.20296724140644073, + "learning_rate": 0.001, + "loss": 1.1201, + "step": 14160 + }, + { + "epoch": 0.45186389872126026, + "grad_norm": 0.19895164668560028, + "learning_rate": 0.001, + "loss": 1.1155, + "step": 14170 + }, + { + "epoch": 0.45218278644089416, + "grad_norm": 0.19755735993385315, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 14180 + }, + { + "epoch": 0.45250167416052806, + "grad_norm": 0.19897565245628357, + "learning_rate": 0.001, + "loss": 1.106, + "step": 14190 + }, + { + "epoch": 0.452820561880162, + "grad_norm": 0.20602868497371674, + "learning_rate": 0.001, + "loss": 1.1359, + "step": 14200 + }, + { + "epoch": 0.4531394495997959, + "grad_norm": 0.19570337235927582, + "learning_rate": 0.001, + "loss": 1.1332, + "step": 14210 + }, + { + "epoch": 0.4534583373194298, + "grad_norm": 0.198469340801239, + "learning_rate": 0.001, + "loss": 1.1235, + "step": 14220 + }, + { + "epoch": 0.45377722503906376, + "grad_norm": 0.21264946460723877, + "learning_rate": 0.001, + "loss": 1.1112, + "step": 14230 + }, + { + "epoch": 0.45409611275869766, + "grad_norm": 0.2016083151102066, + "learning_rate": 0.001, + "loss": 1.1111, + "step": 14240 + }, + { + "epoch": 0.45441500047833155, + "grad_norm": 0.20173093676567078, + "learning_rate": 0.001, + "loss": 1.1295, + "step": 14250 + }, + { + "epoch": 0.4547338881979655, + "grad_norm": 0.20293079316616058, + "learning_rate": 0.001, + "loss": 1.1402, + "step": 14260 + }, + { + "epoch": 0.4550527759175994, + "grad_norm": 0.19944404065608978, + "learning_rate": 0.001, + "loss": 1.1199, + "step": 14270 + }, + { + "epoch": 0.4553716636372333, + "grad_norm": 0.20182962715625763, + "learning_rate": 0.001, + "loss": 1.1321, + "step": 14280 + }, + { + "epoch": 0.45569055135686726, + "grad_norm": 0.21239547431468964, + "learning_rate": 0.001, + "loss": 1.1333, + "step": 14290 + }, + { + "epoch": 0.45600943907650116, + "grad_norm": 0.1999959498643875, + "learning_rate": 0.001, + "loss": 1.1371, + "step": 14300 + }, + { + "epoch": 0.45632832679613505, + "grad_norm": 0.20423386991024017, + "learning_rate": 0.001, + "loss": 1.1355, + "step": 14310 + }, + { + "epoch": 0.456647214515769, + "grad_norm": 0.205377459526062, + "learning_rate": 0.001, + "loss": 1.1277, + "step": 14320 + }, + { + "epoch": 0.4569661022354029, + "grad_norm": 0.20549103617668152, + "learning_rate": 0.001, + "loss": 1.1211, + "step": 14330 + }, + { + "epoch": 0.45728498995503686, + "grad_norm": 0.20159433782100677, + "learning_rate": 0.001, + "loss": 1.1114, + "step": 14340 + }, + { + "epoch": 0.45760387767467076, + "grad_norm": 0.19585607945919037, + "learning_rate": 0.001, + "loss": 1.1233, + "step": 14350 + }, + { + "epoch": 0.45792276539430465, + "grad_norm": 0.19485332071781158, + "learning_rate": 0.001, + "loss": 1.1195, + "step": 14360 + }, + { + "epoch": 0.4582416531139386, + "grad_norm": 0.20004454255104065, + "learning_rate": 0.001, + "loss": 1.148, + "step": 14370 + }, + { + "epoch": 0.4585605408335725, + "grad_norm": 0.19846779108047485, + "learning_rate": 0.001, + "loss": 1.1371, + "step": 14380 + }, + { + "epoch": 0.4588794285532064, + "grad_norm": 0.20324277877807617, + "learning_rate": 0.001, + "loss": 1.1138, + "step": 14390 + }, + { + "epoch": 0.45919831627284036, + "grad_norm": 0.20810678601264954, + "learning_rate": 0.001, + "loss": 1.1397, + "step": 14400 + }, + { + "epoch": 0.45951720399247425, + "grad_norm": 0.20195460319519043, + "learning_rate": 0.001, + "loss": 1.0977, + "step": 14410 + }, + { + "epoch": 0.45983609171210815, + "grad_norm": 0.208050936460495, + "learning_rate": 0.001, + "loss": 1.1208, + "step": 14420 + }, + { + "epoch": 0.4601549794317421, + "grad_norm": 0.21316717565059662, + "learning_rate": 0.001, + "loss": 1.1178, + "step": 14430 + }, + { + "epoch": 0.460473867151376, + "grad_norm": 0.20156152546405792, + "learning_rate": 0.001, + "loss": 1.1368, + "step": 14440 + }, + { + "epoch": 0.4607927548710099, + "grad_norm": 0.19752919673919678, + "learning_rate": 0.001, + "loss": 1.1164, + "step": 14450 + }, + { + "epoch": 0.46111164259064386, + "grad_norm": 0.2068374752998352, + "learning_rate": 0.001, + "loss": 1.1285, + "step": 14460 + }, + { + "epoch": 0.46143053031027775, + "grad_norm": 0.20846422016620636, + "learning_rate": 0.001, + "loss": 1.1185, + "step": 14470 + }, + { + "epoch": 0.46174941802991165, + "grad_norm": 0.2047877013683319, + "learning_rate": 0.001, + "loss": 1.1047, + "step": 14480 + }, + { + "epoch": 0.4620683057495456, + "grad_norm": 0.2022201269865036, + "learning_rate": 0.001, + "loss": 1.1189, + "step": 14490 + }, + { + "epoch": 0.4623871934691795, + "grad_norm": 0.2064102441072464, + "learning_rate": 0.001, + "loss": 1.119, + "step": 14500 + }, + { + "epoch": 0.4627060811888134, + "grad_norm": 0.20427323877811432, + "learning_rate": 0.001, + "loss": 1.1152, + "step": 14510 + }, + { + "epoch": 0.46302496890844735, + "grad_norm": 0.20223671197891235, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 14520 + }, + { + "epoch": 0.46334385662808125, + "grad_norm": 0.21089716255664825, + "learning_rate": 0.001, + "loss": 1.1452, + "step": 14530 + }, + { + "epoch": 0.46366274434771515, + "grad_norm": 0.19083629548549652, + "learning_rate": 0.001, + "loss": 1.1092, + "step": 14540 + }, + { + "epoch": 0.4639816320673491, + "grad_norm": 0.20347940921783447, + "learning_rate": 0.001, + "loss": 1.1336, + "step": 14550 + }, + { + "epoch": 0.464300519786983, + "grad_norm": 0.19843998551368713, + "learning_rate": 0.001, + "loss": 1.1206, + "step": 14560 + }, + { + "epoch": 0.4646194075066169, + "grad_norm": 0.2033349722623825, + "learning_rate": 0.001, + "loss": 1.1205, + "step": 14570 + }, + { + "epoch": 0.46493829522625085, + "grad_norm": 0.20412671566009521, + "learning_rate": 0.001, + "loss": 1.1181, + "step": 14580 + }, + { + "epoch": 0.46525718294588475, + "grad_norm": 0.19806016981601715, + "learning_rate": 0.001, + "loss": 1.1177, + "step": 14590 + }, + { + "epoch": 0.46557607066551865, + "grad_norm": 0.2021852433681488, + "learning_rate": 0.001, + "loss": 1.1228, + "step": 14600 + }, + { + "epoch": 0.4658949583851526, + "grad_norm": 0.20214885473251343, + "learning_rate": 0.001, + "loss": 1.1265, + "step": 14610 + }, + { + "epoch": 0.4662138461047865, + "grad_norm": 0.1979108452796936, + "learning_rate": 0.001, + "loss": 1.1219, + "step": 14620 + }, + { + "epoch": 0.4665327338244204, + "grad_norm": 0.2091963291168213, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 14630 + }, + { + "epoch": 0.46685162154405435, + "grad_norm": 0.19643768668174744, + "learning_rate": 0.001, + "loss": 1.0983, + "step": 14640 + }, + { + "epoch": 0.46717050926368825, + "grad_norm": 0.20225919783115387, + "learning_rate": 0.001, + "loss": 1.1236, + "step": 14650 + }, + { + "epoch": 0.46748939698332215, + "grad_norm": 0.20334993302822113, + "learning_rate": 0.001, + "loss": 1.1202, + "step": 14660 + }, + { + "epoch": 0.4678082847029561, + "grad_norm": 0.20838162302970886, + "learning_rate": 0.001, + "loss": 1.1189, + "step": 14670 + }, + { + "epoch": 0.46812717242259, + "grad_norm": 0.19877314567565918, + "learning_rate": 0.001, + "loss": 1.1088, + "step": 14680 + }, + { + "epoch": 0.4684460601422239, + "grad_norm": 0.19828420877456665, + "learning_rate": 0.001, + "loss": 1.1117, + "step": 14690 + }, + { + "epoch": 0.46876494786185785, + "grad_norm": 0.20643897354602814, + "learning_rate": 0.001, + "loss": 1.1153, + "step": 14700 + }, + { + "epoch": 0.46908383558149175, + "grad_norm": 0.20390458405017853, + "learning_rate": 0.001, + "loss": 1.1338, + "step": 14710 + }, + { + "epoch": 0.46940272330112565, + "grad_norm": 0.20136936008930206, + "learning_rate": 0.001, + "loss": 1.1234, + "step": 14720 + }, + { + "epoch": 0.4697216110207596, + "grad_norm": 0.2019462287425995, + "learning_rate": 0.001, + "loss": 1.1193, + "step": 14730 + }, + { + "epoch": 0.4700404987403935, + "grad_norm": 0.20261630415916443, + "learning_rate": 0.001, + "loss": 1.1325, + "step": 14740 + }, + { + "epoch": 0.47035938646002745, + "grad_norm": 0.2060079276561737, + "learning_rate": 0.001, + "loss": 1.1314, + "step": 14750 + }, + { + "epoch": 0.47067827417966135, + "grad_norm": 0.19139251112937927, + "learning_rate": 0.001, + "loss": 1.1073, + "step": 14760 + }, + { + "epoch": 0.47099716189929525, + "grad_norm": 0.1992751955986023, + "learning_rate": 0.001, + "loss": 1.0953, + "step": 14770 + }, + { + "epoch": 0.4713160496189292, + "grad_norm": 0.19770146906375885, + "learning_rate": 0.001, + "loss": 1.1258, + "step": 14780 + }, + { + "epoch": 0.4716349373385631, + "grad_norm": 0.2079428881406784, + "learning_rate": 0.001, + "loss": 1.1327, + "step": 14790 + }, + { + "epoch": 0.471953825058197, + "grad_norm": 0.20283734798431396, + "learning_rate": 0.001, + "loss": 1.1325, + "step": 14800 + }, + { + "epoch": 0.47227271277783095, + "grad_norm": 0.19868163764476776, + "learning_rate": 0.001, + "loss": 1.1156, + "step": 14810 + }, + { + "epoch": 0.47259160049746485, + "grad_norm": 0.20224575698375702, + "learning_rate": 0.001, + "loss": 1.1284, + "step": 14820 + }, + { + "epoch": 0.47291048821709875, + "grad_norm": 0.19769024848937988, + "learning_rate": 0.001, + "loss": 1.1313, + "step": 14830 + }, + { + "epoch": 0.4732293759367327, + "grad_norm": 0.19698044657707214, + "learning_rate": 0.001, + "loss": 1.1305, + "step": 14840 + }, + { + "epoch": 0.4735482636563666, + "grad_norm": 0.19292454421520233, + "learning_rate": 0.001, + "loss": 1.1227, + "step": 14850 + }, + { + "epoch": 0.4738671513760005, + "grad_norm": 0.20179186761379242, + "learning_rate": 0.001, + "loss": 1.11, + "step": 14860 + }, + { + "epoch": 0.47418603909563445, + "grad_norm": 0.19742195308208466, + "learning_rate": 0.001, + "loss": 1.1086, + "step": 14870 + }, + { + "epoch": 0.47450492681526835, + "grad_norm": 0.2044827789068222, + "learning_rate": 0.001, + "loss": 1.1171, + "step": 14880 + }, + { + "epoch": 0.47482381453490224, + "grad_norm": 0.20579135417938232, + "learning_rate": 0.001, + "loss": 1.1231, + "step": 14890 + }, + { + "epoch": 0.4751427022545362, + "grad_norm": 0.20538045465946198, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 14900 + }, + { + "epoch": 0.4754615899741701, + "grad_norm": 0.2013697773218155, + "learning_rate": 0.001, + "loss": 1.1342, + "step": 14910 + }, + { + "epoch": 0.475780477693804, + "grad_norm": 0.20036698877811432, + "learning_rate": 0.001, + "loss": 1.1222, + "step": 14920 + }, + { + "epoch": 0.47609936541343795, + "grad_norm": 0.20143401622772217, + "learning_rate": 0.001, + "loss": 1.1294, + "step": 14930 + }, + { + "epoch": 0.47641825313307185, + "grad_norm": 0.1989293098449707, + "learning_rate": 0.001, + "loss": 1.1357, + "step": 14940 + }, + { + "epoch": 0.47673714085270574, + "grad_norm": 0.19735567271709442, + "learning_rate": 0.001, + "loss": 1.1111, + "step": 14950 + }, + { + "epoch": 0.4770560285723397, + "grad_norm": 0.19401957094669342, + "learning_rate": 0.001, + "loss": 1.1239, + "step": 14960 + }, + { + "epoch": 0.4773749162919736, + "grad_norm": 0.2115725427865982, + "learning_rate": 0.001, + "loss": 1.1239, + "step": 14970 + }, + { + "epoch": 0.4776938040116075, + "grad_norm": 0.20321685075759888, + "learning_rate": 0.001, + "loss": 1.112, + "step": 14980 + }, + { + "epoch": 0.47801269173124145, + "grad_norm": 0.19996105134487152, + "learning_rate": 0.001, + "loss": 1.1056, + "step": 14990 + }, + { + "epoch": 0.47833157945087534, + "grad_norm": 0.20889067649841309, + "learning_rate": 0.001, + "loss": 1.1347, + "step": 15000 + }, + { + "epoch": 0.47865046717050924, + "grad_norm": 0.20725928246974945, + "learning_rate": 0.001, + "loss": 1.1317, + "step": 15010 + }, + { + "epoch": 0.4789693548901432, + "grad_norm": 0.21042446792125702, + "learning_rate": 0.001, + "loss": 1.1133, + "step": 15020 + }, + { + "epoch": 0.4792882426097771, + "grad_norm": 0.2007140964269638, + "learning_rate": 0.001, + "loss": 1.1308, + "step": 15030 + }, + { + "epoch": 0.479607130329411, + "grad_norm": 0.19597560167312622, + "learning_rate": 0.001, + "loss": 1.1055, + "step": 15040 + }, + { + "epoch": 0.47992601804904494, + "grad_norm": 0.20281028747558594, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 15050 + }, + { + "epoch": 0.48024490576867884, + "grad_norm": 0.1976708173751831, + "learning_rate": 0.001, + "loss": 1.1159, + "step": 15060 + }, + { + "epoch": 0.48056379348831274, + "grad_norm": 0.19908636808395386, + "learning_rate": 0.001, + "loss": 1.1279, + "step": 15070 + }, + { + "epoch": 0.4808826812079467, + "grad_norm": 0.20221206545829773, + "learning_rate": 0.001, + "loss": 1.123, + "step": 15080 + }, + { + "epoch": 0.4812015689275806, + "grad_norm": 0.2094741016626358, + "learning_rate": 0.001, + "loss": 1.115, + "step": 15090 + }, + { + "epoch": 0.4815204566472145, + "grad_norm": 0.20078401267528534, + "learning_rate": 0.001, + "loss": 1.1298, + "step": 15100 + }, + { + "epoch": 0.48183934436684844, + "grad_norm": 0.20339038968086243, + "learning_rate": 0.001, + "loss": 1.1008, + "step": 15110 + }, + { + "epoch": 0.48215823208648234, + "grad_norm": 0.20455828309059143, + "learning_rate": 0.001, + "loss": 1.0992, + "step": 15120 + }, + { + "epoch": 0.48247711980611624, + "grad_norm": 0.2017097920179367, + "learning_rate": 0.001, + "loss": 1.1076, + "step": 15130 + }, + { + "epoch": 0.4827960075257502, + "grad_norm": 0.19957958161830902, + "learning_rate": 0.001, + "loss": 1.1125, + "step": 15140 + }, + { + "epoch": 0.4831148952453841, + "grad_norm": 0.19430184364318848, + "learning_rate": 0.001, + "loss": 1.1012, + "step": 15150 + }, + { + "epoch": 0.48343378296501804, + "grad_norm": 0.196171373128891, + "learning_rate": 0.001, + "loss": 1.1116, + "step": 15160 + }, + { + "epoch": 0.48375267068465194, + "grad_norm": 0.2169899195432663, + "learning_rate": 0.001, + "loss": 1.1154, + "step": 15170 + }, + { + "epoch": 0.48407155840428584, + "grad_norm": 0.20589181780815125, + "learning_rate": 0.001, + "loss": 1.1101, + "step": 15180 + }, + { + "epoch": 0.4843904461239198, + "grad_norm": 0.20493051409721375, + "learning_rate": 0.001, + "loss": 1.103, + "step": 15190 + }, + { + "epoch": 0.4847093338435537, + "grad_norm": 0.2098941057920456, + "learning_rate": 0.001, + "loss": 1.1414, + "step": 15200 + }, + { + "epoch": 0.4850282215631876, + "grad_norm": 0.1904533952474594, + "learning_rate": 0.001, + "loss": 1.1027, + "step": 15210 + }, + { + "epoch": 0.48534710928282154, + "grad_norm": 0.1936672180891037, + "learning_rate": 0.001, + "loss": 1.1141, + "step": 15220 + }, + { + "epoch": 0.48566599700245544, + "grad_norm": 0.20026364922523499, + "learning_rate": 0.001, + "loss": 1.1243, + "step": 15230 + }, + { + "epoch": 0.48598488472208934, + "grad_norm": 0.20649927854537964, + "learning_rate": 0.001, + "loss": 1.1259, + "step": 15240 + }, + { + "epoch": 0.4863037724417233, + "grad_norm": 0.19340914487838745, + "learning_rate": 0.001, + "loss": 1.1079, + "step": 15250 + }, + { + "epoch": 0.4866226601613572, + "grad_norm": 0.19627441465854645, + "learning_rate": 0.001, + "loss": 1.1149, + "step": 15260 + }, + { + "epoch": 0.4869415478809911, + "grad_norm": 0.19537758827209473, + "learning_rate": 0.001, + "loss": 1.1113, + "step": 15270 + }, + { + "epoch": 0.48726043560062504, + "grad_norm": 0.19534988701343536, + "learning_rate": 0.001, + "loss": 1.0957, + "step": 15280 + }, + { + "epoch": 0.48757932332025894, + "grad_norm": 0.19658777117729187, + "learning_rate": 0.001, + "loss": 1.1178, + "step": 15290 + }, + { + "epoch": 0.48789821103989284, + "grad_norm": 0.19526197016239166, + "learning_rate": 0.001, + "loss": 1.1025, + "step": 15300 + }, + { + "epoch": 0.4882170987595268, + "grad_norm": 0.2003452330827713, + "learning_rate": 0.001, + "loss": 1.0961, + "step": 15310 + }, + { + "epoch": 0.4885359864791607, + "grad_norm": 0.19825130701065063, + "learning_rate": 0.001, + "loss": 1.1145, + "step": 15320 + }, + { + "epoch": 0.4888548741987946, + "grad_norm": 0.19999739527702332, + "learning_rate": 0.001, + "loss": 1.1157, + "step": 15330 + }, + { + "epoch": 0.48917376191842854, + "grad_norm": 0.2152135968208313, + "learning_rate": 0.001, + "loss": 1.1051, + "step": 15340 + }, + { + "epoch": 0.48949264963806244, + "grad_norm": 0.21462447941303253, + "learning_rate": 0.001, + "loss": 1.1062, + "step": 15350 + }, + { + "epoch": 0.48981153735769634, + "grad_norm": 0.1989239752292633, + "learning_rate": 0.001, + "loss": 1.0992, + "step": 15360 + }, + { + "epoch": 0.4901304250773303, + "grad_norm": 0.19589900970458984, + "learning_rate": 0.001, + "loss": 1.1241, + "step": 15370 + }, + { + "epoch": 0.4904493127969642, + "grad_norm": 0.2006465196609497, + "learning_rate": 0.001, + "loss": 1.1142, + "step": 15380 + }, + { + "epoch": 0.4907682005165981, + "grad_norm": 0.19966904819011688, + "learning_rate": 0.001, + "loss": 1.1233, + "step": 15390 + }, + { + "epoch": 0.49108708823623204, + "grad_norm": 0.20915593206882477, + "learning_rate": 0.001, + "loss": 1.1042, + "step": 15400 + }, + { + "epoch": 0.49140597595586594, + "grad_norm": 0.19926536083221436, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 15410 + }, + { + "epoch": 0.49172486367549983, + "grad_norm": 0.20816414058208466, + "learning_rate": 0.001, + "loss": 1.121, + "step": 15420 + }, + { + "epoch": 0.4920437513951338, + "grad_norm": 0.19813525676727295, + "learning_rate": 0.001, + "loss": 1.1168, + "step": 15430 + }, + { + "epoch": 0.4923626391147677, + "grad_norm": 0.19787755608558655, + "learning_rate": 0.001, + "loss": 1.1333, + "step": 15440 + }, + { + "epoch": 0.4926815268344016, + "grad_norm": 0.20607894659042358, + "learning_rate": 0.001, + "loss": 1.1423, + "step": 15450 + }, + { + "epoch": 0.49300041455403554, + "grad_norm": 0.19217760860919952, + "learning_rate": 0.001, + "loss": 1.1317, + "step": 15460 + }, + { + "epoch": 0.49331930227366944, + "grad_norm": 0.215703085064888, + "learning_rate": 0.001, + "loss": 1.1366, + "step": 15470 + }, + { + "epoch": 0.49363818999330333, + "grad_norm": 0.20476995408535004, + "learning_rate": 0.001, + "loss": 1.1027, + "step": 15480 + }, + { + "epoch": 0.4939570777129373, + "grad_norm": 0.2130293995141983, + "learning_rate": 0.001, + "loss": 1.1285, + "step": 15490 + }, + { + "epoch": 0.4942759654325712, + "grad_norm": 0.1988312005996704, + "learning_rate": 0.001, + "loss": 1.108, + "step": 15500 + }, + { + "epoch": 0.4945948531522051, + "grad_norm": 0.21203584969043732, + "learning_rate": 0.001, + "loss": 1.1159, + "step": 15510 + }, + { + "epoch": 0.49491374087183904, + "grad_norm": 0.194093719124794, + "learning_rate": 0.001, + "loss": 1.127, + "step": 15520 + }, + { + "epoch": 0.49523262859147293, + "grad_norm": 0.19752824306488037, + "learning_rate": 0.001, + "loss": 1.1186, + "step": 15530 + }, + { + "epoch": 0.49555151631110683, + "grad_norm": 0.19278797507286072, + "learning_rate": 0.001, + "loss": 1.1151, + "step": 15540 + }, + { + "epoch": 0.4958704040307408, + "grad_norm": 0.20309525728225708, + "learning_rate": 0.001, + "loss": 1.1221, + "step": 15550 + }, + { + "epoch": 0.4961892917503747, + "grad_norm": 0.19723108410835266, + "learning_rate": 0.001, + "loss": 1.1124, + "step": 15560 + }, + { + "epoch": 0.49650817947000864, + "grad_norm": 0.19733686745166779, + "learning_rate": 0.001, + "loss": 1.1239, + "step": 15570 + }, + { + "epoch": 0.49682706718964254, + "grad_norm": 0.19719557464122772, + "learning_rate": 0.001, + "loss": 1.1165, + "step": 15580 + }, + { + "epoch": 0.49714595490927643, + "grad_norm": 0.19925282895565033, + "learning_rate": 0.001, + "loss": 1.1261, + "step": 15590 + }, + { + "epoch": 0.4974648426289104, + "grad_norm": 0.1964748650789261, + "learning_rate": 0.001, + "loss": 1.1078, + "step": 15600 + }, + { + "epoch": 0.4977837303485443, + "grad_norm": 0.20201486349105835, + "learning_rate": 0.001, + "loss": 1.108, + "step": 15610 + }, + { + "epoch": 0.4981026180681782, + "grad_norm": 0.19340449571609497, + "learning_rate": 0.001, + "loss": 1.1143, + "step": 15620 + }, + { + "epoch": 0.49842150578781214, + "grad_norm": 0.1944858431816101, + "learning_rate": 0.001, + "loss": 1.1001, + "step": 15630 + }, + { + "epoch": 0.49874039350744603, + "grad_norm": 0.20309315621852875, + "learning_rate": 0.001, + "loss": 1.1148, + "step": 15640 + }, + { + "epoch": 0.49905928122707993, + "grad_norm": 0.20255059003829956, + "learning_rate": 0.001, + "loss": 1.1152, + "step": 15650 + }, + { + "epoch": 0.4993781689467139, + "grad_norm": 0.2079618126153946, + "learning_rate": 0.001, + "loss": 1.1346, + "step": 15660 + }, + { + "epoch": 0.4996970566663478, + "grad_norm": 0.20329873263835907, + "learning_rate": 0.001, + "loss": 1.1143, + "step": 15670 + }, + { + "epoch": 0.5000159443859817, + "grad_norm": 0.2044350504875183, + "learning_rate": 0.001, + "loss": 1.1169, + "step": 15680 + }, + { + "epoch": 0.5003348321056156, + "grad_norm": 0.19735576212406158, + "learning_rate": 0.001, + "loss": 1.1092, + "step": 15690 + }, + { + "epoch": 0.5006537198252495, + "grad_norm": 0.19718340039253235, + "learning_rate": 0.001, + "loss": 1.1114, + "step": 15700 + }, + { + "epoch": 0.5009726075448835, + "grad_norm": 0.20076552033424377, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 15710 + }, + { + "epoch": 0.5012914952645173, + "grad_norm": 0.19438549876213074, + "learning_rate": 0.001, + "loss": 1.1284, + "step": 15720 + }, + { + "epoch": 0.5016103829841513, + "grad_norm": 0.2000940889120102, + "learning_rate": 0.001, + "loss": 1.105, + "step": 15730 + }, + { + "epoch": 0.5019292707037852, + "grad_norm": 0.19995105266571045, + "learning_rate": 0.001, + "loss": 1.1332, + "step": 15740 + }, + { + "epoch": 0.5022481584234191, + "grad_norm": 0.1938658207654953, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 15750 + }, + { + "epoch": 0.502567046143053, + "grad_norm": 0.20208021998405457, + "learning_rate": 0.001, + "loss": 1.0963, + "step": 15760 + }, + { + "epoch": 0.502885933862687, + "grad_norm": 0.19796696305274963, + "learning_rate": 0.001, + "loss": 1.1174, + "step": 15770 + }, + { + "epoch": 0.5032048215823208, + "grad_norm": 0.20823107659816742, + "learning_rate": 0.001, + "loss": 1.115, + "step": 15780 + }, + { + "epoch": 0.5035237093019548, + "grad_norm": 0.19293028116226196, + "learning_rate": 0.001, + "loss": 1.107, + "step": 15790 + }, + { + "epoch": 0.5038425970215887, + "grad_norm": 0.19398380815982819, + "learning_rate": 0.001, + "loss": 1.1052, + "step": 15800 + }, + { + "epoch": 0.5041614847412226, + "grad_norm": 0.19862490892410278, + "learning_rate": 0.001, + "loss": 1.1215, + "step": 15810 + }, + { + "epoch": 0.5044803724608565, + "grad_norm": 0.1949176788330078, + "learning_rate": 0.001, + "loss": 1.1116, + "step": 15820 + }, + { + "epoch": 0.5047992601804905, + "grad_norm": 0.20239092409610748, + "learning_rate": 0.001, + "loss": 1.1019, + "step": 15830 + }, + { + "epoch": 0.5051181479001243, + "grad_norm": 0.19988751411437988, + "learning_rate": 0.001, + "loss": 1.1158, + "step": 15840 + }, + { + "epoch": 0.5054370356197583, + "grad_norm": 0.2054993063211441, + "learning_rate": 0.001, + "loss": 1.1217, + "step": 15850 + }, + { + "epoch": 0.5057559233393922, + "grad_norm": 0.20914222300052643, + "learning_rate": 0.001, + "loss": 1.1258, + "step": 15860 + }, + { + "epoch": 0.5060748110590261, + "grad_norm": 0.19964182376861572, + "learning_rate": 0.001, + "loss": 1.106, + "step": 15870 + }, + { + "epoch": 0.50639369877866, + "grad_norm": 0.20104973018169403, + "learning_rate": 0.001, + "loss": 1.1128, + "step": 15880 + }, + { + "epoch": 0.506712586498294, + "grad_norm": 0.20262691378593445, + "learning_rate": 0.001, + "loss": 1.0998, + "step": 15890 + }, + { + "epoch": 0.5070314742179278, + "grad_norm": 0.20666274428367615, + "learning_rate": 0.001, + "loss": 1.1046, + "step": 15900 + }, + { + "epoch": 0.5073503619375618, + "grad_norm": 0.19419854879379272, + "learning_rate": 0.001, + "loss": 1.1065, + "step": 15910 + }, + { + "epoch": 0.5076692496571957, + "grad_norm": 0.20134715735912323, + "learning_rate": 0.001, + "loss": 1.1036, + "step": 15920 + }, + { + "epoch": 0.5079881373768296, + "grad_norm": 0.20258976519107819, + "learning_rate": 0.001, + "loss": 1.1118, + "step": 15930 + }, + { + "epoch": 0.5083070250964635, + "grad_norm": 0.19312818348407745, + "learning_rate": 0.001, + "loss": 1.1039, + "step": 15940 + }, + { + "epoch": 0.5086259128160975, + "grad_norm": 0.20441237092018127, + "learning_rate": 0.001, + "loss": 1.106, + "step": 15950 + }, + { + "epoch": 0.5089448005357313, + "grad_norm": 0.19846174120903015, + "learning_rate": 0.001, + "loss": 1.1146, + "step": 15960 + }, + { + "epoch": 0.5092636882553653, + "grad_norm": 0.2000012993812561, + "learning_rate": 0.001, + "loss": 1.1157, + "step": 15970 + }, + { + "epoch": 0.5095825759749992, + "grad_norm": 0.19293838739395142, + "learning_rate": 0.001, + "loss": 1.1043, + "step": 15980 + }, + { + "epoch": 0.5099014636946331, + "grad_norm": 0.1996237188577652, + "learning_rate": 0.001, + "loss": 1.0916, + "step": 15990 + }, + { + "epoch": 0.510220351414267, + "grad_norm": 0.20133572816848755, + "learning_rate": 0.001, + "loss": 1.101, + "step": 16000 + }, + { + "epoch": 0.510539239133901, + "grad_norm": 0.20171783864498138, + "learning_rate": 0.001, + "loss": 1.1077, + "step": 16010 + }, + { + "epoch": 0.5108581268535348, + "grad_norm": 0.20236876606941223, + "learning_rate": 0.001, + "loss": 1.1173, + "step": 16020 + }, + { + "epoch": 0.5111770145731688, + "grad_norm": 0.19765743613243103, + "learning_rate": 0.001, + "loss": 1.0923, + "step": 16030 + }, + { + "epoch": 0.5114959022928027, + "grad_norm": 0.2046414166688919, + "learning_rate": 0.001, + "loss": 1.1292, + "step": 16040 + }, + { + "epoch": 0.5118147900124366, + "grad_norm": 0.20035295188426971, + "learning_rate": 0.001, + "loss": 1.1027, + "step": 16050 + }, + { + "epoch": 0.5121336777320705, + "grad_norm": 0.20384572446346283, + "learning_rate": 0.001, + "loss": 1.1259, + "step": 16060 + }, + { + "epoch": 0.5124525654517045, + "grad_norm": 0.19551537930965424, + "learning_rate": 0.001, + "loss": 1.081, + "step": 16070 + }, + { + "epoch": 0.5127714531713383, + "grad_norm": 0.19394861161708832, + "learning_rate": 0.001, + "loss": 1.1006, + "step": 16080 + }, + { + "epoch": 0.5130903408909723, + "grad_norm": 0.19792471826076508, + "learning_rate": 0.001, + "loss": 1.1112, + "step": 16090 + }, + { + "epoch": 0.5134092286106062, + "grad_norm": 0.19944746792316437, + "learning_rate": 0.001, + "loss": 1.1147, + "step": 16100 + }, + { + "epoch": 0.5137281163302401, + "grad_norm": 0.20174691081047058, + "learning_rate": 0.001, + "loss": 1.1072, + "step": 16110 + }, + { + "epoch": 0.514047004049874, + "grad_norm": 0.1961384117603302, + "learning_rate": 0.001, + "loss": 1.1048, + "step": 16120 + }, + { + "epoch": 0.514365891769508, + "grad_norm": 0.19337189197540283, + "learning_rate": 0.001, + "loss": 1.1041, + "step": 16130 + }, + { + "epoch": 0.5146847794891418, + "grad_norm": 0.1975313425064087, + "learning_rate": 0.001, + "loss": 1.1047, + "step": 16140 + }, + { + "epoch": 0.5150036672087758, + "grad_norm": 0.20119927823543549, + "learning_rate": 0.001, + "loss": 1.1132, + "step": 16150 + }, + { + "epoch": 0.5153225549284097, + "grad_norm": 0.19607770442962646, + "learning_rate": 0.001, + "loss": 1.1088, + "step": 16160 + }, + { + "epoch": 0.5156414426480436, + "grad_norm": 0.2076045274734497, + "learning_rate": 0.001, + "loss": 1.0965, + "step": 16170 + }, + { + "epoch": 0.5159603303676775, + "grad_norm": 0.20277442038059235, + "learning_rate": 0.001, + "loss": 1.0985, + "step": 16180 + }, + { + "epoch": 0.5162792180873115, + "grad_norm": 0.19827525317668915, + "learning_rate": 0.001, + "loss": 1.1024, + "step": 16190 + }, + { + "epoch": 0.5165981058069454, + "grad_norm": 0.19448797404766083, + "learning_rate": 0.001, + "loss": 1.1168, + "step": 16200 + }, + { + "epoch": 0.5169169935265793, + "grad_norm": 0.20615480840206146, + "learning_rate": 0.001, + "loss": 1.1078, + "step": 16210 + }, + { + "epoch": 0.5172358812462132, + "grad_norm": 0.19415822625160217, + "learning_rate": 0.001, + "loss": 1.1123, + "step": 16220 + }, + { + "epoch": 0.5175547689658472, + "grad_norm": 0.19522392749786377, + "learning_rate": 0.001, + "loss": 1.1041, + "step": 16230 + }, + { + "epoch": 0.517873656685481, + "grad_norm": 0.199362650513649, + "learning_rate": 0.001, + "loss": 1.1114, + "step": 16240 + }, + { + "epoch": 0.518192544405115, + "grad_norm": 0.19673673808574677, + "learning_rate": 0.001, + "loss": 1.1173, + "step": 16250 + }, + { + "epoch": 0.5185114321247489, + "grad_norm": 0.19982314109802246, + "learning_rate": 0.001, + "loss": 1.1089, + "step": 16260 + }, + { + "epoch": 0.5188303198443828, + "grad_norm": 0.19413277506828308, + "learning_rate": 0.001, + "loss": 1.0947, + "step": 16270 + }, + { + "epoch": 0.5191492075640167, + "grad_norm": 0.19558317959308624, + "learning_rate": 0.001, + "loss": 1.0951, + "step": 16280 + }, + { + "epoch": 0.5194680952836507, + "grad_norm": 0.20333553850650787, + "learning_rate": 0.001, + "loss": 1.1017, + "step": 16290 + }, + { + "epoch": 0.5197869830032845, + "grad_norm": 0.1913626492023468, + "learning_rate": 0.001, + "loss": 1.1263, + "step": 16300 + }, + { + "epoch": 0.5201058707229185, + "grad_norm": 0.19983579218387604, + "learning_rate": 0.001, + "loss": 1.1234, + "step": 16310 + }, + { + "epoch": 0.5204247584425524, + "grad_norm": 0.1915716677904129, + "learning_rate": 0.001, + "loss": 1.102, + "step": 16320 + }, + { + "epoch": 0.5207436461621863, + "grad_norm": 0.2049056887626648, + "learning_rate": 0.001, + "loss": 1.107, + "step": 16330 + }, + { + "epoch": 0.5210625338818202, + "grad_norm": 0.20463578402996063, + "learning_rate": 0.001, + "loss": 1.1209, + "step": 16340 + }, + { + "epoch": 0.5213814216014542, + "grad_norm": 0.20603425800800323, + "learning_rate": 0.001, + "loss": 1.0963, + "step": 16350 + }, + { + "epoch": 0.521700309321088, + "grad_norm": 0.19194631278514862, + "learning_rate": 0.001, + "loss": 1.0881, + "step": 16360 + }, + { + "epoch": 0.522019197040722, + "grad_norm": 0.20453418791294098, + "learning_rate": 0.001, + "loss": 1.098, + "step": 16370 + }, + { + "epoch": 0.5223380847603559, + "grad_norm": 0.1975255161523819, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 16380 + }, + { + "epoch": 0.5226569724799898, + "grad_norm": 0.20485936105251312, + "learning_rate": 0.001, + "loss": 1.1387, + "step": 16390 + }, + { + "epoch": 0.5229758601996237, + "grad_norm": 0.20453734695911407, + "learning_rate": 0.001, + "loss": 1.0976, + "step": 16400 + }, + { + "epoch": 0.5232947479192577, + "grad_norm": 0.20409847795963287, + "learning_rate": 0.001, + "loss": 1.0998, + "step": 16410 + }, + { + "epoch": 0.5236136356388915, + "grad_norm": 0.19635087251663208, + "learning_rate": 0.001, + "loss": 1.0953, + "step": 16420 + }, + { + "epoch": 0.5239325233585255, + "grad_norm": 0.19614706933498383, + "learning_rate": 0.001, + "loss": 1.1247, + "step": 16430 + }, + { + "epoch": 0.5242514110781594, + "grad_norm": 0.19600021839141846, + "learning_rate": 0.001, + "loss": 1.115, + "step": 16440 + }, + { + "epoch": 0.5245702987977933, + "grad_norm": 0.19973431527614594, + "learning_rate": 0.001, + "loss": 1.1194, + "step": 16450 + }, + { + "epoch": 0.5248891865174272, + "grad_norm": 0.20397818088531494, + "learning_rate": 0.001, + "loss": 1.1036, + "step": 16460 + }, + { + "epoch": 0.5252080742370612, + "grad_norm": 0.19828130304813385, + "learning_rate": 0.001, + "loss": 1.1153, + "step": 16470 + }, + { + "epoch": 0.525526961956695, + "grad_norm": 0.19535043835639954, + "learning_rate": 0.001, + "loss": 1.1259, + "step": 16480 + }, + { + "epoch": 0.525845849676329, + "grad_norm": 0.20396125316619873, + "learning_rate": 0.001, + "loss": 1.1176, + "step": 16490 + }, + { + "epoch": 0.5261647373959629, + "grad_norm": 0.1998380720615387, + "learning_rate": 0.001, + "loss": 1.0899, + "step": 16500 + }, + { + "epoch": 0.5264836251155968, + "grad_norm": 0.1916278749704361, + "learning_rate": 0.001, + "loss": 1.0945, + "step": 16510 + }, + { + "epoch": 0.5268025128352307, + "grad_norm": 0.2070852816104889, + "learning_rate": 0.001, + "loss": 1.1125, + "step": 16520 + }, + { + "epoch": 0.5271214005548647, + "grad_norm": 0.19235506653785706, + "learning_rate": 0.001, + "loss": 1.0999, + "step": 16530 + }, + { + "epoch": 0.5274402882744985, + "grad_norm": 0.2007400244474411, + "learning_rate": 0.001, + "loss": 1.1357, + "step": 16540 + }, + { + "epoch": 0.5277591759941325, + "grad_norm": 0.19115379452705383, + "learning_rate": 0.001, + "loss": 1.0922, + "step": 16550 + }, + { + "epoch": 0.5280780637137664, + "grad_norm": 0.19835355877876282, + "learning_rate": 0.001, + "loss": 1.1089, + "step": 16560 + }, + { + "epoch": 0.5283969514334003, + "grad_norm": 0.19772939383983612, + "learning_rate": 0.001, + "loss": 1.0973, + "step": 16570 + }, + { + "epoch": 0.5287158391530342, + "grad_norm": 0.20019714534282684, + "learning_rate": 0.001, + "loss": 1.0867, + "step": 16580 + }, + { + "epoch": 0.5290347268726682, + "grad_norm": 0.2011161893606186, + "learning_rate": 0.001, + "loss": 1.096, + "step": 16590 + }, + { + "epoch": 0.529353614592302, + "grad_norm": 0.1947372555732727, + "learning_rate": 0.001, + "loss": 1.1014, + "step": 16600 + }, + { + "epoch": 0.529672502311936, + "grad_norm": 0.19452321529388428, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 16610 + }, + { + "epoch": 0.5299913900315699, + "grad_norm": 0.19821277260780334, + "learning_rate": 0.001, + "loss": 1.1306, + "step": 16620 + }, + { + "epoch": 0.5303102777512038, + "grad_norm": 0.1981065422296524, + "learning_rate": 0.001, + "loss": 1.1072, + "step": 16630 + }, + { + "epoch": 0.5306291654708377, + "grad_norm": 0.20271334052085876, + "learning_rate": 0.001, + "loss": 1.1066, + "step": 16640 + }, + { + "epoch": 0.5309480531904717, + "grad_norm": 0.20248804986476898, + "learning_rate": 0.001, + "loss": 1.1099, + "step": 16650 + }, + { + "epoch": 0.5312669409101055, + "grad_norm": 0.1929582953453064, + "learning_rate": 0.001, + "loss": 1.113, + "step": 16660 + }, + { + "epoch": 0.5315858286297395, + "grad_norm": 0.19067426025867462, + "learning_rate": 0.001, + "loss": 1.0873, + "step": 16670 + }, + { + "epoch": 0.5319047163493734, + "grad_norm": 0.2004399299621582, + "learning_rate": 0.001, + "loss": 1.1371, + "step": 16680 + }, + { + "epoch": 0.5322236040690073, + "grad_norm": 0.19235877692699432, + "learning_rate": 0.001, + "loss": 1.0983, + "step": 16690 + }, + { + "epoch": 0.5325424917886412, + "grad_norm": 0.19555066525936127, + "learning_rate": 0.001, + "loss": 1.0975, + "step": 16700 + }, + { + "epoch": 0.5328613795082752, + "grad_norm": 0.19806230068206787, + "learning_rate": 0.001, + "loss": 1.093, + "step": 16710 + }, + { + "epoch": 0.533180267227909, + "grad_norm": 0.20040126144886017, + "learning_rate": 0.001, + "loss": 1.1206, + "step": 16720 + }, + { + "epoch": 0.533499154947543, + "grad_norm": 0.21600055694580078, + "learning_rate": 0.001, + "loss": 1.11, + "step": 16730 + }, + { + "epoch": 0.5338180426671769, + "grad_norm": 0.19331254065036774, + "learning_rate": 0.001, + "loss": 1.1009, + "step": 16740 + }, + { + "epoch": 0.5341369303868108, + "grad_norm": 0.1930786818265915, + "learning_rate": 0.001, + "loss": 1.0901, + "step": 16750 + }, + { + "epoch": 0.5344558181064447, + "grad_norm": 0.19137626886367798, + "learning_rate": 0.001, + "loss": 1.1151, + "step": 16760 + }, + { + "epoch": 0.5347747058260787, + "grad_norm": 0.19468235969543457, + "learning_rate": 0.001, + "loss": 1.1088, + "step": 16770 + }, + { + "epoch": 0.5350935935457125, + "grad_norm": 0.20413386821746826, + "learning_rate": 0.001, + "loss": 1.1125, + "step": 16780 + }, + { + "epoch": 0.5354124812653465, + "grad_norm": 0.20103423297405243, + "learning_rate": 0.001, + "loss": 1.1109, + "step": 16790 + }, + { + "epoch": 0.5357313689849804, + "grad_norm": 0.19415734708309174, + "learning_rate": 0.001, + "loss": 1.1001, + "step": 16800 + }, + { + "epoch": 0.5360502567046143, + "grad_norm": 0.20362050831317902, + "learning_rate": 0.001, + "loss": 1.1277, + "step": 16810 + }, + { + "epoch": 0.5363691444242482, + "grad_norm": 0.1981305480003357, + "learning_rate": 0.001, + "loss": 1.106, + "step": 16820 + }, + { + "epoch": 0.5366880321438822, + "grad_norm": 0.19732020795345306, + "learning_rate": 0.001, + "loss": 1.1032, + "step": 16830 + }, + { + "epoch": 0.537006919863516, + "grad_norm": 0.2011566162109375, + "learning_rate": 0.001, + "loss": 1.1282, + "step": 16840 + }, + { + "epoch": 0.53732580758315, + "grad_norm": 0.19841685891151428, + "learning_rate": 0.001, + "loss": 1.1094, + "step": 16850 + }, + { + "epoch": 0.5376446953027839, + "grad_norm": 0.1937408298254013, + "learning_rate": 0.001, + "loss": 1.1098, + "step": 16860 + }, + { + "epoch": 0.5379635830224178, + "grad_norm": 0.20316290855407715, + "learning_rate": 0.001, + "loss": 1.1002, + "step": 16870 + }, + { + "epoch": 0.5382824707420517, + "grad_norm": 0.1968275010585785, + "learning_rate": 0.001, + "loss": 1.0835, + "step": 16880 + }, + { + "epoch": 0.5386013584616857, + "grad_norm": 0.1885264664888382, + "learning_rate": 0.001, + "loss": 1.114, + "step": 16890 + }, + { + "epoch": 0.5389202461813195, + "grad_norm": 0.19835618138313293, + "learning_rate": 0.001, + "loss": 1.1116, + "step": 16900 + }, + { + "epoch": 0.5392391339009535, + "grad_norm": 0.20179498195648193, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 16910 + }, + { + "epoch": 0.5395580216205874, + "grad_norm": 0.20029352605342865, + "learning_rate": 0.001, + "loss": 1.1197, + "step": 16920 + }, + { + "epoch": 0.5398769093402213, + "grad_norm": 0.19566185772418976, + "learning_rate": 0.001, + "loss": 1.105, + "step": 16930 + }, + { + "epoch": 0.5401957970598552, + "grad_norm": 0.19709070026874542, + "learning_rate": 0.001, + "loss": 1.1057, + "step": 16940 + }, + { + "epoch": 0.5405146847794892, + "grad_norm": 0.20236507058143616, + "learning_rate": 0.001, + "loss": 1.0964, + "step": 16950 + }, + { + "epoch": 0.540833572499123, + "grad_norm": 0.2061866670846939, + "learning_rate": 0.001, + "loss": 1.104, + "step": 16960 + }, + { + "epoch": 0.541152460218757, + "grad_norm": 0.1935662180185318, + "learning_rate": 0.001, + "loss": 1.1032, + "step": 16970 + }, + { + "epoch": 0.5414713479383909, + "grad_norm": 0.19856195151805878, + "learning_rate": 0.001, + "loss": 1.0977, + "step": 16980 + }, + { + "epoch": 0.5417902356580248, + "grad_norm": 0.19449791312217712, + "learning_rate": 0.001, + "loss": 1.1232, + "step": 16990 + }, + { + "epoch": 0.5421091233776587, + "grad_norm": 0.18957769870758057, + "learning_rate": 0.001, + "loss": 1.0844, + "step": 17000 + }, + { + "epoch": 0.5424280110972927, + "grad_norm": 0.19556500017642975, + "learning_rate": 0.001, + "loss": 1.0877, + "step": 17010 + }, + { + "epoch": 0.5427468988169266, + "grad_norm": 0.19305263459682465, + "learning_rate": 0.001, + "loss": 1.0766, + "step": 17020 + }, + { + "epoch": 0.5430657865365605, + "grad_norm": 0.20026710629463196, + "learning_rate": 0.001, + "loss": 1.1082, + "step": 17030 + }, + { + "epoch": 0.5433846742561944, + "grad_norm": 0.19406114518642426, + "learning_rate": 0.001, + "loss": 1.0823, + "step": 17040 + }, + { + "epoch": 0.5437035619758284, + "grad_norm": 0.1891794353723526, + "learning_rate": 0.001, + "loss": 1.0884, + "step": 17050 + }, + { + "epoch": 0.5440224496954622, + "grad_norm": 0.20411112904548645, + "learning_rate": 0.001, + "loss": 1.1138, + "step": 17060 + }, + { + "epoch": 0.5443413374150962, + "grad_norm": 0.19278182089328766, + "learning_rate": 0.001, + "loss": 1.1148, + "step": 17070 + }, + { + "epoch": 0.5446602251347301, + "grad_norm": 0.1992575079202652, + "learning_rate": 0.001, + "loss": 1.1188, + "step": 17080 + }, + { + "epoch": 0.544979112854364, + "grad_norm": 0.21864908933639526, + "learning_rate": 0.001, + "loss": 1.1079, + "step": 17090 + }, + { + "epoch": 0.5452980005739979, + "grad_norm": 0.1993020623922348, + "learning_rate": 0.001, + "loss": 1.0993, + "step": 17100 + }, + { + "epoch": 0.5456168882936319, + "grad_norm": 0.19873854517936707, + "learning_rate": 0.001, + "loss": 1.1108, + "step": 17110 + }, + { + "epoch": 0.5459357760132657, + "grad_norm": 0.200069859623909, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 17120 + }, + { + "epoch": 0.5462546637328997, + "grad_norm": 0.19639627635478973, + "learning_rate": 0.001, + "loss": 1.1054, + "step": 17130 + }, + { + "epoch": 0.5465735514525336, + "grad_norm": 0.19923728704452515, + "learning_rate": 0.001, + "loss": 1.0938, + "step": 17140 + }, + { + "epoch": 0.5468924391721675, + "grad_norm": 0.20334571599960327, + "learning_rate": 0.001, + "loss": 1.1029, + "step": 17150 + }, + { + "epoch": 0.5472113268918014, + "grad_norm": 0.19749508798122406, + "learning_rate": 0.001, + "loss": 1.0991, + "step": 17160 + }, + { + "epoch": 0.5475302146114354, + "grad_norm": 0.18736779689788818, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 17170 + }, + { + "epoch": 0.5478491023310692, + "grad_norm": 0.20096038281917572, + "learning_rate": 0.001, + "loss": 1.1176, + "step": 17180 + }, + { + "epoch": 0.5481679900507032, + "grad_norm": 0.20153848826885223, + "learning_rate": 0.001, + "loss": 1.0939, + "step": 17190 + }, + { + "epoch": 0.5484868777703371, + "grad_norm": 0.19858387112617493, + "learning_rate": 0.001, + "loss": 1.11, + "step": 17200 + }, + { + "epoch": 0.548805765489971, + "grad_norm": 0.196171373128891, + "learning_rate": 0.001, + "loss": 1.1042, + "step": 17210 + }, + { + "epoch": 0.5491246532096049, + "grad_norm": 0.19899526238441467, + "learning_rate": 0.001, + "loss": 1.093, + "step": 17220 + }, + { + "epoch": 0.5494435409292389, + "grad_norm": 0.19485172629356384, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 17230 + }, + { + "epoch": 0.5497624286488727, + "grad_norm": 0.1970384567975998, + "learning_rate": 0.001, + "loss": 1.1095, + "step": 17240 + }, + { + "epoch": 0.5500813163685067, + "grad_norm": 0.19277705252170563, + "learning_rate": 0.001, + "loss": 1.1061, + "step": 17250 + }, + { + "epoch": 0.5504002040881406, + "grad_norm": 0.204323410987854, + "learning_rate": 0.001, + "loss": 1.1005, + "step": 17260 + }, + { + "epoch": 0.5507190918077745, + "grad_norm": 0.19751951098442078, + "learning_rate": 0.001, + "loss": 1.1238, + "step": 17270 + }, + { + "epoch": 0.5510379795274084, + "grad_norm": 0.20313957333564758, + "learning_rate": 0.001, + "loss": 1.0872, + "step": 17280 + }, + { + "epoch": 0.5513568672470424, + "grad_norm": 0.2005039006471634, + "learning_rate": 0.001, + "loss": 1.1231, + "step": 17290 + }, + { + "epoch": 0.5516757549666762, + "grad_norm": 0.19764991104602814, + "learning_rate": 0.001, + "loss": 1.0972, + "step": 17300 + }, + { + "epoch": 0.5519946426863102, + "grad_norm": 0.19491420686244965, + "learning_rate": 0.001, + "loss": 1.1061, + "step": 17310 + }, + { + "epoch": 0.5523135304059441, + "grad_norm": 0.19485335052013397, + "learning_rate": 0.001, + "loss": 1.1027, + "step": 17320 + }, + { + "epoch": 0.552632418125578, + "grad_norm": 0.1993619054555893, + "learning_rate": 0.001, + "loss": 1.0933, + "step": 17330 + }, + { + "epoch": 0.5529513058452119, + "grad_norm": 0.19590185582637787, + "learning_rate": 0.001, + "loss": 1.1032, + "step": 17340 + }, + { + "epoch": 0.5532701935648459, + "grad_norm": 0.19090668857097626, + "learning_rate": 0.001, + "loss": 1.1033, + "step": 17350 + }, + { + "epoch": 0.5535890812844797, + "grad_norm": 0.1968427151441574, + "learning_rate": 0.001, + "loss": 1.0788, + "step": 17360 + }, + { + "epoch": 0.5539079690041137, + "grad_norm": 0.19366152584552765, + "learning_rate": 0.001, + "loss": 1.0949, + "step": 17370 + }, + { + "epoch": 0.5542268567237476, + "grad_norm": 0.20130929350852966, + "learning_rate": 0.001, + "loss": 1.1111, + "step": 17380 + }, + { + "epoch": 0.5545457444433814, + "grad_norm": 0.2025395929813385, + "learning_rate": 0.001, + "loss": 1.101, + "step": 17390 + }, + { + "epoch": 0.5548646321630154, + "grad_norm": 0.19849498569965363, + "learning_rate": 0.001, + "loss": 1.127, + "step": 17400 + }, + { + "epoch": 0.5551835198826494, + "grad_norm": 0.20587202906608582, + "learning_rate": 0.001, + "loss": 1.1102, + "step": 17410 + }, + { + "epoch": 0.5555024076022832, + "grad_norm": 0.19566045701503754, + "learning_rate": 0.001, + "loss": 1.0888, + "step": 17420 + }, + { + "epoch": 0.5558212953219172, + "grad_norm": 0.2037980705499649, + "learning_rate": 0.001, + "loss": 1.1007, + "step": 17430 + }, + { + "epoch": 0.5561401830415511, + "grad_norm": 0.20354115962982178, + "learning_rate": 0.001, + "loss": 1.1055, + "step": 17440 + }, + { + "epoch": 0.556459070761185, + "grad_norm": 0.19119271636009216, + "learning_rate": 0.001, + "loss": 1.1044, + "step": 17450 + }, + { + "epoch": 0.5567779584808189, + "grad_norm": 0.19770164787769318, + "learning_rate": 0.001, + "loss": 1.106, + "step": 17460 + }, + { + "epoch": 0.5570968462004529, + "grad_norm": 0.19170503318309784, + "learning_rate": 0.001, + "loss": 1.109, + "step": 17470 + }, + { + "epoch": 0.5574157339200867, + "grad_norm": 0.19421005249023438, + "learning_rate": 0.001, + "loss": 1.083, + "step": 17480 + }, + { + "epoch": 0.5577346216397207, + "grad_norm": 0.1924920678138733, + "learning_rate": 0.001, + "loss": 1.109, + "step": 17490 + }, + { + "epoch": 0.5580535093593546, + "grad_norm": 0.1946050226688385, + "learning_rate": 0.001, + "loss": 1.1043, + "step": 17500 + }, + { + "epoch": 0.5583723970789884, + "grad_norm": 0.20246347784996033, + "learning_rate": 0.001, + "loss": 1.1085, + "step": 17510 + }, + { + "epoch": 0.5586912847986224, + "grad_norm": 0.20876635611057281, + "learning_rate": 0.001, + "loss": 1.1011, + "step": 17520 + }, + { + "epoch": 0.5590101725182564, + "grad_norm": 0.2076573222875595, + "learning_rate": 0.001, + "loss": 1.0885, + "step": 17530 + }, + { + "epoch": 0.5593290602378902, + "grad_norm": 0.1925816386938095, + "learning_rate": 0.001, + "loss": 1.0931, + "step": 17540 + }, + { + "epoch": 0.5596479479575242, + "grad_norm": 0.1946088969707489, + "learning_rate": 0.001, + "loss": 1.1242, + "step": 17550 + }, + { + "epoch": 0.5599668356771581, + "grad_norm": 0.2025405466556549, + "learning_rate": 0.001, + "loss": 1.0851, + "step": 17560 + }, + { + "epoch": 0.560285723396792, + "grad_norm": 0.19911693036556244, + "learning_rate": 0.001, + "loss": 1.1009, + "step": 17570 + }, + { + "epoch": 0.5606046111164259, + "grad_norm": 0.2055385559797287, + "learning_rate": 0.001, + "loss": 1.1024, + "step": 17580 + }, + { + "epoch": 0.5609234988360599, + "grad_norm": 0.20296630263328552, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 17590 + }, + { + "epoch": 0.5612423865556937, + "grad_norm": 0.19320446252822876, + "learning_rate": 0.001, + "loss": 1.0961, + "step": 17600 + }, + { + "epoch": 0.5615612742753276, + "grad_norm": 0.1940065175294876, + "learning_rate": 0.001, + "loss": 1.1012, + "step": 17610 + }, + { + "epoch": 0.5618801619949616, + "grad_norm": 0.1887366771697998, + "learning_rate": 0.001, + "loss": 1.0996, + "step": 17620 + }, + { + "epoch": 0.5621990497145954, + "grad_norm": 0.19770963490009308, + "learning_rate": 0.001, + "loss": 1.1049, + "step": 17630 + }, + { + "epoch": 0.5625179374342294, + "grad_norm": 0.20164230465888977, + "learning_rate": 0.001, + "loss": 1.0906, + "step": 17640 + }, + { + "epoch": 0.5628368251538634, + "grad_norm": 0.19402539730072021, + "learning_rate": 0.001, + "loss": 1.0791, + "step": 17650 + }, + { + "epoch": 0.5631557128734972, + "grad_norm": 0.20351935923099518, + "learning_rate": 0.001, + "loss": 1.1006, + "step": 17660 + }, + { + "epoch": 0.5634746005931311, + "grad_norm": 0.2015489935874939, + "learning_rate": 0.001, + "loss": 1.098, + "step": 17670 + }, + { + "epoch": 0.5637934883127651, + "grad_norm": 0.19619329273700714, + "learning_rate": 0.001, + "loss": 1.0958, + "step": 17680 + }, + { + "epoch": 0.5641123760323989, + "grad_norm": 0.20024628937244415, + "learning_rate": 0.001, + "loss": 1.1097, + "step": 17690 + }, + { + "epoch": 0.5644312637520329, + "grad_norm": 0.19571398198604584, + "learning_rate": 0.001, + "loss": 1.0991, + "step": 17700 + }, + { + "epoch": 0.5647501514716669, + "grad_norm": 0.19286087155342102, + "learning_rate": 0.001, + "loss": 1.1053, + "step": 17710 + }, + { + "epoch": 0.5650690391913007, + "grad_norm": 0.19929246604442596, + "learning_rate": 0.001, + "loss": 1.0978, + "step": 17720 + }, + { + "epoch": 0.5653879269109346, + "grad_norm": 0.19413244724273682, + "learning_rate": 0.001, + "loss": 1.0784, + "step": 17730 + }, + { + "epoch": 0.5657068146305686, + "grad_norm": 0.1949404627084732, + "learning_rate": 0.001, + "loss": 1.0973, + "step": 17740 + }, + { + "epoch": 0.5660257023502024, + "grad_norm": 0.1955089569091797, + "learning_rate": 0.001, + "loss": 1.0846, + "step": 17750 + }, + { + "epoch": 0.5663445900698364, + "grad_norm": 0.2049809694290161, + "learning_rate": 0.001, + "loss": 1.0986, + "step": 17760 + }, + { + "epoch": 0.5666634777894703, + "grad_norm": 0.19769684970378876, + "learning_rate": 0.001, + "loss": 1.0903, + "step": 17770 + }, + { + "epoch": 0.5669823655091042, + "grad_norm": 0.2010537087917328, + "learning_rate": 0.001, + "loss": 1.0989, + "step": 17780 + }, + { + "epoch": 0.5673012532287381, + "grad_norm": 0.19881124794483185, + "learning_rate": 0.001, + "loss": 1.1099, + "step": 17790 + }, + { + "epoch": 0.5676201409483721, + "grad_norm": 0.18793614208698273, + "learning_rate": 0.001, + "loss": 1.0782, + "step": 17800 + }, + { + "epoch": 0.5679390286680059, + "grad_norm": 0.20160049200057983, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 17810 + }, + { + "epoch": 0.5682579163876399, + "grad_norm": 0.20136494934558868, + "learning_rate": 0.001, + "loss": 1.1001, + "step": 17820 + }, + { + "epoch": 0.5685768041072738, + "grad_norm": 0.18992014229297638, + "learning_rate": 0.001, + "loss": 1.0898, + "step": 17830 + }, + { + "epoch": 0.5688956918269078, + "grad_norm": 0.20103414356708527, + "learning_rate": 0.001, + "loss": 1.0877, + "step": 17840 + }, + { + "epoch": 0.5692145795465416, + "grad_norm": 0.19369281828403473, + "learning_rate": 0.001, + "loss": 1.0928, + "step": 17850 + }, + { + "epoch": 0.5695334672661756, + "grad_norm": 0.19698695838451385, + "learning_rate": 0.001, + "loss": 1.0951, + "step": 17860 + }, + { + "epoch": 0.5698523549858096, + "grad_norm": 0.1910889446735382, + "learning_rate": 0.001, + "loss": 1.0902, + "step": 17870 + }, + { + "epoch": 0.5701712427054434, + "grad_norm": 0.2007639855146408, + "learning_rate": 0.001, + "loss": 1.101, + "step": 17880 + }, + { + "epoch": 0.5704901304250773, + "grad_norm": 0.19711415469646454, + "learning_rate": 0.001, + "loss": 1.0923, + "step": 17890 + }, + { + "epoch": 0.5708090181447113, + "grad_norm": 0.20394960045814514, + "learning_rate": 0.001, + "loss": 1.0962, + "step": 17900 + }, + { + "epoch": 0.5711279058643451, + "grad_norm": 0.19199271500110626, + "learning_rate": 0.001, + "loss": 1.0959, + "step": 17910 + }, + { + "epoch": 0.5714467935839791, + "grad_norm": 0.19167867302894592, + "learning_rate": 0.001, + "loss": 1.0927, + "step": 17920 + }, + { + "epoch": 0.571765681303613, + "grad_norm": 0.19008177518844604, + "learning_rate": 0.001, + "loss": 1.0966, + "step": 17930 + }, + { + "epoch": 0.5720845690232469, + "grad_norm": 0.19732196629047394, + "learning_rate": 0.001, + "loss": 1.1091, + "step": 17940 + }, + { + "epoch": 0.5724034567428808, + "grad_norm": 0.20157304406166077, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 17950 + }, + { + "epoch": 0.5727223444625148, + "grad_norm": 0.199776753783226, + "learning_rate": 0.001, + "loss": 1.0737, + "step": 17960 + }, + { + "epoch": 0.5730412321821486, + "grad_norm": 0.18558533489704132, + "learning_rate": 0.001, + "loss": 1.0909, + "step": 17970 + }, + { + "epoch": 0.5733601199017826, + "grad_norm": 0.20540328323841095, + "learning_rate": 0.001, + "loss": 1.1023, + "step": 17980 + }, + { + "epoch": 0.5736790076214165, + "grad_norm": 0.1917695701122284, + "learning_rate": 0.001, + "loss": 1.11, + "step": 17990 + }, + { + "epoch": 0.5739978953410504, + "grad_norm": 0.20183104276657104, + "learning_rate": 0.001, + "loss": 1.0902, + "step": 18000 + }, + { + "epoch": 0.5743167830606843, + "grad_norm": 0.19891691207885742, + "learning_rate": 0.001, + "loss": 1.0862, + "step": 18010 + }, + { + "epoch": 0.5746356707803183, + "grad_norm": 0.20543940365314484, + "learning_rate": 0.001, + "loss": 1.0992, + "step": 18020 + }, + { + "epoch": 0.5749545584999521, + "grad_norm": 0.19975398480892181, + "learning_rate": 0.001, + "loss": 1.1015, + "step": 18030 + }, + { + "epoch": 0.5752734462195861, + "grad_norm": 0.18917524814605713, + "learning_rate": 0.001, + "loss": 1.0802, + "step": 18040 + }, + { + "epoch": 0.57559233393922, + "grad_norm": 0.1999102383852005, + "learning_rate": 0.001, + "loss": 1.0929, + "step": 18050 + }, + { + "epoch": 0.5759112216588539, + "grad_norm": 0.20146946609020233, + "learning_rate": 0.001, + "loss": 1.0856, + "step": 18060 + }, + { + "epoch": 0.5762301093784878, + "grad_norm": 0.18728485703468323, + "learning_rate": 0.001, + "loss": 1.0954, + "step": 18070 + }, + { + "epoch": 0.5765489970981218, + "grad_norm": 0.1965542733669281, + "learning_rate": 0.001, + "loss": 1.0916, + "step": 18080 + }, + { + "epoch": 0.5768678848177556, + "grad_norm": 0.18837697803974152, + "learning_rate": 0.001, + "loss": 1.1121, + "step": 18090 + }, + { + "epoch": 0.5771867725373896, + "grad_norm": 0.20012810826301575, + "learning_rate": 0.001, + "loss": 1.1063, + "step": 18100 + }, + { + "epoch": 0.5775056602570235, + "grad_norm": 0.19055570662021637, + "learning_rate": 0.001, + "loss": 1.0923, + "step": 18110 + }, + { + "epoch": 0.5778245479766574, + "grad_norm": 0.20348024368286133, + "learning_rate": 0.001, + "loss": 1.1026, + "step": 18120 + }, + { + "epoch": 0.5781434356962913, + "grad_norm": 0.1999468058347702, + "learning_rate": 0.001, + "loss": 1.0778, + "step": 18130 + }, + { + "epoch": 0.5784623234159253, + "grad_norm": 0.19548684358596802, + "learning_rate": 0.001, + "loss": 1.0973, + "step": 18140 + }, + { + "epoch": 0.5787812111355591, + "grad_norm": 0.19422020018100739, + "learning_rate": 0.001, + "loss": 1.0645, + "step": 18150 + }, + { + "epoch": 0.5791000988551931, + "grad_norm": 0.19890695810317993, + "learning_rate": 0.001, + "loss": 1.0852, + "step": 18160 + }, + { + "epoch": 0.579418986574827, + "grad_norm": 0.19380579888820648, + "learning_rate": 0.001, + "loss": 1.0773, + "step": 18170 + }, + { + "epoch": 0.5797378742944609, + "grad_norm": 0.18737071752548218, + "learning_rate": 0.001, + "loss": 1.1016, + "step": 18180 + }, + { + "epoch": 0.5800567620140948, + "grad_norm": 0.18995201587677002, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 18190 + }, + { + "epoch": 0.5803756497337288, + "grad_norm": 0.19956058263778687, + "learning_rate": 0.001, + "loss": 1.1032, + "step": 18200 + }, + { + "epoch": 0.5806945374533626, + "grad_norm": 0.1915549784898758, + "learning_rate": 0.001, + "loss": 1.0938, + "step": 18210 + }, + { + "epoch": 0.5810134251729966, + "grad_norm": 0.19854103028774261, + "learning_rate": 0.001, + "loss": 1.0981, + "step": 18220 + }, + { + "epoch": 0.5813323128926305, + "grad_norm": 0.19441866874694824, + "learning_rate": 0.001, + "loss": 1.1146, + "step": 18230 + }, + { + "epoch": 0.5816512006122644, + "grad_norm": 0.1921454817056656, + "learning_rate": 0.001, + "loss": 1.0873, + "step": 18240 + }, + { + "epoch": 0.5819700883318983, + "grad_norm": 0.19195596873760223, + "learning_rate": 0.001, + "loss": 1.0994, + "step": 18250 + }, + { + "epoch": 0.5822889760515323, + "grad_norm": 0.1962839663028717, + "learning_rate": 0.001, + "loss": 1.0879, + "step": 18260 + }, + { + "epoch": 0.5826078637711661, + "grad_norm": 0.2027946412563324, + "learning_rate": 0.001, + "loss": 1.0964, + "step": 18270 + }, + { + "epoch": 0.5829267514908001, + "grad_norm": 0.19965171813964844, + "learning_rate": 0.001, + "loss": 1.0807, + "step": 18280 + }, + { + "epoch": 0.583245639210434, + "grad_norm": 0.19153305888175964, + "learning_rate": 0.001, + "loss": 1.0965, + "step": 18290 + }, + { + "epoch": 0.5835645269300679, + "grad_norm": 0.20132271945476532, + "learning_rate": 0.001, + "loss": 1.0881, + "step": 18300 + }, + { + "epoch": 0.5838834146497018, + "grad_norm": 0.19851796329021454, + "learning_rate": 0.001, + "loss": 1.0796, + "step": 18310 + }, + { + "epoch": 0.5842023023693358, + "grad_norm": 0.1932992786169052, + "learning_rate": 0.001, + "loss": 1.0941, + "step": 18320 + }, + { + "epoch": 0.5845211900889696, + "grad_norm": 0.20372772216796875, + "learning_rate": 0.001, + "loss": 1.097, + "step": 18330 + }, + { + "epoch": 0.5848400778086036, + "grad_norm": 0.196427583694458, + "learning_rate": 0.001, + "loss": 1.0864, + "step": 18340 + }, + { + "epoch": 0.5851589655282375, + "grad_norm": 0.19252528250217438, + "learning_rate": 0.001, + "loss": 1.098, + "step": 18350 + }, + { + "epoch": 0.5854778532478714, + "grad_norm": 0.19300012290477753, + "learning_rate": 0.001, + "loss": 1.1109, + "step": 18360 + }, + { + "epoch": 0.5857967409675053, + "grad_norm": 0.19377656280994415, + "learning_rate": 0.001, + "loss": 1.111, + "step": 18370 + }, + { + "epoch": 0.5861156286871393, + "grad_norm": 0.2068767100572586, + "learning_rate": 0.001, + "loss": 1.1132, + "step": 18380 + }, + { + "epoch": 0.5864345164067731, + "grad_norm": 0.20298132300376892, + "learning_rate": 0.001, + "loss": 1.0819, + "step": 18390 + }, + { + "epoch": 0.5867534041264071, + "grad_norm": 0.19519521296024323, + "learning_rate": 0.001, + "loss": 1.0843, + "step": 18400 + }, + { + "epoch": 0.587072291846041, + "grad_norm": 0.19534125924110413, + "learning_rate": 0.001, + "loss": 1.0902, + "step": 18410 + }, + { + "epoch": 0.5873911795656749, + "grad_norm": 0.20693759620189667, + "learning_rate": 0.001, + "loss": 1.1059, + "step": 18420 + }, + { + "epoch": 0.5877100672853088, + "grad_norm": 0.20042476058006287, + "learning_rate": 0.001, + "loss": 1.1061, + "step": 18430 + }, + { + "epoch": 0.5880289550049428, + "grad_norm": 0.20221875607967377, + "learning_rate": 0.001, + "loss": 1.0991, + "step": 18440 + }, + { + "epoch": 0.5883478427245766, + "grad_norm": 0.19484549760818481, + "learning_rate": 0.001, + "loss": 1.0965, + "step": 18450 + }, + { + "epoch": 0.5886667304442106, + "grad_norm": 0.19391950964927673, + "learning_rate": 0.001, + "loss": 1.0754, + "step": 18460 + }, + { + "epoch": 0.5889856181638445, + "grad_norm": 0.19615234434604645, + "learning_rate": 0.001, + "loss": 1.085, + "step": 18470 + }, + { + "epoch": 0.5893045058834784, + "grad_norm": 0.18772046267986298, + "learning_rate": 0.001, + "loss": 1.1002, + "step": 18480 + }, + { + "epoch": 0.5896233936031123, + "grad_norm": 0.19404157996177673, + "learning_rate": 0.001, + "loss": 1.0865, + "step": 18490 + }, + { + "epoch": 0.5899422813227463, + "grad_norm": 0.19353818893432617, + "learning_rate": 0.001, + "loss": 1.0946, + "step": 18500 + }, + { + "epoch": 0.5902611690423801, + "grad_norm": 0.20134851336479187, + "learning_rate": 0.001, + "loss": 1.0916, + "step": 18510 + }, + { + "epoch": 0.5905800567620141, + "grad_norm": 0.20002439618110657, + "learning_rate": 0.001, + "loss": 1.1061, + "step": 18520 + }, + { + "epoch": 0.590898944481648, + "grad_norm": 0.20346671342849731, + "learning_rate": 0.001, + "loss": 1.0839, + "step": 18530 + }, + { + "epoch": 0.5912178322012819, + "grad_norm": 0.19714142382144928, + "learning_rate": 0.001, + "loss": 1.0961, + "step": 18540 + }, + { + "epoch": 0.5915367199209158, + "grad_norm": 0.19142982363700867, + "learning_rate": 0.001, + "loss": 1.0959, + "step": 18550 + }, + { + "epoch": 0.5918556076405498, + "grad_norm": 0.2000853419303894, + "learning_rate": 0.001, + "loss": 1.1113, + "step": 18560 + }, + { + "epoch": 0.5921744953601836, + "grad_norm": 0.19660703837871552, + "learning_rate": 0.001, + "loss": 1.0753, + "step": 18570 + }, + { + "epoch": 0.5924933830798176, + "grad_norm": 0.20565517246723175, + "learning_rate": 0.001, + "loss": 1.0834, + "step": 18580 + }, + { + "epoch": 0.5928122707994515, + "grad_norm": 0.1943969577550888, + "learning_rate": 0.001, + "loss": 1.0956, + "step": 18590 + }, + { + "epoch": 0.5931311585190854, + "grad_norm": 0.19117647409439087, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 18600 + }, + { + "epoch": 0.5934500462387193, + "grad_norm": 0.19650551676750183, + "learning_rate": 0.001, + "loss": 1.1022, + "step": 18610 + }, + { + "epoch": 0.5937689339583533, + "grad_norm": 0.1948203593492508, + "learning_rate": 0.001, + "loss": 1.0972, + "step": 18620 + }, + { + "epoch": 0.5940878216779871, + "grad_norm": 0.20211590826511383, + "learning_rate": 0.001, + "loss": 1.0944, + "step": 18630 + }, + { + "epoch": 0.5944067093976211, + "grad_norm": 0.19122843444347382, + "learning_rate": 0.001, + "loss": 1.0911, + "step": 18640 + }, + { + "epoch": 0.594725597117255, + "grad_norm": 0.18877461552619934, + "learning_rate": 0.001, + "loss": 1.0961, + "step": 18650 + }, + { + "epoch": 0.5950444848368889, + "grad_norm": 0.19857579469680786, + "learning_rate": 0.001, + "loss": 1.0676, + "step": 18660 + }, + { + "epoch": 0.5953633725565228, + "grad_norm": 0.19899848103523254, + "learning_rate": 0.001, + "loss": 1.0845, + "step": 18670 + }, + { + "epoch": 0.5956822602761568, + "grad_norm": 0.19641920924186707, + "learning_rate": 0.001, + "loss": 1.0935, + "step": 18680 + }, + { + "epoch": 0.5960011479957907, + "grad_norm": 0.1865895688533783, + "learning_rate": 0.001, + "loss": 1.0864, + "step": 18690 + }, + { + "epoch": 0.5963200357154246, + "grad_norm": 0.19067290425300598, + "learning_rate": 0.001, + "loss": 1.0751, + "step": 18700 + }, + { + "epoch": 0.5966389234350585, + "grad_norm": 0.19792988896369934, + "learning_rate": 0.001, + "loss": 1.1009, + "step": 18710 + }, + { + "epoch": 0.5969578111546925, + "grad_norm": 0.1916930228471756, + "learning_rate": 0.001, + "loss": 1.0919, + "step": 18720 + }, + { + "epoch": 0.5972766988743263, + "grad_norm": 0.19243399798870087, + "learning_rate": 0.001, + "loss": 1.0917, + "step": 18730 + }, + { + "epoch": 0.5975955865939603, + "grad_norm": 0.1880219578742981, + "learning_rate": 0.001, + "loss": 1.1085, + "step": 18740 + }, + { + "epoch": 0.5979144743135942, + "grad_norm": 0.19094349443912506, + "learning_rate": 0.001, + "loss": 1.0787, + "step": 18750 + }, + { + "epoch": 0.5982333620332281, + "grad_norm": 0.19258365035057068, + "learning_rate": 0.001, + "loss": 1.0958, + "step": 18760 + }, + { + "epoch": 0.598552249752862, + "grad_norm": 0.19618827104568481, + "learning_rate": 0.001, + "loss": 1.0911, + "step": 18770 + }, + { + "epoch": 0.598871137472496, + "grad_norm": 0.1982206553220749, + "learning_rate": 0.001, + "loss": 1.0841, + "step": 18780 + }, + { + "epoch": 0.5991900251921298, + "grad_norm": 0.20072801411151886, + "learning_rate": 0.001, + "loss": 1.1213, + "step": 18790 + }, + { + "epoch": 0.5995089129117638, + "grad_norm": 0.1929350197315216, + "learning_rate": 0.001, + "loss": 1.0577, + "step": 18800 + }, + { + "epoch": 0.5998278006313977, + "grad_norm": 0.19256435334682465, + "learning_rate": 0.001, + "loss": 1.0812, + "step": 18810 + }, + { + "epoch": 0.6001466883510316, + "grad_norm": 0.19631852209568024, + "learning_rate": 0.001, + "loss": 1.0999, + "step": 18820 + }, + { + "epoch": 0.6004655760706655, + "grad_norm": 0.19517488777637482, + "learning_rate": 0.001, + "loss": 1.0916, + "step": 18830 + }, + { + "epoch": 0.6007844637902995, + "grad_norm": 0.19708159565925598, + "learning_rate": 0.001, + "loss": 1.0882, + "step": 18840 + }, + { + "epoch": 0.6011033515099333, + "grad_norm": 0.1962396651506424, + "learning_rate": 0.001, + "loss": 1.1258, + "step": 18850 + }, + { + "epoch": 0.6014222392295673, + "grad_norm": 0.19821253418922424, + "learning_rate": 0.001, + "loss": 1.0862, + "step": 18860 + }, + { + "epoch": 0.6017411269492012, + "grad_norm": 0.1993333101272583, + "learning_rate": 0.001, + "loss": 1.0901, + "step": 18870 + }, + { + "epoch": 0.6020600146688351, + "grad_norm": 0.19593703746795654, + "learning_rate": 0.001, + "loss": 1.0747, + "step": 18880 + }, + { + "epoch": 0.602378902388469, + "grad_norm": 0.18685583770275116, + "learning_rate": 0.001, + "loss": 1.0908, + "step": 18890 + }, + { + "epoch": 0.602697790108103, + "grad_norm": 0.19480600953102112, + "learning_rate": 0.001, + "loss": 1.0918, + "step": 18900 + }, + { + "epoch": 0.6030166778277368, + "grad_norm": 0.19692644476890564, + "learning_rate": 0.001, + "loss": 1.0936, + "step": 18910 + }, + { + "epoch": 0.6033355655473708, + "grad_norm": 0.19970379769802094, + "learning_rate": 0.001, + "loss": 1.0996, + "step": 18920 + }, + { + "epoch": 0.6036544532670047, + "grad_norm": 0.1902437061071396, + "learning_rate": 0.001, + "loss": 1.0695, + "step": 18930 + }, + { + "epoch": 0.6039733409866386, + "grad_norm": 0.20170606672763824, + "learning_rate": 0.001, + "loss": 1.0699, + "step": 18940 + }, + { + "epoch": 0.6042922287062725, + "grad_norm": 0.19433560967445374, + "learning_rate": 0.001, + "loss": 1.0994, + "step": 18950 + }, + { + "epoch": 0.6046111164259065, + "grad_norm": 0.19772832095623016, + "learning_rate": 0.001, + "loss": 1.0976, + "step": 18960 + }, + { + "epoch": 0.6049300041455403, + "grad_norm": 0.2006259709596634, + "learning_rate": 0.001, + "loss": 1.0949, + "step": 18970 + }, + { + "epoch": 0.6052488918651743, + "grad_norm": 0.2032744139432907, + "learning_rate": 0.001, + "loss": 1.0961, + "step": 18980 + }, + { + "epoch": 0.6055677795848082, + "grad_norm": 0.195106640458107, + "learning_rate": 0.001, + "loss": 1.0955, + "step": 18990 + }, + { + "epoch": 0.6058866673044421, + "grad_norm": 0.19616737961769104, + "learning_rate": 0.001, + "loss": 1.0871, + "step": 19000 + }, + { + "epoch": 0.606205555024076, + "grad_norm": 0.2025897055864334, + "learning_rate": 0.001, + "loss": 1.0822, + "step": 19010 + }, + { + "epoch": 0.60652444274371, + "grad_norm": 0.19735278189182281, + "learning_rate": 0.001, + "loss": 1.1055, + "step": 19020 + }, + { + "epoch": 0.6068433304633438, + "grad_norm": 0.18668997287750244, + "learning_rate": 0.001, + "loss": 1.088, + "step": 19030 + }, + { + "epoch": 0.6071622181829778, + "grad_norm": 0.201369971036911, + "learning_rate": 0.001, + "loss": 1.0767, + "step": 19040 + }, + { + "epoch": 0.6074811059026117, + "grad_norm": 0.19418613612651825, + "learning_rate": 0.001, + "loss": 1.0814, + "step": 19050 + }, + { + "epoch": 0.6077999936222456, + "grad_norm": 0.18868911266326904, + "learning_rate": 0.001, + "loss": 1.0913, + "step": 19060 + }, + { + "epoch": 0.6081188813418795, + "grad_norm": 0.19358381628990173, + "learning_rate": 0.001, + "loss": 1.0936, + "step": 19070 + }, + { + "epoch": 0.6084377690615135, + "grad_norm": 0.19428423047065735, + "learning_rate": 0.001, + "loss": 1.0861, + "step": 19080 + }, + { + "epoch": 0.6087566567811473, + "grad_norm": 0.20530322194099426, + "learning_rate": 0.001, + "loss": 1.068, + "step": 19090 + }, + { + "epoch": 0.6090755445007813, + "grad_norm": 0.19999028742313385, + "learning_rate": 0.001, + "loss": 1.0968, + "step": 19100 + }, + { + "epoch": 0.6093944322204152, + "grad_norm": 0.21033933758735657, + "learning_rate": 0.001, + "loss": 1.0697, + "step": 19110 + }, + { + "epoch": 0.6097133199400491, + "grad_norm": 0.20325277745723724, + "learning_rate": 0.001, + "loss": 1.0923, + "step": 19120 + }, + { + "epoch": 0.610032207659683, + "grad_norm": 0.19787181913852692, + "learning_rate": 0.001, + "loss": 1.1045, + "step": 19130 + }, + { + "epoch": 0.610351095379317, + "grad_norm": 0.19693613052368164, + "learning_rate": 0.001, + "loss": 1.1015, + "step": 19140 + }, + { + "epoch": 0.6106699830989508, + "grad_norm": 0.19551625847816467, + "learning_rate": 0.001, + "loss": 1.1002, + "step": 19150 + }, + { + "epoch": 0.6109888708185848, + "grad_norm": 0.1972067505121231, + "learning_rate": 0.001, + "loss": 1.0734, + "step": 19160 + }, + { + "epoch": 0.6113077585382187, + "grad_norm": 0.19269640743732452, + "learning_rate": 0.001, + "loss": 1.1039, + "step": 19170 + }, + { + "epoch": 0.6116266462578526, + "grad_norm": 0.19563859701156616, + "learning_rate": 0.001, + "loss": 1.0986, + "step": 19180 + }, + { + "epoch": 0.6119455339774865, + "grad_norm": 0.19782280921936035, + "learning_rate": 0.001, + "loss": 1.0991, + "step": 19190 + }, + { + "epoch": 0.6122644216971205, + "grad_norm": 0.20272885262966156, + "learning_rate": 0.001, + "loss": 1.0818, + "step": 19200 + }, + { + "epoch": 0.6125833094167543, + "grad_norm": 0.18932965397834778, + "learning_rate": 0.001, + "loss": 1.0675, + "step": 19210 + }, + { + "epoch": 0.6129021971363883, + "grad_norm": 0.19627796113491058, + "learning_rate": 0.001, + "loss": 1.0917, + "step": 19220 + }, + { + "epoch": 0.6132210848560222, + "grad_norm": 0.18708980083465576, + "learning_rate": 0.001, + "loss": 1.0772, + "step": 19230 + }, + { + "epoch": 0.6135399725756561, + "grad_norm": 0.19243952631950378, + "learning_rate": 0.001, + "loss": 1.0861, + "step": 19240 + }, + { + "epoch": 0.61385886029529, + "grad_norm": 0.19430191814899445, + "learning_rate": 0.001, + "loss": 1.0734, + "step": 19250 + }, + { + "epoch": 0.614177748014924, + "grad_norm": 0.1974942982196808, + "learning_rate": 0.001, + "loss": 1.0971, + "step": 19260 + }, + { + "epoch": 0.6144966357345578, + "grad_norm": 0.19094713032245636, + "learning_rate": 0.001, + "loss": 1.0998, + "step": 19270 + }, + { + "epoch": 0.6148155234541918, + "grad_norm": 0.19476744532585144, + "learning_rate": 0.001, + "loss": 1.0896, + "step": 19280 + }, + { + "epoch": 0.6151344111738257, + "grad_norm": 0.1873323768377304, + "learning_rate": 0.001, + "loss": 1.0716, + "step": 19290 + }, + { + "epoch": 0.6154532988934596, + "grad_norm": 0.19868840277194977, + "learning_rate": 0.001, + "loss": 1.1003, + "step": 19300 + }, + { + "epoch": 0.6157721866130935, + "grad_norm": 0.19452963769435883, + "learning_rate": 0.001, + "loss": 1.097, + "step": 19310 + }, + { + "epoch": 0.6160910743327275, + "grad_norm": 0.200517475605011, + "learning_rate": 0.001, + "loss": 1.0814, + "step": 19320 + }, + { + "epoch": 0.6164099620523613, + "grad_norm": 0.1923755258321762, + "learning_rate": 0.001, + "loss": 1.1073, + "step": 19330 + }, + { + "epoch": 0.6167288497719953, + "grad_norm": 0.19483143091201782, + "learning_rate": 0.001, + "loss": 1.0796, + "step": 19340 + }, + { + "epoch": 0.6170477374916292, + "grad_norm": 0.19290411472320557, + "learning_rate": 0.001, + "loss": 1.0853, + "step": 19350 + }, + { + "epoch": 0.6173666252112631, + "grad_norm": 0.19407965242862701, + "learning_rate": 0.001, + "loss": 1.0913, + "step": 19360 + }, + { + "epoch": 0.617685512930897, + "grad_norm": 0.19258002936840057, + "learning_rate": 0.001, + "loss": 1.0922, + "step": 19370 + }, + { + "epoch": 0.618004400650531, + "grad_norm": 0.19973376393318176, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 19380 + }, + { + "epoch": 0.6183232883701648, + "grad_norm": 0.2037927806377411, + "learning_rate": 0.001, + "loss": 1.1043, + "step": 19390 + }, + { + "epoch": 0.6186421760897988, + "grad_norm": 0.1936698704957962, + "learning_rate": 0.001, + "loss": 1.0796, + "step": 19400 + }, + { + "epoch": 0.6189610638094327, + "grad_norm": 0.19478873908519745, + "learning_rate": 0.001, + "loss": 1.0808, + "step": 19410 + }, + { + "epoch": 0.6192799515290666, + "grad_norm": 0.1989588886499405, + "learning_rate": 0.001, + "loss": 1.1089, + "step": 19420 + }, + { + "epoch": 0.6195988392487005, + "grad_norm": 0.19915714859962463, + "learning_rate": 0.001, + "loss": 1.0871, + "step": 19430 + }, + { + "epoch": 0.6199177269683345, + "grad_norm": 0.1963144987821579, + "learning_rate": 0.001, + "loss": 1.0947, + "step": 19440 + }, + { + "epoch": 0.6202366146879683, + "grad_norm": 0.1846805065870285, + "learning_rate": 0.001, + "loss": 1.0678, + "step": 19450 + }, + { + "epoch": 0.6205555024076023, + "grad_norm": 0.1934468150138855, + "learning_rate": 0.001, + "loss": 1.1098, + "step": 19460 + }, + { + "epoch": 0.6208743901272362, + "grad_norm": 0.18973872065544128, + "learning_rate": 0.001, + "loss": 1.0751, + "step": 19470 + }, + { + "epoch": 0.6211932778468701, + "grad_norm": 0.19252535700798035, + "learning_rate": 0.001, + "loss": 1.0867, + "step": 19480 + }, + { + "epoch": 0.621512165566504, + "grad_norm": 0.19858285784721375, + "learning_rate": 0.001, + "loss": 1.09, + "step": 19490 + }, + { + "epoch": 0.621831053286138, + "grad_norm": 0.19829675555229187, + "learning_rate": 0.001, + "loss": 1.0955, + "step": 19500 + }, + { + "epoch": 0.6221499410057719, + "grad_norm": 0.1918306052684784, + "learning_rate": 0.001, + "loss": 1.0949, + "step": 19510 + }, + { + "epoch": 0.6224688287254058, + "grad_norm": 0.1984504908323288, + "learning_rate": 0.001, + "loss": 1.0865, + "step": 19520 + }, + { + "epoch": 0.6227877164450397, + "grad_norm": 0.1985856592655182, + "learning_rate": 0.001, + "loss": 1.1028, + "step": 19530 + }, + { + "epoch": 0.6231066041646737, + "grad_norm": 0.2000814974308014, + "learning_rate": 0.001, + "loss": 1.0807, + "step": 19540 + }, + { + "epoch": 0.6234254918843075, + "grad_norm": 0.19329458475112915, + "learning_rate": 0.001, + "loss": 1.085, + "step": 19550 + }, + { + "epoch": 0.6237443796039415, + "grad_norm": 0.20633338391780853, + "learning_rate": 0.001, + "loss": 1.1083, + "step": 19560 + }, + { + "epoch": 0.6240632673235754, + "grad_norm": 0.1958053559064865, + "learning_rate": 0.001, + "loss": 1.0988, + "step": 19570 + }, + { + "epoch": 0.6243821550432093, + "grad_norm": 0.20357364416122437, + "learning_rate": 0.001, + "loss": 1.0944, + "step": 19580 + }, + { + "epoch": 0.6247010427628432, + "grad_norm": 0.19455161690711975, + "learning_rate": 0.001, + "loss": 1.0881, + "step": 19590 + }, + { + "epoch": 0.6250199304824772, + "grad_norm": 0.19670099020004272, + "learning_rate": 0.001, + "loss": 1.0812, + "step": 19600 + }, + { + "epoch": 0.625338818202111, + "grad_norm": 0.19765827059745789, + "learning_rate": 0.001, + "loss": 1.0938, + "step": 19610 + }, + { + "epoch": 0.625657705921745, + "grad_norm": 0.19898134469985962, + "learning_rate": 0.001, + "loss": 1.1085, + "step": 19620 + }, + { + "epoch": 0.6259765936413789, + "grad_norm": 0.19776412844657898, + "learning_rate": 0.001, + "loss": 1.0797, + "step": 19630 + }, + { + "epoch": 0.6262954813610128, + "grad_norm": 0.20222464203834534, + "learning_rate": 0.001, + "loss": 1.094, + "step": 19640 + }, + { + "epoch": 0.6266143690806467, + "grad_norm": 0.20248456299304962, + "learning_rate": 0.001, + "loss": 1.0911, + "step": 19650 + }, + { + "epoch": 0.6269332568002807, + "grad_norm": 0.1994301676750183, + "learning_rate": 0.001, + "loss": 1.0852, + "step": 19660 + }, + { + "epoch": 0.6272521445199145, + "grad_norm": 0.19714325666427612, + "learning_rate": 0.001, + "loss": 1.0835, + "step": 19670 + }, + { + "epoch": 0.6275710322395485, + "grad_norm": 0.19362570345401764, + "learning_rate": 0.001, + "loss": 1.0993, + "step": 19680 + }, + { + "epoch": 0.6278899199591824, + "grad_norm": 0.19453924894332886, + "learning_rate": 0.001, + "loss": 1.0883, + "step": 19690 + }, + { + "epoch": 0.6282088076788163, + "grad_norm": 0.19855844974517822, + "learning_rate": 0.001, + "loss": 1.086, + "step": 19700 + }, + { + "epoch": 0.6285276953984502, + "grad_norm": 0.19160586595535278, + "learning_rate": 0.001, + "loss": 1.0713, + "step": 19710 + }, + { + "epoch": 0.6288465831180842, + "grad_norm": 0.1902851015329361, + "learning_rate": 0.001, + "loss": 1.0884, + "step": 19720 + }, + { + "epoch": 0.629165470837718, + "grad_norm": 0.1926606446504593, + "learning_rate": 0.001, + "loss": 1.0797, + "step": 19730 + }, + { + "epoch": 0.629484358557352, + "grad_norm": 0.19630712270736694, + "learning_rate": 0.001, + "loss": 1.0841, + "step": 19740 + }, + { + "epoch": 0.6298032462769859, + "grad_norm": 0.19418714940547943, + "learning_rate": 0.001, + "loss": 1.0791, + "step": 19750 + }, + { + "epoch": 0.6301221339966198, + "grad_norm": 0.19034384191036224, + "learning_rate": 0.001, + "loss": 1.076, + "step": 19760 + }, + { + "epoch": 0.6304410217162537, + "grad_norm": 0.19827674329280853, + "learning_rate": 0.001, + "loss": 1.0747, + "step": 19770 + }, + { + "epoch": 0.6307599094358877, + "grad_norm": 0.1912790685892105, + "learning_rate": 0.001, + "loss": 1.0814, + "step": 19780 + }, + { + "epoch": 0.6310787971555215, + "grad_norm": 0.19014963507652283, + "learning_rate": 0.001, + "loss": 1.0911, + "step": 19790 + }, + { + "epoch": 0.6313976848751555, + "grad_norm": 0.19616587460041046, + "learning_rate": 0.001, + "loss": 1.0806, + "step": 19800 + }, + { + "epoch": 0.6317165725947894, + "grad_norm": 0.18765047192573547, + "learning_rate": 0.001, + "loss": 1.0855, + "step": 19810 + }, + { + "epoch": 0.6320354603144233, + "grad_norm": 0.20021288096904755, + "learning_rate": 0.001, + "loss": 1.0823, + "step": 19820 + }, + { + "epoch": 0.6323543480340572, + "grad_norm": 0.20338086783885956, + "learning_rate": 0.001, + "loss": 1.087, + "step": 19830 + }, + { + "epoch": 0.6326732357536912, + "grad_norm": 0.20663778483867645, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 19840 + }, + { + "epoch": 0.632992123473325, + "grad_norm": 0.18688595294952393, + "learning_rate": 0.001, + "loss": 1.0799, + "step": 19850 + }, + { + "epoch": 0.633311011192959, + "grad_norm": 0.20734763145446777, + "learning_rate": 0.001, + "loss": 1.0859, + "step": 19860 + }, + { + "epoch": 0.6336298989125929, + "grad_norm": 0.19312497973442078, + "learning_rate": 0.001, + "loss": 1.0801, + "step": 19870 + }, + { + "epoch": 0.6339487866322268, + "grad_norm": 0.19226190447807312, + "learning_rate": 0.001, + "loss": 1.0895, + "step": 19880 + }, + { + "epoch": 0.6342676743518607, + "grad_norm": 0.19165466725826263, + "learning_rate": 0.001, + "loss": 1.0896, + "step": 19890 + }, + { + "epoch": 0.6345865620714947, + "grad_norm": 0.19660542905330658, + "learning_rate": 0.001, + "loss": 1.0864, + "step": 19900 + }, + { + "epoch": 0.6349054497911285, + "grad_norm": 0.2044084072113037, + "learning_rate": 0.001, + "loss": 1.097, + "step": 19910 + }, + { + "epoch": 0.6352243375107625, + "grad_norm": 0.20047420263290405, + "learning_rate": 0.001, + "loss": 1.0935, + "step": 19920 + }, + { + "epoch": 0.6355432252303964, + "grad_norm": 0.20268094539642334, + "learning_rate": 0.001, + "loss": 1.0774, + "step": 19930 + }, + { + "epoch": 0.6358621129500303, + "grad_norm": 0.2090209722518921, + "learning_rate": 0.001, + "loss": 1.0758, + "step": 19940 + }, + { + "epoch": 0.6361810006696642, + "grad_norm": 0.19323612749576569, + "learning_rate": 0.001, + "loss": 1.0709, + "step": 19950 + }, + { + "epoch": 0.6364998883892982, + "grad_norm": 0.1982102394104004, + "learning_rate": 0.001, + "loss": 1.0875, + "step": 19960 + }, + { + "epoch": 0.636818776108932, + "grad_norm": 0.20114809274673462, + "learning_rate": 0.001, + "loss": 1.0995, + "step": 19970 + }, + { + "epoch": 0.637137663828566, + "grad_norm": 0.19009973108768463, + "learning_rate": 0.001, + "loss": 1.1036, + "step": 19980 + }, + { + "epoch": 0.6374565515481999, + "grad_norm": 0.19751933217048645, + "learning_rate": 0.001, + "loss": 1.094, + "step": 19990 + }, + { + "epoch": 0.6377754392678338, + "grad_norm": 0.19855111837387085, + "learning_rate": 0.001, + "loss": 1.083, + "step": 20000 + }, + { + "epoch": 0.6380943269874677, + "grad_norm": 0.18911032378673553, + "learning_rate": 0.001, + "loss": 1.0974, + "step": 20010 + }, + { + "epoch": 0.6384132147071017, + "grad_norm": 0.19520005583763123, + "learning_rate": 0.001, + "loss": 1.0957, + "step": 20020 + }, + { + "epoch": 0.6387321024267355, + "grad_norm": 0.19466760754585266, + "learning_rate": 0.001, + "loss": 1.1106, + "step": 20030 + }, + { + "epoch": 0.6390509901463695, + "grad_norm": 0.18498612940311432, + "learning_rate": 0.001, + "loss": 1.0813, + "step": 20040 + }, + { + "epoch": 0.6393698778660034, + "grad_norm": 0.19077274203300476, + "learning_rate": 0.001, + "loss": 1.0773, + "step": 20050 + }, + { + "epoch": 0.6396887655856373, + "grad_norm": 0.19003081321716309, + "learning_rate": 0.001, + "loss": 1.0869, + "step": 20060 + }, + { + "epoch": 0.6400076533052712, + "grad_norm": 0.19166330993175507, + "learning_rate": 0.001, + "loss": 1.0862, + "step": 20070 + }, + { + "epoch": 0.6403265410249052, + "grad_norm": 0.1930341273546219, + "learning_rate": 0.001, + "loss": 1.0803, + "step": 20080 + }, + { + "epoch": 0.640645428744539, + "grad_norm": 0.18485362827777863, + "learning_rate": 0.001, + "loss": 1.0921, + "step": 20090 + }, + { + "epoch": 0.640964316464173, + "grad_norm": 0.19426341354846954, + "learning_rate": 0.001, + "loss": 1.0984, + "step": 20100 + }, + { + "epoch": 0.6412832041838069, + "grad_norm": 0.19640350341796875, + "learning_rate": 0.001, + "loss": 1.0716, + "step": 20110 + }, + { + "epoch": 0.6416020919034408, + "grad_norm": 0.1945829838514328, + "learning_rate": 0.001, + "loss": 1.0932, + "step": 20120 + }, + { + "epoch": 0.6419209796230747, + "grad_norm": 0.1937423199415207, + "learning_rate": 0.001, + "loss": 1.0901, + "step": 20130 + }, + { + "epoch": 0.6422398673427087, + "grad_norm": 0.19474102556705475, + "learning_rate": 0.001, + "loss": 1.086, + "step": 20140 + }, + { + "epoch": 0.6425587550623425, + "grad_norm": 0.19094392657279968, + "learning_rate": 0.001, + "loss": 1.0914, + "step": 20150 + }, + { + "epoch": 0.6428776427819765, + "grad_norm": 0.18804484605789185, + "learning_rate": 0.001, + "loss": 1.0939, + "step": 20160 + }, + { + "epoch": 0.6431965305016104, + "grad_norm": 0.19421075284481049, + "learning_rate": 0.001, + "loss": 1.096, + "step": 20170 + }, + { + "epoch": 0.6435154182212443, + "grad_norm": 0.19062688946723938, + "learning_rate": 0.001, + "loss": 1.0921, + "step": 20180 + }, + { + "epoch": 0.6438343059408782, + "grad_norm": 0.19020332396030426, + "learning_rate": 0.001, + "loss": 1.0606, + "step": 20190 + }, + { + "epoch": 0.6441531936605122, + "grad_norm": 0.1877630650997162, + "learning_rate": 0.001, + "loss": 1.0834, + "step": 20200 + }, + { + "epoch": 0.644472081380146, + "grad_norm": 0.197085440158844, + "learning_rate": 0.001, + "loss": 1.1016, + "step": 20210 + }, + { + "epoch": 0.64479096909978, + "grad_norm": 0.19835904240608215, + "learning_rate": 0.001, + "loss": 1.0775, + "step": 20220 + }, + { + "epoch": 0.6451098568194139, + "grad_norm": 0.19453178346157074, + "learning_rate": 0.001, + "loss": 1.0871, + "step": 20230 + }, + { + "epoch": 0.6454287445390477, + "grad_norm": 0.19668817520141602, + "learning_rate": 0.001, + "loss": 1.072, + "step": 20240 + }, + { + "epoch": 0.6457476322586817, + "grad_norm": 0.1878117173910141, + "learning_rate": 0.001, + "loss": 1.0683, + "step": 20250 + }, + { + "epoch": 0.6460665199783157, + "grad_norm": 0.20410464704036713, + "learning_rate": 0.001, + "loss": 1.0824, + "step": 20260 + }, + { + "epoch": 0.6463854076979495, + "grad_norm": 0.20036086440086365, + "learning_rate": 0.001, + "loss": 1.0895, + "step": 20270 + }, + { + "epoch": 0.6467042954175835, + "grad_norm": 0.19183671474456787, + "learning_rate": 0.001, + "loss": 1.1032, + "step": 20280 + }, + { + "epoch": 0.6470231831372174, + "grad_norm": 0.19621118903160095, + "learning_rate": 0.001, + "loss": 1.0708, + "step": 20290 + }, + { + "epoch": 0.6473420708568512, + "grad_norm": 0.1952357292175293, + "learning_rate": 0.001, + "loss": 1.0875, + "step": 20300 + }, + { + "epoch": 0.6476609585764852, + "grad_norm": 0.20269818603992462, + "learning_rate": 0.001, + "loss": 1.0774, + "step": 20310 + }, + { + "epoch": 0.6479798462961192, + "grad_norm": 0.19550341367721558, + "learning_rate": 0.001, + "loss": 1.0756, + "step": 20320 + }, + { + "epoch": 0.6482987340157531, + "grad_norm": 0.1954687088727951, + "learning_rate": 0.001, + "loss": 1.0573, + "step": 20330 + }, + { + "epoch": 0.648617621735387, + "grad_norm": 0.20026355981826782, + "learning_rate": 0.001, + "loss": 1.0903, + "step": 20340 + }, + { + "epoch": 0.6489365094550209, + "grad_norm": 0.20052330195903778, + "learning_rate": 0.001, + "loss": 1.0987, + "step": 20350 + }, + { + "epoch": 0.6492553971746549, + "grad_norm": 0.18964596092700958, + "learning_rate": 0.001, + "loss": 1.0903, + "step": 20360 + }, + { + "epoch": 0.6495742848942887, + "grad_norm": 0.1951100081205368, + "learning_rate": 0.001, + "loss": 1.0782, + "step": 20370 + }, + { + "epoch": 0.6498931726139227, + "grad_norm": 0.18879763782024384, + "learning_rate": 0.001, + "loss": 1.0957, + "step": 20380 + }, + { + "epoch": 0.6502120603335566, + "grad_norm": 0.20073334872722626, + "learning_rate": 0.0009972136778050526, + "loss": 1.059, + "step": 20390 + }, + { + "epoch": 0.6505309480531904, + "grad_norm": 0.20074057579040527, + "learning_rate": 0.0009930383279382303, + "loss": 1.0883, + "step": 20400 + }, + { + "epoch": 0.6508498357728244, + "grad_norm": 0.19324301183223724, + "learning_rate": 0.000988880460329121, + "loss": 1.0649, + "step": 20410 + }, + { + "epoch": 0.6511687234924584, + "grad_norm": 0.1927865743637085, + "learning_rate": 0.000984740001779228, + "loss": 1.0878, + "step": 20420 + }, + { + "epoch": 0.6514876112120922, + "grad_norm": 0.19140435755252838, + "learning_rate": 0.000980616879396537, + "loss": 1.0897, + "step": 20430 + }, + { + "epoch": 0.6518064989317262, + "grad_norm": 0.19729351997375488, + "learning_rate": 0.0009765110205942349, + "loss": 1.1063, + "step": 20440 + }, + { + "epoch": 0.6521253866513601, + "grad_norm": 0.19580216705799103, + "learning_rate": 0.0009724223530894298, + "loss": 1.0841, + "step": 20450 + }, + { + "epoch": 0.652444274370994, + "grad_norm": 0.1976882368326187, + "learning_rate": 0.00096835080490188, + "loss": 1.0777, + "step": 20460 + }, + { + "epoch": 0.6527631620906279, + "grad_norm": 0.19940541684627533, + "learning_rate": 0.0009642963043527262, + "loss": 1.0777, + "step": 20470 + }, + { + "epoch": 0.6530820498102619, + "grad_norm": 0.1925906538963318, + "learning_rate": 0.0009602587800632295, + "loss": 1.05, + "step": 20480 + }, + { + "epoch": 0.6534009375298957, + "grad_norm": 0.19239018857479095, + "learning_rate": 0.0009562381609535146, + "loss": 1.0762, + "step": 20490 + }, + { + "epoch": 0.6537198252495297, + "grad_norm": 0.19486668705940247, + "learning_rate": 0.0009522343762413196, + "loss": 1.0739, + "step": 20500 + }, + { + "epoch": 0.6540387129691636, + "grad_norm": 0.19426004588603973, + "learning_rate": 0.0009482473554407485, + "loss": 1.0741, + "step": 20510 + }, + { + "epoch": 0.6543576006887974, + "grad_norm": 0.19421601295471191, + "learning_rate": 0.000944277028361031, + "loss": 1.0772, + "step": 20520 + }, + { + "epoch": 0.6546764884084314, + "grad_norm": 0.19363418221473694, + "learning_rate": 0.0009403233251052866, + "loss": 1.0623, + "step": 20530 + }, + { + "epoch": 0.6549953761280654, + "grad_norm": 0.19872882962226868, + "learning_rate": 0.0009363861760692945, + "loss": 1.0814, + "step": 20540 + }, + { + "epoch": 0.6553142638476992, + "grad_norm": 0.1920090913772583, + "learning_rate": 0.0009324655119402678, + "loss": 1.0756, + "step": 20550 + }, + { + "epoch": 0.6556331515673332, + "grad_norm": 0.19251906871795654, + "learning_rate": 0.0009285612636956329, + "loss": 1.0625, + "step": 20560 + }, + { + "epoch": 0.6559520392869671, + "grad_norm": 0.19410337507724762, + "learning_rate": 0.0009246733626018155, + "loss": 1.0835, + "step": 20570 + }, + { + "epoch": 0.656270927006601, + "grad_norm": 0.19288881123065948, + "learning_rate": 0.0009208017402130296, + "loss": 1.0844, + "step": 20580 + }, + { + "epoch": 0.6565898147262349, + "grad_norm": 0.1908722221851349, + "learning_rate": 0.0009169463283700727, + "loss": 1.0683, + "step": 20590 + }, + { + "epoch": 0.6569087024458689, + "grad_norm": 0.1891368180513382, + "learning_rate": 0.0009131070591991262, + "loss": 1.0737, + "step": 20600 + }, + { + "epoch": 0.6572275901655027, + "grad_norm": 0.18758034706115723, + "learning_rate": 0.00090928386511056, + "loss": 1.0777, + "step": 20610 + }, + { + "epoch": 0.6575464778851366, + "grad_norm": 0.19056786596775055, + "learning_rate": 0.0009054766787977433, + "loss": 1.0652, + "step": 20620 + }, + { + "epoch": 0.6578653656047706, + "grad_norm": 0.19077904522418976, + "learning_rate": 0.0009016854332358588, + "loss": 1.0512, + "step": 20630 + }, + { + "epoch": 0.6581842533244044, + "grad_norm": 0.1994921714067459, + "learning_rate": 0.0008979100616807236, + "loss": 1.1071, + "step": 20640 + }, + { + "epoch": 0.6585031410440384, + "grad_norm": 0.20684292912483215, + "learning_rate": 0.0008941504976676136, + "loss": 1.0724, + "step": 20650 + }, + { + "epoch": 0.6588220287636724, + "grad_norm": 0.18994581699371338, + "learning_rate": 0.0008904066750100934, + "loss": 1.0524, + "step": 20660 + }, + { + "epoch": 0.6591409164833062, + "grad_norm": 0.19218602776527405, + "learning_rate": 0.0008866785277988516, + "loss": 1.0698, + "step": 20670 + }, + { + "epoch": 0.6594598042029401, + "grad_norm": 0.1805793046951294, + "learning_rate": 0.00088296599040054, + "loss": 1.0691, + "step": 20680 + }, + { + "epoch": 0.6597786919225741, + "grad_norm": 0.1912751942873001, + "learning_rate": 0.0008792689974566183, + "loss": 1.0622, + "step": 20690 + }, + { + "epoch": 0.6600975796422079, + "grad_norm": 0.18160144984722137, + "learning_rate": 0.0008755874838822034, + "loss": 1.0687, + "step": 20700 + }, + { + "epoch": 0.6604164673618419, + "grad_norm": 0.19986706972122192, + "learning_rate": 0.0008719213848649239, + "loss": 1.0675, + "step": 20710 + }, + { + "epoch": 0.6607353550814759, + "grad_norm": 0.19376768171787262, + "learning_rate": 0.0008682706358637785, + "loss": 1.0334, + "step": 20720 + }, + { + "epoch": 0.6610542428011097, + "grad_norm": 0.18999677896499634, + "learning_rate": 0.0008646351726080005, + "loss": 1.0613, + "step": 20730 + }, + { + "epoch": 0.6613731305207436, + "grad_norm": 0.19654668867588043, + "learning_rate": 0.0008610149310959256, + "loss": 1.0636, + "step": 20740 + }, + { + "epoch": 0.6616920182403776, + "grad_norm": 0.19226598739624023, + "learning_rate": 0.0008574098475938659, + "loss": 1.0709, + "step": 20750 + }, + { + "epoch": 0.6620109059600114, + "grad_norm": 0.18288561701774597, + "learning_rate": 0.000853819858634987, + "loss": 1.0513, + "step": 20760 + }, + { + "epoch": 0.6623297936796454, + "grad_norm": 0.19555260241031647, + "learning_rate": 0.0008502449010181915, + "loss": 1.0728, + "step": 20770 + }, + { + "epoch": 0.6626486813992793, + "grad_norm": 0.1996360570192337, + "learning_rate": 0.0008466849118070059, + "loss": 1.0594, + "step": 20780 + }, + { + "epoch": 0.6629675691189132, + "grad_norm": 0.19368167221546173, + "learning_rate": 0.0008431398283284729, + "loss": 1.0615, + "step": 20790 + }, + { + "epoch": 0.6632864568385471, + "grad_norm": 0.19357769191265106, + "learning_rate": 0.0008396095881720477, + "loss": 1.0748, + "step": 20800 + }, + { + "epoch": 0.6636053445581811, + "grad_norm": 0.19414184987545013, + "learning_rate": 0.0008360941291884995, + "loss": 1.0437, + "step": 20810 + }, + { + "epoch": 0.6639242322778149, + "grad_norm": 0.1933811902999878, + "learning_rate": 0.0008325933894888175, + "loss": 1.0687, + "step": 20820 + }, + { + "epoch": 0.6642431199974489, + "grad_norm": 0.19190998375415802, + "learning_rate": 0.0008291073074431209, + "loss": 1.0489, + "step": 20830 + }, + { + "epoch": 0.6645620077170828, + "grad_norm": 0.19139911234378815, + "learning_rate": 0.0008256358216795744, + "loss": 1.0499, + "step": 20840 + }, + { + "epoch": 0.6648808954367167, + "grad_norm": 0.18963788449764252, + "learning_rate": 0.0008221788710833077, + "loss": 1.0551, + "step": 20850 + }, + { + "epoch": 0.6651997831563506, + "grad_norm": 0.18441535532474518, + "learning_rate": 0.0008187363947953391, + "loss": 1.0469, + "step": 20860 + }, + { + "epoch": 0.6655186708759846, + "grad_norm": 0.19030874967575073, + "learning_rate": 0.0008153083322115049, + "loss": 1.0637, + "step": 20870 + }, + { + "epoch": 0.6658375585956184, + "grad_norm": 0.19269512593746185, + "learning_rate": 0.0008118946229813915, + "loss": 1.0512, + "step": 20880 + }, + { + "epoch": 0.6661564463152524, + "grad_norm": 0.1916525512933731, + "learning_rate": 0.0008084952070072738, + "loss": 1.0477, + "step": 20890 + }, + { + "epoch": 0.6664753340348863, + "grad_norm": 0.1908433735370636, + "learning_rate": 0.0008051100244430569, + "loss": 1.0587, + "step": 20900 + }, + { + "epoch": 0.6667942217545202, + "grad_norm": 0.19119319319725037, + "learning_rate": 0.0008017390156932223, + "loss": 1.0501, + "step": 20910 + }, + { + "epoch": 0.6671131094741541, + "grad_norm": 0.18825064599514008, + "learning_rate": 0.0007983821214117789, + "loss": 1.0329, + "step": 20920 + }, + { + "epoch": 0.6674319971937881, + "grad_norm": 0.1840713620185852, + "learning_rate": 0.0007950392825012183, + "loss": 1.052, + "step": 20930 + }, + { + "epoch": 0.6677508849134219, + "grad_norm": 0.19763097167015076, + "learning_rate": 0.0007917104401114743, + "loss": 1.037, + "step": 20940 + }, + { + "epoch": 0.6680697726330559, + "grad_norm": 0.19263523817062378, + "learning_rate": 0.000788395535638887, + "loss": 1.0573, + "step": 20950 + }, + { + "epoch": 0.6683886603526898, + "grad_norm": 0.18894965946674347, + "learning_rate": 0.0007850945107251707, + "loss": 1.0543, + "step": 20960 + }, + { + "epoch": 0.6687075480723237, + "grad_norm": 0.18489575386047363, + "learning_rate": 0.000781807307256387, + "loss": 1.0357, + "step": 20970 + }, + { + "epoch": 0.6690264357919576, + "grad_norm": 0.1813621073961258, + "learning_rate": 0.0007785338673619216, + "loss": 1.041, + "step": 20980 + }, + { + "epoch": 0.6693453235115916, + "grad_norm": 0.194569393992424, + "learning_rate": 0.0007752741334134652, + "loss": 1.0438, + "step": 20990 + }, + { + "epoch": 0.6696642112312254, + "grad_norm": 0.18502141535282135, + "learning_rate": 0.0007720280480239992, + "loss": 1.0595, + "step": 21000 + }, + { + "epoch": 0.6699830989508594, + "grad_norm": 0.1821593940258026, + "learning_rate": 0.0007687955540467853, + "loss": 1.0434, + "step": 21010 + }, + { + "epoch": 0.6703019866704933, + "grad_norm": 0.19203370809555054, + "learning_rate": 0.0007655765945743598, + "loss": 1.0438, + "step": 21020 + }, + { + "epoch": 0.6706208743901272, + "grad_norm": 0.20178502798080444, + "learning_rate": 0.0007623711129375314, + "loss": 1.047, + "step": 21030 + }, + { + "epoch": 0.6709397621097611, + "grad_norm": 0.1864270716905594, + "learning_rate": 0.0007591790527043832, + "loss": 1.0433, + "step": 21040 + }, + { + "epoch": 0.6712586498293951, + "grad_norm": 0.18299788236618042, + "learning_rate": 0.0007560003576792802, + "loss": 1.0466, + "step": 21050 + }, + { + "epoch": 0.6715775375490289, + "grad_norm": 0.19573181867599487, + "learning_rate": 0.0007528349719018794, + "loss": 1.0456, + "step": 21060 + }, + { + "epoch": 0.6718964252686629, + "grad_norm": 0.19178634881973267, + "learning_rate": 0.0007496828396461442, + "loss": 1.0435, + "step": 21070 + }, + { + "epoch": 0.6722153129882968, + "grad_norm": 0.1954621821641922, + "learning_rate": 0.0007465439054193641, + "loss": 1.0665, + "step": 21080 + }, + { + "epoch": 0.6725342007079307, + "grad_norm": 0.18990260362625122, + "learning_rate": 0.0007434181139611777, + "loss": 1.0392, + "step": 21090 + }, + { + "epoch": 0.6728530884275646, + "grad_norm": 0.18366816639900208, + "learning_rate": 0.0007403054102425991, + "loss": 1.0124, + "step": 21100 + }, + { + "epoch": 0.6731719761471986, + "grad_norm": 0.1891552358865738, + "learning_rate": 0.0007372057394650503, + "loss": 1.0624, + "step": 21110 + }, + { + "epoch": 0.6734908638668324, + "grad_norm": 0.19928067922592163, + "learning_rate": 0.0007341190470593954, + "loss": 1.041, + "step": 21120 + }, + { + "epoch": 0.6738097515864664, + "grad_norm": 0.18672668933868408, + "learning_rate": 0.0007310452786849806, + "loss": 1.0245, + "step": 21130 + }, + { + "epoch": 0.6741286393061003, + "grad_norm": 0.19206221401691437, + "learning_rate": 0.0007279843802286769, + "loss": 1.0369, + "step": 21140 + }, + { + "epoch": 0.6744475270257342, + "grad_norm": 0.19227498769760132, + "learning_rate": 0.0007249362978039282, + "loss": 1.0333, + "step": 21150 + }, + { + "epoch": 0.6747664147453681, + "grad_norm": 0.18796871602535248, + "learning_rate": 0.0007219009777498024, + "loss": 1.0591, + "step": 21160 + }, + { + "epoch": 0.6750853024650021, + "grad_norm": 0.1877882033586502, + "learning_rate": 0.0007188783666300463, + "loss": 1.0188, + "step": 21170 + }, + { + "epoch": 0.675404190184636, + "grad_norm": 0.192293182015419, + "learning_rate": 0.000715868411232145, + "loss": 1.0582, + "step": 21180 + }, + { + "epoch": 0.6757230779042699, + "grad_norm": 0.19534669816493988, + "learning_rate": 0.0007128710585663859, + "loss": 1.0109, + "step": 21190 + }, + { + "epoch": 0.6760419656239038, + "grad_norm": 0.17991122603416443, + "learning_rate": 0.0007098862558649246, + "loss": 1.0201, + "step": 21200 + }, + { + "epoch": 0.6763608533435378, + "grad_norm": 0.18249261379241943, + "learning_rate": 0.000706913950580857, + "loss": 1.0056, + "step": 21210 + }, + { + "epoch": 0.6766797410631716, + "grad_norm": 0.18837027251720428, + "learning_rate": 0.000703954090387293, + "loss": 1.0296, + "step": 21220 + }, + { + "epoch": 0.6769986287828056, + "grad_norm": 0.18746830523014069, + "learning_rate": 0.0007010066231764369, + "loss": 1.0128, + "step": 21230 + }, + { + "epoch": 0.6773175165024395, + "grad_norm": 0.1960870623588562, + "learning_rate": 0.0006980714970586688, + "loss": 1.0259, + "step": 21240 + }, + { + "epoch": 0.6776364042220734, + "grad_norm": 0.18670812249183655, + "learning_rate": 0.0006951486603616313, + "loss": 1.0404, + "step": 21250 + }, + { + "epoch": 0.6779552919417073, + "grad_norm": 0.1884147822856903, + "learning_rate": 0.0006922380616293202, + "loss": 1.0233, + "step": 21260 + }, + { + "epoch": 0.6782741796613413, + "grad_norm": 0.20132644474506378, + "learning_rate": 0.0006893396496211784, + "loss": 1.0356, + "step": 21270 + }, + { + "epoch": 0.6785930673809751, + "grad_norm": 0.1896122395992279, + "learning_rate": 0.0006864533733111942, + "loss": 1.0438, + "step": 21280 + }, + { + "epoch": 0.6789119551006091, + "grad_norm": 0.18535298109054565, + "learning_rate": 0.0006835791818870018, + "loss": 1.0126, + "step": 21290 + }, + { + "epoch": 0.679230842820243, + "grad_norm": 0.1904032975435257, + "learning_rate": 0.0006807170247489883, + "loss": 1.0315, + "step": 21300 + }, + { + "epoch": 0.6795497305398769, + "grad_norm": 0.18548151850700378, + "learning_rate": 0.0006778668515094021, + "loss": 1.0316, + "step": 21310 + }, + { + "epoch": 0.6798686182595108, + "grad_norm": 0.19118435680866241, + "learning_rate": 0.0006750286119914656, + "loss": 1.0208, + "step": 21320 + }, + { + "epoch": 0.6801875059791448, + "grad_norm": 0.18484553694725037, + "learning_rate": 0.0006722022562284926, + "loss": 1.0432, + "step": 21330 + }, + { + "epoch": 0.6805063936987786, + "grad_norm": 0.18801674246788025, + "learning_rate": 0.000669387734463008, + "loss": 1.0095, + "step": 21340 + }, + { + "epoch": 0.6808252814184126, + "grad_norm": 0.1849338710308075, + "learning_rate": 0.0006665849971458721, + "loss": 1.0386, + "step": 21350 + }, + { + "epoch": 0.6811441691380465, + "grad_norm": 0.18272876739501953, + "learning_rate": 0.0006637939949354081, + "loss": 1.0148, + "step": 21360 + }, + { + "epoch": 0.6814630568576804, + "grad_norm": 0.19171403348445892, + "learning_rate": 0.000661014678696534, + "loss": 1.0252, + "step": 21370 + }, + { + "epoch": 0.6817819445773143, + "grad_norm": 0.18120139837265015, + "learning_rate": 0.0006582469994998967, + "loss": 1.0499, + "step": 21380 + }, + { + "epoch": 0.6821008322969483, + "grad_norm": 0.19342155754566193, + "learning_rate": 0.0006554909086210115, + "loss": 1.0257, + "step": 21390 + }, + { + "epoch": 0.6824197200165821, + "grad_norm": 0.1867322474718094, + "learning_rate": 0.0006527463575394037, + "loss": 1.0277, + "step": 21400 + }, + { + "epoch": 0.6827386077362161, + "grad_norm": 0.18487964570522308, + "learning_rate": 0.0006500132979377546, + "loss": 1.0246, + "step": 21410 + }, + { + "epoch": 0.68305749545585, + "grad_norm": 0.19047372043132782, + "learning_rate": 0.0006472916817010511, + "loss": 1.0152, + "step": 21420 + }, + { + "epoch": 0.6833763831754839, + "grad_norm": 0.1882789582014084, + "learning_rate": 0.0006445814609157381, + "loss": 1.016, + "step": 21430 + }, + { + "epoch": 0.6836952708951178, + "grad_norm": 0.193642720580101, + "learning_rate": 0.0006418825878688756, + "loss": 1.0261, + "step": 21440 + }, + { + "epoch": 0.6840141586147518, + "grad_norm": 0.19018161296844482, + "learning_rate": 0.0006391950150472985, + "loss": 1.0106, + "step": 21450 + }, + { + "epoch": 0.6843330463343856, + "grad_norm": 0.18343989551067352, + "learning_rate": 0.0006365186951367798, + "loss": 1.0189, + "step": 21460 + }, + { + "epoch": 0.6846519340540196, + "grad_norm": 0.19221456348896027, + "learning_rate": 0.0006338535810211983, + "loss": 1.0154, + "step": 21470 + }, + { + "epoch": 0.6849708217736535, + "grad_norm": 0.1819676160812378, + "learning_rate": 0.0006311996257817083, + "loss": 1.0163, + "step": 21480 + }, + { + "epoch": 0.6852897094932874, + "grad_norm": 0.19188915193080902, + "learning_rate": 0.0006285567826959148, + "loss": 1.0247, + "step": 21490 + }, + { + "epoch": 0.6856085972129213, + "grad_norm": 0.1923832893371582, + "learning_rate": 0.0006259250052370493, + "loss": 1.0384, + "step": 21500 + }, + { + "epoch": 0.6859274849325553, + "grad_norm": 0.18359391391277313, + "learning_rate": 0.0006233042470731524, + "loss": 1.0117, + "step": 21510 + }, + { + "epoch": 0.6862463726521891, + "grad_norm": 0.1798015683889389, + "learning_rate": 0.0006206944620662569, + "loss": 1.0222, + "step": 21520 + }, + { + "epoch": 0.6865652603718231, + "grad_norm": 0.1816495805978775, + "learning_rate": 0.0006180956042715764, + "loss": 1.01, + "step": 21530 + }, + { + "epoch": 0.686884148091457, + "grad_norm": 0.19056637585163116, + "learning_rate": 0.0006155076279366958, + "loss": 1.0215, + "step": 21540 + }, + { + "epoch": 0.6872030358110909, + "grad_norm": 0.18547722697257996, + "learning_rate": 0.0006129304875007661, + "loss": 1.0136, + "step": 21550 + }, + { + "epoch": 0.6875219235307248, + "grad_norm": 0.1922054886817932, + "learning_rate": 0.0006103641375937023, + "loss": 1.0195, + "step": 21560 + }, + { + "epoch": 0.6878408112503588, + "grad_norm": 0.1904924213886261, + "learning_rate": 0.0006078085330353851, + "loss": 1.0025, + "step": 21570 + }, + { + "epoch": 0.6881596989699926, + "grad_norm": 0.18738815188407898, + "learning_rate": 0.0006052636288348644, + "loss": 0.9913, + "step": 21580 + }, + { + "epoch": 0.6884785866896266, + "grad_norm": 0.1942305862903595, + "learning_rate": 0.0006027293801895685, + "loss": 1.0139, + "step": 21590 + }, + { + "epoch": 0.6887974744092605, + "grad_norm": 0.18734295666217804, + "learning_rate": 0.0006002057424845144, + "loss": 1.0081, + "step": 21600 + }, + { + "epoch": 0.6891163621288944, + "grad_norm": 0.18953493237495422, + "learning_rate": 0.0005976926712915233, + "loss": 1.012, + "step": 21610 + }, + { + "epoch": 0.6894352498485283, + "grad_norm": 0.1921025663614273, + "learning_rate": 0.0005951901223684372, + "loss": 1.0153, + "step": 21620 + }, + { + "epoch": 0.6897541375681623, + "grad_norm": 0.18728868663311005, + "learning_rate": 0.0005926980516583412, + "loss": 0.9991, + "step": 21630 + }, + { + "epoch": 0.6900730252877961, + "grad_norm": 0.19061149656772614, + "learning_rate": 0.0005902164152887875, + "loss": 1.0094, + "step": 21640 + }, + { + "epoch": 0.6903919130074301, + "grad_norm": 0.18096871674060822, + "learning_rate": 0.0005877451695710226, + "loss": 1.0133, + "step": 21650 + }, + { + "epoch": 0.690710800727064, + "grad_norm": 0.1817445605993271, + "learning_rate": 0.0005852842709992187, + "loss": 1.0144, + "step": 21660 + }, + { + "epoch": 0.6910296884466979, + "grad_norm": 0.19506876170635223, + "learning_rate": 0.0005828336762497074, + "loss": 1.0025, + "step": 21670 + }, + { + "epoch": 0.6913485761663318, + "grad_norm": 0.1860935389995575, + "learning_rate": 0.0005803933421802178, + "loss": 1.0164, + "step": 21680 + }, + { + "epoch": 0.6916674638859658, + "grad_norm": 0.18761244416236877, + "learning_rate": 0.0005779632258291156, + "loss": 1.0121, + "step": 21690 + }, + { + "epoch": 0.6919863516055996, + "grad_norm": 0.18786673247814178, + "learning_rate": 0.0005755432844146483, + "loss": 1.0137, + "step": 21700 + }, + { + "epoch": 0.6923052393252336, + "grad_norm": 0.1867864727973938, + "learning_rate": 0.0005731334753341907, + "loss": 1.0047, + "step": 21710 + }, + { + "epoch": 0.6926241270448675, + "grad_norm": 0.19096152484416962, + "learning_rate": 0.0005707337561634957, + "loss": 1.0024, + "step": 21720 + }, + { + "epoch": 0.6929430147645014, + "grad_norm": 0.1956232190132141, + "learning_rate": 0.0005683440846559473, + "loss": 1.016, + "step": 21730 + }, + { + "epoch": 0.6932619024841353, + "grad_norm": 0.1859396994113922, + "learning_rate": 0.0005659644187418168, + "loss": 0.9975, + "step": 21740 + }, + { + "epoch": 0.6935807902037693, + "grad_norm": 0.18429817259311676, + "learning_rate": 0.0005635947165275219, + "loss": 1.0059, + "step": 21750 + }, + { + "epoch": 0.6938996779234031, + "grad_norm": 0.18661698698997498, + "learning_rate": 0.0005612349362948896, + "loss": 1.0114, + "step": 21760 + }, + { + "epoch": 0.6942185656430371, + "grad_norm": 0.18431392312049866, + "learning_rate": 0.0005588850365004215, + "loss": 1.0059, + "step": 21770 + }, + { + "epoch": 0.694537453362671, + "grad_norm": 0.19037270545959473, + "learning_rate": 0.0005565449757745625, + "loss": 1.012, + "step": 21780 + }, + { + "epoch": 0.6948563410823049, + "grad_norm": 0.18189027905464172, + "learning_rate": 0.0005542147129209725, + "loss": 1.0081, + "step": 21790 + }, + { + "epoch": 0.6951752288019388, + "grad_norm": 0.18496230244636536, + "learning_rate": 0.0005518942069158012, + "loss": 1.0098, + "step": 21800 + }, + { + "epoch": 0.6954941165215728, + "grad_norm": 0.19114148616790771, + "learning_rate": 0.0005495834169069658, + "loss": 1.0009, + "step": 21810 + }, + { + "epoch": 0.6958130042412066, + "grad_norm": 0.19435721635818481, + "learning_rate": 0.0005472823022134319, + "loss": 1.013, + "step": 21820 + }, + { + "epoch": 0.6961318919608406, + "grad_norm": 0.18771202862262726, + "learning_rate": 0.000544990822324497, + "loss": 0.9897, + "step": 21830 + }, + { + "epoch": 0.6964507796804745, + "grad_norm": 0.18119776248931885, + "learning_rate": 0.0005427089368990779, + "loss": 0.9922, + "step": 21840 + }, + { + "epoch": 0.6967696674001084, + "grad_norm": 0.1919165849685669, + "learning_rate": 0.0005404366057649998, + "loss": 1.0073, + "step": 21850 + }, + { + "epoch": 0.6970885551197423, + "grad_norm": 0.19585365056991577, + "learning_rate": 0.00053817378891829, + "loss": 0.9996, + "step": 21860 + }, + { + "epoch": 0.6974074428393763, + "grad_norm": 0.1853964477777481, + "learning_rate": 0.0005359204465224725, + "loss": 1.0088, + "step": 21870 + }, + { + "epoch": 0.6977263305590101, + "grad_norm": 0.1896389275789261, + "learning_rate": 0.0005336765389078676, + "loss": 0.9985, + "step": 21880 + }, + { + "epoch": 0.6980452182786441, + "grad_norm": 0.18373239040374756, + "learning_rate": 0.000531442026570893, + "loss": 1.0067, + "step": 21890 + }, + { + "epoch": 0.698364105998278, + "grad_norm": 0.18755769729614258, + "learning_rate": 0.0005292168701733688, + "loss": 1.0027, + "step": 21900 + }, + { + "epoch": 0.6986829937179119, + "grad_norm": 0.18125109374523163, + "learning_rate": 0.0005270010305418245, + "loss": 0.9857, + "step": 21910 + }, + { + "epoch": 0.6990018814375458, + "grad_norm": 0.18633505702018738, + "learning_rate": 0.0005247944686668097, + "loss": 0.9839, + "step": 21920 + }, + { + "epoch": 0.6993207691571798, + "grad_norm": 0.19030959904193878, + "learning_rate": 0.0005225971457022069, + "loss": 0.9982, + "step": 21930 + }, + { + "epoch": 0.6996396568768136, + "grad_norm": 0.1889505535364151, + "learning_rate": 0.0005204090229645483, + "loss": 0.9807, + "step": 21940 + }, + { + "epoch": 0.6999585445964476, + "grad_norm": 0.18198321759700775, + "learning_rate": 0.0005182300619323341, + "loss": 0.99, + "step": 21950 + }, + { + "epoch": 0.7002774323160815, + "grad_norm": 0.19034498929977417, + "learning_rate": 0.0005160602242453551, + "loss": 1.0019, + "step": 21960 + }, + { + "epoch": 0.7005963200357154, + "grad_norm": 0.1891014575958252, + "learning_rate": 0.0005138994717040161, + "loss": 0.9735, + "step": 21970 + }, + { + "epoch": 0.7009152077553493, + "grad_norm": 0.18174348771572113, + "learning_rate": 0.0005117477662686652, + "loss": 0.9999, + "step": 21980 + }, + { + "epoch": 0.7012340954749833, + "grad_norm": 0.1839553713798523, + "learning_rate": 0.0005096050700589225, + "loss": 1.0032, + "step": 21990 + }, + { + "epoch": 0.7015529831946172, + "grad_norm": 0.18982721865177155, + "learning_rate": 0.0005074713453530141, + "loss": 0.991, + "step": 22000 + }, + { + "epoch": 0.7018718709142511, + "grad_norm": 0.18580938875675201, + "learning_rate": 0.0005053465545871075, + "loss": 1.0031, + "step": 22010 + }, + { + "epoch": 0.702190758633885, + "grad_norm": 0.19052889943122864, + "learning_rate": 0.0005032306603546511, + "loss": 0.9878, + "step": 22020 + }, + { + "epoch": 0.702509646353519, + "grad_norm": 0.18959671258926392, + "learning_rate": 0.0005011236254057146, + "loss": 1.0018, + "step": 22030 + }, + { + "epoch": 0.7028285340731528, + "grad_norm": 0.18057048320770264, + "learning_rate": 0.0004990254126463343, + "loss": 0.9908, + "step": 22040 + }, + { + "epoch": 0.7031474217927868, + "grad_norm": 0.19141310453414917, + "learning_rate": 0.0004969359851378588, + "loss": 0.9963, + "step": 22050 + }, + { + "epoch": 0.7034663095124207, + "grad_norm": 0.18355531990528107, + "learning_rate": 0.0004948553060963001, + "loss": 0.9711, + "step": 22060 + }, + { + "epoch": 0.7037851972320546, + "grad_norm": 0.19848114252090454, + "learning_rate": 0.0004927833388916852, + "loss": 1.0064, + "step": 22070 + }, + { + "epoch": 0.7041040849516885, + "grad_norm": 0.18456107378005981, + "learning_rate": 0.0004907200470474113, + "loss": 0.993, + "step": 22080 + }, + { + "epoch": 0.7044229726713225, + "grad_norm": 0.18483372032642365, + "learning_rate": 0.0004886653942396035, + "loss": 0.9807, + "step": 22090 + }, + { + "epoch": 0.7047418603909563, + "grad_norm": 0.18378837406635284, + "learning_rate": 0.00048661934429647597, + "loss": 0.9992, + "step": 22100 + }, + { + "epoch": 0.7050607481105903, + "grad_norm": 0.1917727291584015, + "learning_rate": 0.0004845818611976946, + "loss": 0.9897, + "step": 22110 + }, + { + "epoch": 0.7053796358302242, + "grad_norm": 0.18209923803806305, + "learning_rate": 0.0004825529090737429, + "loss": 0.9792, + "step": 22120 + }, + { + "epoch": 0.7056985235498581, + "grad_norm": 0.17974483966827393, + "learning_rate": 0.0004805324522052906, + "loss": 0.9956, + "step": 22130 + }, + { + "epoch": 0.706017411269492, + "grad_norm": 0.18029539287090302, + "learning_rate": 0.000478520455022565, + "loss": 0.9818, + "step": 22140 + }, + { + "epoch": 0.706336298989126, + "grad_norm": 0.182785302400589, + "learning_rate": 0.0004765168821047247, + "loss": 0.9831, + "step": 22150 + }, + { + "epoch": 0.7066551867087598, + "grad_norm": 0.1831427663564682, + "learning_rate": 0.0004745216981792355, + "loss": 0.9999, + "step": 22160 + }, + { + "epoch": 0.7069740744283938, + "grad_norm": 0.18406899273395538, + "learning_rate": 0.00047253486812125044, + "loss": 0.9654, + "step": 22170 + }, + { + "epoch": 0.7072929621480277, + "grad_norm": 0.19103997945785522, + "learning_rate": 0.0004705563569529904, + "loss": 0.9865, + "step": 22180 + }, + { + "epoch": 0.7076118498676616, + "grad_norm": 0.1879732608795166, + "learning_rate": 0.00046858612984312904, + "loss": 0.9963, + "step": 22190 + }, + { + "epoch": 0.7079307375872955, + "grad_norm": 0.18674372136592865, + "learning_rate": 0.00046662415210617933, + "loss": 0.9823, + "step": 22200 + }, + { + "epoch": 0.7082496253069295, + "grad_norm": 0.18366439640522003, + "learning_rate": 0.00046467038920188283, + "loss": 0.9865, + "step": 22210 + }, + { + "epoch": 0.7085685130265633, + "grad_norm": 0.18931563198566437, + "learning_rate": 0.0004627248067346017, + "loss": 0.98, + "step": 22220 + }, + { + "epoch": 0.7088874007461973, + "grad_norm": 0.18589584529399872, + "learning_rate": 0.0004607873704527135, + "loss": 1.0023, + "step": 22230 + }, + { + "epoch": 0.7092062884658312, + "grad_norm": 0.19276930391788483, + "learning_rate": 0.00045885804624800757, + "loss": 1.0027, + "step": 22240 + }, + { + "epoch": 0.7095251761854651, + "grad_norm": 0.18109150230884552, + "learning_rate": 0.000456936800155085, + "loss": 0.9737, + "step": 22250 + }, + { + "epoch": 0.709844063905099, + "grad_norm": 0.1827128678560257, + "learning_rate": 0.00045502359835076047, + "loss": 0.9768, + "step": 22260 + }, + { + "epoch": 0.710162951624733, + "grad_norm": 0.18957406282424927, + "learning_rate": 0.00045311840715346694, + "loss": 0.9796, + "step": 22270 + }, + { + "epoch": 0.7104818393443668, + "grad_norm": 0.1889813393354416, + "learning_rate": 0.0004512211930226627, + "loss": 0.9838, + "step": 22280 + }, + { + "epoch": 0.7108007270640008, + "grad_norm": 0.18386803567409515, + "learning_rate": 0.0004493319225582409, + "loss": 0.9903, + "step": 22290 + }, + { + "epoch": 0.7111196147836347, + "grad_norm": 0.18842735886573792, + "learning_rate": 0.00044745056249994127, + "loss": 0.9888, + "step": 22300 + }, + { + "epoch": 0.7114385025032686, + "grad_norm": 0.18693885207176208, + "learning_rate": 0.00044557707972676475, + "loss": 0.9828, + "step": 22310 + }, + { + "epoch": 0.7117573902229025, + "grad_norm": 0.1820717453956604, + "learning_rate": 0.0004437114412563908, + "loss": 0.9856, + "step": 22320 + }, + { + "epoch": 0.7120762779425365, + "grad_norm": 0.18761351704597473, + "learning_rate": 0.0004418536142445961, + "loss": 0.9836, + "step": 22330 + }, + { + "epoch": 0.7123951656621703, + "grad_norm": 0.1892370730638504, + "learning_rate": 0.0004400035659846766, + "loss": 0.9766, + "step": 22340 + }, + { + "epoch": 0.7127140533818043, + "grad_norm": 0.18151234090328217, + "learning_rate": 0.00043816126390687195, + "loss": 0.9651, + "step": 22350 + }, + { + "epoch": 0.7130329411014382, + "grad_norm": 0.1909625083208084, + "learning_rate": 0.0004363266755777918, + "loss": 0.9808, + "step": 22360 + }, + { + "epoch": 0.7133518288210721, + "grad_norm": 0.17754018306732178, + "learning_rate": 0.00043449976869984496, + "loss": 0.9813, + "step": 22370 + }, + { + "epoch": 0.713670716540706, + "grad_norm": 0.1815163642168045, + "learning_rate": 0.00043268051111067067, + "loss": 0.9625, + "step": 22380 + }, + { + "epoch": 0.71398960426034, + "grad_norm": 0.18049179017543793, + "learning_rate": 0.00043086887078257267, + "loss": 0.9729, + "step": 22390 + }, + { + "epoch": 0.7143084919799738, + "grad_norm": 0.18532010912895203, + "learning_rate": 0.00042906481582195513, + "loss": 0.9576, + "step": 22400 + }, + { + "epoch": 0.7146273796996078, + "grad_norm": 0.18741180002689362, + "learning_rate": 0.0004272683144687611, + "loss": 0.9909, + "step": 22410 + }, + { + "epoch": 0.7149462674192417, + "grad_norm": 0.18309523165225983, + "learning_rate": 0.0004254793350959138, + "loss": 0.9858, + "step": 22420 + }, + { + "epoch": 0.7152651551388756, + "grad_norm": 0.18443326652050018, + "learning_rate": 0.00042369784620875905, + "loss": 0.9638, + "step": 22430 + }, + { + "epoch": 0.7155840428585095, + "grad_norm": 0.18828490376472473, + "learning_rate": 0.00042192381644451176, + "loss": 0.9697, + "step": 22440 + }, + { + "epoch": 0.7159029305781435, + "grad_norm": 0.1892024129629135, + "learning_rate": 0.0004201572145717032, + "loss": 0.9741, + "step": 22450 + }, + { + "epoch": 0.7162218182977773, + "grad_norm": 0.18395423889160156, + "learning_rate": 0.0004183980094896312, + "loss": 0.9832, + "step": 22460 + }, + { + "epoch": 0.7165407060174113, + "grad_norm": 0.18173611164093018, + "learning_rate": 0.0004166461702278128, + "loss": 0.9703, + "step": 22470 + }, + { + "epoch": 0.7168595937370452, + "grad_norm": 0.1873852014541626, + "learning_rate": 0.0004149016659454389, + "loss": 0.9676, + "step": 22480 + }, + { + "epoch": 0.7171784814566791, + "grad_norm": 0.1870831847190857, + "learning_rate": 0.00041316446593083145, + "loss": 0.9739, + "step": 22490 + }, + { + "epoch": 0.717497369176313, + "grad_norm": 0.1896754801273346, + "learning_rate": 0.00041143453960090277, + "loss": 0.9775, + "step": 22500 + }, + { + "epoch": 0.717816256895947, + "grad_norm": 0.18270163238048553, + "learning_rate": 0.0004097118565006169, + "loss": 0.9752, + "step": 22510 + }, + { + "epoch": 0.7181351446155808, + "grad_norm": 0.1867200881242752, + "learning_rate": 0.00040799638630245356, + "loss": 0.9648, + "step": 22520 + }, + { + "epoch": 0.7184540323352148, + "grad_norm": 0.19233950972557068, + "learning_rate": 0.0004062880988058746, + "loss": 0.9662, + "step": 22530 + }, + { + "epoch": 0.7187729200548487, + "grad_norm": 0.18478399515151978, + "learning_rate": 0.0004045869639367917, + "loss": 0.9691, + "step": 22540 + }, + { + "epoch": 0.7190918077744826, + "grad_norm": 0.18730990588665009, + "learning_rate": 0.0004028929517470373, + "loss": 0.9673, + "step": 22550 + }, + { + "epoch": 0.7194106954941165, + "grad_norm": 0.18217439949512482, + "learning_rate": 0.00040120603241383754, + "loss": 0.965, + "step": 22560 + }, + { + "epoch": 0.7197295832137505, + "grad_norm": 0.18644104897975922, + "learning_rate": 0.0003995261762392866, + "loss": 0.9701, + "step": 22570 + }, + { + "epoch": 0.7200484709333843, + "grad_norm": 0.1922314167022705, + "learning_rate": 0.0003978533536498247, + "loss": 0.9698, + "step": 22580 + }, + { + "epoch": 0.7203673586530183, + "grad_norm": 0.18881219625473022, + "learning_rate": 0.0003961875351957167, + "loss": 0.9628, + "step": 22590 + }, + { + "epoch": 0.7206862463726522, + "grad_norm": 0.18104930222034454, + "learning_rate": 0.000394528691550534, + "loss": 0.967, + "step": 22600 + }, + { + "epoch": 0.7210051340922861, + "grad_norm": 0.1838918924331665, + "learning_rate": 0.0003928767935106386, + "loss": 0.9563, + "step": 22610 + }, + { + "epoch": 0.72132402181192, + "grad_norm": 0.19298873841762543, + "learning_rate": 0.0003912318119946682, + "loss": 0.9705, + "step": 22620 + }, + { + "epoch": 0.721642909531554, + "grad_norm": 0.1953434944152832, + "learning_rate": 0.0003895937180430247, + "loss": 0.9669, + "step": 22630 + }, + { + "epoch": 0.7219617972511878, + "grad_norm": 0.18364940583705902, + "learning_rate": 0.0003879624828173645, + "loss": 0.9552, + "step": 22640 + }, + { + "epoch": 0.7222806849708218, + "grad_norm": 0.1896495521068573, + "learning_rate": 0.0003863380776000905, + "loss": 0.97, + "step": 22650 + }, + { + "epoch": 0.7225995726904557, + "grad_norm": 0.19229236245155334, + "learning_rate": 0.0003847204737938466, + "loss": 0.9617, + "step": 22660 + }, + { + "epoch": 0.7229184604100896, + "grad_norm": 0.1945151388645172, + "learning_rate": 0.0003831096429210144, + "loss": 0.9585, + "step": 22670 + }, + { + "epoch": 0.7232373481297235, + "grad_norm": 0.18564851582050323, + "learning_rate": 0.00038150555662321163, + "loss": 0.9655, + "step": 22680 + }, + { + "epoch": 0.7235562358493575, + "grad_norm": 0.1905517876148224, + "learning_rate": 0.0003799081866607931, + "loss": 0.976, + "step": 22690 + }, + { + "epoch": 0.7238751235689913, + "grad_norm": 0.18559037148952484, + "learning_rate": 0.00037831750491235344, + "loss": 0.9715, + "step": 22700 + }, + { + "epoch": 0.7241940112886253, + "grad_norm": 0.18964865803718567, + "learning_rate": 0.0003767334833742322, + "loss": 0.9654, + "step": 22710 + }, + { + "epoch": 0.7245128990082592, + "grad_norm": 0.1926131397485733, + "learning_rate": 0.0003751560941600205, + "loss": 0.9556, + "step": 22720 + }, + { + "epoch": 0.7248317867278931, + "grad_norm": 0.18436503410339355, + "learning_rate": 0.0003735853095000704, + "loss": 0.9575, + "step": 22730 + }, + { + "epoch": 0.725150674447527, + "grad_norm": 0.1904323697090149, + "learning_rate": 0.0003720211017410059, + "loss": 0.9488, + "step": 22740 + }, + { + "epoch": 0.725469562167161, + "grad_norm": 0.1889037936925888, + "learning_rate": 0.0003704634433452362, + "loss": 0.9564, + "step": 22750 + }, + { + "epoch": 0.7257884498867948, + "grad_norm": 0.1985059231519699, + "learning_rate": 0.0003689123068904708, + "loss": 0.9664, + "step": 22760 + }, + { + "epoch": 0.7261073376064288, + "grad_norm": 0.196369007229805, + "learning_rate": 0.00036736766506923683, + "loss": 0.9646, + "step": 22770 + }, + { + "epoch": 0.7264262253260627, + "grad_norm": 0.18976521492004395, + "learning_rate": 0.00036582949068839814, + "loss": 0.9542, + "step": 22780 + }, + { + "epoch": 0.7267451130456966, + "grad_norm": 0.19328856468200684, + "learning_rate": 0.0003642977566686768, + "loss": 0.9763, + "step": 22790 + }, + { + "epoch": 0.7270640007653305, + "grad_norm": 0.1831946074962616, + "learning_rate": 0.00036277243604417616, + "loss": 0.9544, + "step": 22800 + }, + { + "epoch": 0.7273828884849645, + "grad_norm": 0.18955983221530914, + "learning_rate": 0.00036125350196190614, + "loss": 0.9699, + "step": 22810 + }, + { + "epoch": 0.7277017762045984, + "grad_norm": 0.19156791269779205, + "learning_rate": 0.0003597409276813109, + "loss": 0.9747, + "step": 22820 + }, + { + "epoch": 0.7280206639242323, + "grad_norm": 0.1873570680618286, + "learning_rate": 0.0003582346865737974, + "loss": 0.9647, + "step": 22830 + }, + { + "epoch": 0.7283395516438662, + "grad_norm": 0.1930588185787201, + "learning_rate": 0.0003567347521222671, + "loss": 0.9737, + "step": 22840 + }, + { + "epoch": 0.7286584393635002, + "grad_norm": 0.1854488104581833, + "learning_rate": 0.00035524109792064903, + "loss": 0.9753, + "step": 22850 + }, + { + "epoch": 0.728977327083134, + "grad_norm": 0.18703170120716095, + "learning_rate": 0.00035375369767343465, + "loss": 0.9513, + "step": 22860 + }, + { + "epoch": 0.729296214802768, + "grad_norm": 0.1863420605659485, + "learning_rate": 0.0003522725251952154, + "loss": 0.9618, + "step": 22870 + }, + { + "epoch": 0.7296151025224019, + "grad_norm": 0.1924019306898117, + "learning_rate": 0.0003507975544102212, + "loss": 0.9453, + "step": 22880 + }, + { + "epoch": 0.7299339902420358, + "grad_norm": 0.19052354991436005, + "learning_rate": 0.000349328759351862, + "loss": 0.9561, + "step": 22890 + }, + { + "epoch": 0.7302528779616697, + "grad_norm": 0.18800488114356995, + "learning_rate": 0.00034786611416226987, + "loss": 0.9726, + "step": 22900 + }, + { + "epoch": 0.7305717656813037, + "grad_norm": 0.18403072655200958, + "learning_rate": 0.0003464095930918445, + "loss": 0.9536, + "step": 22910 + }, + { + "epoch": 0.7308906534009375, + "grad_norm": 0.18697819113731384, + "learning_rate": 0.0003449591704987995, + "loss": 0.9511, + "step": 22920 + }, + { + "epoch": 0.7312095411205715, + "grad_norm": 0.18541944026947021, + "learning_rate": 0.0003435148208487109, + "loss": 0.9685, + "step": 22930 + }, + { + "epoch": 0.7315284288402054, + "grad_norm": 0.1891910880804062, + "learning_rate": 0.0003420765187140679, + "loss": 0.9524, + "step": 22940 + }, + { + "epoch": 0.7318473165598393, + "grad_norm": 0.18466798961162567, + "learning_rate": 0.00034064423877382523, + "loss": 0.9507, + "step": 22950 + }, + { + "epoch": 0.7321662042794732, + "grad_norm": 0.19009530544281006, + "learning_rate": 0.000339217955812957, + "loss": 0.9511, + "step": 22960 + }, + { + "epoch": 0.7324850919991072, + "grad_norm": 0.18500375747680664, + "learning_rate": 0.0003377976447220132, + "loss": 0.9443, + "step": 22970 + }, + { + "epoch": 0.732803979718741, + "grad_norm": 0.18694646656513214, + "learning_rate": 0.0003363832804966775, + "loss": 0.9492, + "step": 22980 + }, + { + "epoch": 0.733122867438375, + "grad_norm": 0.18722723424434662, + "learning_rate": 0.00033497483823732686, + "loss": 0.9415, + "step": 22990 + }, + { + "epoch": 0.7334417551580089, + "grad_norm": 0.19515295326709747, + "learning_rate": 0.0003335722931485937, + "loss": 0.9615, + "step": 23000 + }, + { + "epoch": 0.7337606428776428, + "grad_norm": 0.18494488298892975, + "learning_rate": 0.00033217562053892876, + "loss": 0.9512, + "step": 23010 + }, + { + "epoch": 0.7340795305972767, + "grad_norm": 0.19232329726219177, + "learning_rate": 0.00033078479582016675, + "loss": 0.9603, + "step": 23020 + }, + { + "epoch": 0.7343984183169107, + "grad_norm": 0.18243782222270966, + "learning_rate": 0.0003293997945070935, + "loss": 0.9707, + "step": 23030 + }, + { + "epoch": 0.7347173060365445, + "grad_norm": 0.18182645738124847, + "learning_rate": 0.00032802059221701484, + "loss": 0.9418, + "step": 23040 + }, + { + "epoch": 0.7350361937561785, + "grad_norm": 0.19260235130786896, + "learning_rate": 0.00032664716466932733, + "loss": 0.9697, + "step": 23050 + }, + { + "epoch": 0.7353550814758124, + "grad_norm": 0.18719175457954407, + "learning_rate": 0.0003252794876850907, + "loss": 0.9499, + "step": 23060 + }, + { + "epoch": 0.7356739691954463, + "grad_norm": 0.1868394911289215, + "learning_rate": 0.00032391753718660234, + "loss": 0.9502, + "step": 23070 + }, + { + "epoch": 0.7359928569150802, + "grad_norm": 0.1856638640165329, + "learning_rate": 0.00032256128919697346, + "loss": 0.9573, + "step": 23080 + }, + { + "epoch": 0.7363117446347142, + "grad_norm": 0.18470019102096558, + "learning_rate": 0.00032121071983970694, + "loss": 0.9338, + "step": 23090 + }, + { + "epoch": 0.736630632354348, + "grad_norm": 0.1904626041650772, + "learning_rate": 0.0003198658053382767, + "loss": 0.9534, + "step": 23100 + }, + { + "epoch": 0.736949520073982, + "grad_norm": 0.18753063678741455, + "learning_rate": 0.0003185265220157095, + "loss": 0.9358, + "step": 23110 + }, + { + "epoch": 0.7372684077936159, + "grad_norm": 0.18264374136924744, + "learning_rate": 0.00031719284629416804, + "loss": 0.9542, + "step": 23120 + }, + { + "epoch": 0.7375872955132498, + "grad_norm": 0.18596158921718597, + "learning_rate": 0.0003158647546945357, + "loss": 0.9445, + "step": 23130 + }, + { + "epoch": 0.7379061832328837, + "grad_norm": 0.1909426897764206, + "learning_rate": 0.0003145422238360034, + "loss": 0.9745, + "step": 23140 + }, + { + "epoch": 0.7382250709525177, + "grad_norm": 0.1785246878862381, + "learning_rate": 0.0003132252304356578, + "loss": 0.9695, + "step": 23150 + }, + { + "epoch": 0.7385439586721515, + "grad_norm": 0.18791161477565765, + "learning_rate": 0.00031191375130807145, + "loss": 0.9535, + "step": 23160 + }, + { + "epoch": 0.7388628463917855, + "grad_norm": 0.18513339757919312, + "learning_rate": 0.0003106077633648948, + "loss": 0.9566, + "step": 23170 + }, + { + "epoch": 0.7391817341114194, + "grad_norm": 0.19226591289043427, + "learning_rate": 0.0003093072436144496, + "loss": 0.947, + "step": 23180 + }, + { + "epoch": 0.7395006218310533, + "grad_norm": 0.18903833627700806, + "learning_rate": 0.00030801216916132403, + "loss": 0.9454, + "step": 23190 + }, + { + "epoch": 0.7398195095506872, + "grad_norm": 0.18524321913719177, + "learning_rate": 0.00030672251720596967, + "loss": 0.9444, + "step": 23200 + }, + { + "epoch": 0.7401383972703212, + "grad_norm": 0.1868593543767929, + "learning_rate": 0.0003054382650443004, + "loss": 0.9544, + "step": 23210 + }, + { + "epoch": 0.740457284989955, + "grad_norm": 0.1891583651304245, + "learning_rate": 0.0003041593900672922, + "loss": 0.9377, + "step": 23220 + }, + { + "epoch": 0.740776172709589, + "grad_norm": 0.1848241090774536, + "learning_rate": 0.00030288586976058574, + "loss": 0.9334, + "step": 23230 + }, + { + "epoch": 0.7410950604292229, + "grad_norm": 0.19397152960300446, + "learning_rate": 0.00030161768170408935, + "loss": 0.9403, + "step": 23240 + }, + { + "epoch": 0.7414139481488567, + "grad_norm": 0.19353078305721283, + "learning_rate": 0.0003003548035715848, + "loss": 0.9424, + "step": 23250 + }, + { + "epoch": 0.7417328358684907, + "grad_norm": 0.1882072240114212, + "learning_rate": 0.0002990972131303341, + "loss": 0.9421, + "step": 23260 + }, + { + "epoch": 0.7420517235881247, + "grad_norm": 0.1862560659646988, + "learning_rate": 0.0002978448882406881, + "loss": 0.9413, + "step": 23270 + }, + { + "epoch": 0.7423706113077585, + "grad_norm": 0.19289855659008026, + "learning_rate": 0.00029659780685569674, + "loss": 0.9371, + "step": 23280 + }, + { + "epoch": 0.7426894990273925, + "grad_norm": 0.1900213360786438, + "learning_rate": 0.00029535594702072087, + "loss": 0.9352, + "step": 23290 + }, + { + "epoch": 0.7430083867470264, + "grad_norm": 0.19510945677757263, + "learning_rate": 0.0002941192868730457, + "loss": 0.9612, + "step": 23300 + }, + { + "epoch": 0.7433272744666602, + "grad_norm": 0.186062753200531, + "learning_rate": 0.00029288780464149593, + "loss": 0.9451, + "step": 23310 + }, + { + "epoch": 0.7436461621862942, + "grad_norm": 0.19324566423892975, + "learning_rate": 0.00029166147864605294, + "loss": 0.9531, + "step": 23320 + }, + { + "epoch": 0.7439650499059282, + "grad_norm": 0.1828330159187317, + "learning_rate": 0.0002904402872974721, + "loss": 0.9505, + "step": 23330 + }, + { + "epoch": 0.744283937625562, + "grad_norm": 0.18823856115341187, + "learning_rate": 0.00028922420909690367, + "loss": 0.9478, + "step": 23340 + }, + { + "epoch": 0.744602825345196, + "grad_norm": 0.18443375825881958, + "learning_rate": 0.00028801322263551397, + "loss": 0.9438, + "step": 23350 + }, + { + "epoch": 0.7449217130648299, + "grad_norm": 0.1899966150522232, + "learning_rate": 0.0002868073065941083, + "loss": 0.9278, + "step": 23360 + }, + { + "epoch": 0.7452406007844637, + "grad_norm": 0.1841687709093094, + "learning_rate": 0.00028560643974275587, + "loss": 0.9396, + "step": 23370 + }, + { + "epoch": 0.7455594885040977, + "grad_norm": 0.18998615443706512, + "learning_rate": 0.00028441060094041583, + "loss": 0.9382, + "step": 23380 + }, + { + "epoch": 0.7458783762237317, + "grad_norm": 0.19826436042785645, + "learning_rate": 0.0002832197691345653, + "loss": 0.9448, + "step": 23390 + }, + { + "epoch": 0.7461972639433655, + "grad_norm": 0.19598931074142456, + "learning_rate": 0.0002820339233608287, + "loss": 0.945, + "step": 23400 + }, + { + "epoch": 0.7465161516629994, + "grad_norm": 0.18679118156433105, + "learning_rate": 0.00028085304274260857, + "loss": 0.9547, + "step": 23410 + }, + { + "epoch": 0.7468350393826334, + "grad_norm": 0.1902741640806198, + "learning_rate": 0.0002796771064907181, + "loss": 0.9444, + "step": 23420 + }, + { + "epoch": 0.7471539271022672, + "grad_norm": 0.18102550506591797, + "learning_rate": 0.0002785060939030151, + "loss": 0.9373, + "step": 23430 + }, + { + "epoch": 0.7474728148219012, + "grad_norm": 0.1945812702178955, + "learning_rate": 0.0002773399843640378, + "loss": 0.9259, + "step": 23440 + }, + { + "epoch": 0.7477917025415352, + "grad_norm": 0.1866329461336136, + "learning_rate": 0.00027617875734464145, + "loss": 0.931, + "step": 23450 + }, + { + "epoch": 0.748110590261169, + "grad_norm": 0.19371329247951508, + "learning_rate": 0.00027502239240163715, + "loss": 0.9546, + "step": 23460 + }, + { + "epoch": 0.748429477980803, + "grad_norm": 0.1864960938692093, + "learning_rate": 0.00027387086917743224, + "loss": 0.9432, + "step": 23470 + }, + { + "epoch": 0.7487483657004369, + "grad_norm": 0.1891416311264038, + "learning_rate": 0.0002727241673996714, + "loss": 0.9329, + "step": 23480 + }, + { + "epoch": 0.7490672534200707, + "grad_norm": 0.18572381138801575, + "learning_rate": 0.00027158226688088006, + "loss": 0.9348, + "step": 23490 + }, + { + "epoch": 0.7493861411397047, + "grad_norm": 0.1926565021276474, + "learning_rate": 0.0002704451475181089, + "loss": 0.9434, + "step": 23500 + }, + { + "epoch": 0.7497050288593387, + "grad_norm": 0.1821991503238678, + "learning_rate": 0.00026931278929257993, + "loss": 0.9317, + "step": 23510 + }, + { + "epoch": 0.7500239165789725, + "grad_norm": 0.1843806356191635, + "learning_rate": 0.00026818517226933437, + "loss": 0.9282, + "step": 23520 + }, + { + "epoch": 0.7503428042986064, + "grad_norm": 0.19848383963108063, + "learning_rate": 0.00026706227659688107, + "loss": 0.94, + "step": 23530 + }, + { + "epoch": 0.7506616920182404, + "grad_norm": 0.1843789517879486, + "learning_rate": 0.00026594408250684776, + "loss": 0.9373, + "step": 23540 + }, + { + "epoch": 0.7509805797378742, + "grad_norm": 0.1920584887266159, + "learning_rate": 0.00026483057031363234, + "loss": 0.9448, + "step": 23550 + }, + { + "epoch": 0.7512994674575082, + "grad_norm": 0.1851128190755844, + "learning_rate": 0.00026372172041405677, + "loss": 0.9338, + "step": 23560 + }, + { + "epoch": 0.7516183551771422, + "grad_norm": 0.19818131625652313, + "learning_rate": 0.0002626175132870219, + "loss": 0.9425, + "step": 23570 + }, + { + "epoch": 0.751937242896776, + "grad_norm": 0.18199047446250916, + "learning_rate": 0.0002615179294931637, + "loss": 0.9331, + "step": 23580 + }, + { + "epoch": 0.75225613061641, + "grad_norm": 0.18399380147457123, + "learning_rate": 0.000260422949674511, + "loss": 0.9283, + "step": 23590 + }, + { + "epoch": 0.7525750183360439, + "grad_norm": 0.18700499832630157, + "learning_rate": 0.00025933255455414493, + "loss": 0.9268, + "step": 23600 + }, + { + "epoch": 0.7528939060556777, + "grad_norm": 0.1888848841190338, + "learning_rate": 0.0002582467249358593, + "loss": 0.9426, + "step": 23610 + }, + { + "epoch": 0.7532127937753117, + "grad_norm": 0.18916404247283936, + "learning_rate": 0.0002571654417038226, + "loss": 0.9448, + "step": 23620 + }, + { + "epoch": 0.7535316814949456, + "grad_norm": 0.1902066469192505, + "learning_rate": 0.0002560886858222419, + "loss": 0.9503, + "step": 23630 + }, + { + "epoch": 0.7538505692145795, + "grad_norm": 0.17855492234230042, + "learning_rate": 0.0002550164383350272, + "loss": 0.9192, + "step": 23640 + }, + { + "epoch": 0.7541694569342134, + "grad_norm": 0.1947944611310959, + "learning_rate": 0.0002539486803654581, + "loss": 0.9391, + "step": 23650 + }, + { + "epoch": 0.7544883446538474, + "grad_norm": 0.19682112336158752, + "learning_rate": 0.0002528853931158513, + "loss": 0.9362, + "step": 23660 + }, + { + "epoch": 0.7548072323734814, + "grad_norm": 0.18450482189655304, + "learning_rate": 0.0002518265578672297, + "loss": 0.9295, + "step": 23670 + }, + { + "epoch": 0.7551261200931152, + "grad_norm": 0.1883128583431244, + "learning_rate": 0.00025077215597899276, + "loss": 0.9278, + "step": 23680 + }, + { + "epoch": 0.7554450078127491, + "grad_norm": 0.1992689073085785, + "learning_rate": 0.0002497221688885888, + "loss": 0.9508, + "step": 23690 + }, + { + "epoch": 0.7557638955323831, + "grad_norm": 0.19176770746707916, + "learning_rate": 0.0002486765781111873, + "loss": 0.9338, + "step": 23700 + }, + { + "epoch": 0.7560827832520169, + "grad_norm": 0.18524746596813202, + "learning_rate": 0.0002476353652393545, + "loss": 0.9386, + "step": 23710 + }, + { + "epoch": 0.7564016709716509, + "grad_norm": 0.19836051762104034, + "learning_rate": 0.0002465985119427286, + "loss": 0.9239, + "step": 23720 + }, + { + "epoch": 0.7567205586912849, + "grad_norm": 0.19612550735473633, + "learning_rate": 0.0002455659999676974, + "loss": 0.9326, + "step": 23730 + }, + { + "epoch": 0.7570394464109187, + "grad_norm": 0.18374860286712646, + "learning_rate": 0.0002445378111370768, + "loss": 0.9327, + "step": 23740 + }, + { + "epoch": 0.7573583341305526, + "grad_norm": 0.18519578874111176, + "learning_rate": 0.00024351392734979106, + "loss": 0.9249, + "step": 23750 + }, + { + "epoch": 0.7576772218501866, + "grad_norm": 0.1952565759420395, + "learning_rate": 0.00024249433058055368, + "loss": 0.9177, + "step": 23760 + }, + { + "epoch": 0.7579961095698204, + "grad_norm": 0.1942618042230606, + "learning_rate": 0.00024147900287955056, + "loss": 0.9411, + "step": 23770 + }, + { + "epoch": 0.7583149972894544, + "grad_norm": 0.1848842203617096, + "learning_rate": 0.0002404679263721236, + "loss": 0.9188, + "step": 23780 + }, + { + "epoch": 0.7586338850090883, + "grad_norm": 0.19031748175621033, + "learning_rate": 0.00023946108325845628, + "loss": 0.9368, + "step": 23790 + }, + { + "epoch": 0.7589527727287222, + "grad_norm": 0.19315947592258453, + "learning_rate": 0.00023845845581326017, + "loss": 0.9527, + "step": 23800 + }, + { + "epoch": 0.7592716604483561, + "grad_norm": 0.2014995813369751, + "learning_rate": 0.00023746002638546287, + "loss": 0.9196, + "step": 23810 + }, + { + "epoch": 0.7595905481679901, + "grad_norm": 0.19522787630558014, + "learning_rate": 0.00023646577739789735, + "loss": 0.9334, + "step": 23820 + }, + { + "epoch": 0.7599094358876239, + "grad_norm": 0.18518857657909393, + "learning_rate": 0.00023547569134699248, + "loss": 0.931, + "step": 23830 + }, + { + "epoch": 0.7602283236072579, + "grad_norm": 0.1907796561717987, + "learning_rate": 0.00023448975080246477, + "loss": 0.9305, + "step": 23840 + }, + { + "epoch": 0.7605472113268918, + "grad_norm": 0.18997465074062347, + "learning_rate": 0.00023350793840701175, + "loss": 0.9316, + "step": 23850 + }, + { + "epoch": 0.7608660990465257, + "grad_norm": 0.18743346631526947, + "learning_rate": 0.00023253023687600625, + "loss": 0.9189, + "step": 23860 + }, + { + "epoch": 0.7611849867661596, + "grad_norm": 0.18413014709949493, + "learning_rate": 0.00023155662899719193, + "loss": 0.9461, + "step": 23870 + }, + { + "epoch": 0.7615038744857936, + "grad_norm": 0.1894141137599945, + "learning_rate": 0.00023058709763038065, + "loss": 0.9311, + "step": 23880 + }, + { + "epoch": 0.7618227622054274, + "grad_norm": 0.18807397782802582, + "learning_rate": 0.00022962162570715043, + "loss": 0.9373, + "step": 23890 + }, + { + "epoch": 0.7621416499250614, + "grad_norm": 0.1933257281780243, + "learning_rate": 0.00022866019623054502, + "loss": 0.9213, + "step": 23900 + }, + { + "epoch": 0.7624605376446953, + "grad_norm": 0.19029848277568817, + "learning_rate": 0.00022770279227477477, + "loss": 0.9353, + "step": 23910 + }, + { + "epoch": 0.7627794253643292, + "grad_norm": 0.18332846462726593, + "learning_rate": 0.00022674939698491854, + "loss": 0.9226, + "step": 23920 + }, + { + "epoch": 0.7630983130839631, + "grad_norm": 0.18277508020401, + "learning_rate": 0.000225799993576627, + "loss": 0.9239, + "step": 23930 + }, + { + "epoch": 0.7634172008035971, + "grad_norm": 0.1963907927274704, + "learning_rate": 0.0002248545653358273, + "loss": 0.9371, + "step": 23940 + }, + { + "epoch": 0.7637360885232309, + "grad_norm": 0.18964403867721558, + "learning_rate": 0.00022391309561842852, + "loss": 0.9302, + "step": 23950 + }, + { + "epoch": 0.7640549762428649, + "grad_norm": 0.1915760636329651, + "learning_rate": 0.00022297556785002887, + "loss": 0.9303, + "step": 23960 + }, + { + "epoch": 0.7643738639624988, + "grad_norm": 0.18544916808605194, + "learning_rate": 0.0002220419655256239, + "loss": 0.9178, + "step": 23970 + }, + { + "epoch": 0.7646927516821327, + "grad_norm": 0.1847660392522812, + "learning_rate": 0.00022111227220931578, + "loss": 0.9324, + "step": 23980 + }, + { + "epoch": 0.7650116394017666, + "grad_norm": 0.19324614107608795, + "learning_rate": 0.00022018647153402418, + "loss": 0.9297, + "step": 23990 + }, + { + "epoch": 0.7653305271214006, + "grad_norm": 0.18945874273777008, + "learning_rate": 0.00021926454720119792, + "loss": 0.9435, + "step": 24000 + }, + { + "epoch": 0.7656494148410344, + "grad_norm": 0.18816037476062775, + "learning_rate": 0.00021834648298052806, + "loss": 0.9106, + "step": 24010 + }, + { + "epoch": 0.7659683025606684, + "grad_norm": 0.18085907399654388, + "learning_rate": 0.00021743226270966244, + "loss": 0.9252, + "step": 24020 + }, + { + "epoch": 0.7662871902803023, + "grad_norm": 0.19740988314151764, + "learning_rate": 0.00021652187029392068, + "loss": 0.941, + "step": 24030 + }, + { + "epoch": 0.7666060779999362, + "grad_norm": 0.18533362448215485, + "learning_rate": 0.00021561528970601124, + "loss": 0.9313, + "step": 24040 + }, + { + "epoch": 0.7669249657195701, + "grad_norm": 0.1974269449710846, + "learning_rate": 0.00021471250498574909, + "loss": 0.9306, + "step": 24050 + }, + { + "epoch": 0.7672438534392041, + "grad_norm": 0.1847943365573883, + "learning_rate": 0.0002138135002397747, + "loss": 0.9329, + "step": 24060 + }, + { + "epoch": 0.7675627411588379, + "grad_norm": 0.1897907704114914, + "learning_rate": 0.0002129182596412743, + "loss": 0.9456, + "step": 24070 + }, + { + "epoch": 0.7678816288784719, + "grad_norm": 0.18913713097572327, + "learning_rate": 0.00021202676742970136, + "loss": 0.9362, + "step": 24080 + }, + { + "epoch": 0.7682005165981058, + "grad_norm": 0.18905293941497803, + "learning_rate": 0.0002111390079104989, + "loss": 0.9248, + "step": 24090 + }, + { + "epoch": 0.7685194043177397, + "grad_norm": 0.18642562627792358, + "learning_rate": 0.00021025496545482323, + "loss": 0.9298, + "step": 24100 + }, + { + "epoch": 0.7688382920373736, + "grad_norm": 0.19463413953781128, + "learning_rate": 0.0002093746244992691, + "loss": 0.9348, + "step": 24110 + }, + { + "epoch": 0.7691571797570076, + "grad_norm": 0.18571799993515015, + "learning_rate": 0.0002084979695455954, + "loss": 0.9225, + "step": 24120 + }, + { + "epoch": 0.7694760674766414, + "grad_norm": 0.19087985157966614, + "learning_rate": 0.00020762498516045232, + "loss": 0.9244, + "step": 24130 + }, + { + "epoch": 0.7697949551962754, + "grad_norm": 0.18741409480571747, + "learning_rate": 0.00020675565597510982, + "loss": 0.9278, + "step": 24140 + }, + { + "epoch": 0.7701138429159093, + "grad_norm": 0.18996545672416687, + "learning_rate": 0.00020588996668518696, + "loss": 0.9291, + "step": 24150 + }, + { + "epoch": 0.7704327306355432, + "grad_norm": 0.18844963610172272, + "learning_rate": 0.00020502790205038248, + "loss": 0.9122, + "step": 24160 + }, + { + "epoch": 0.7707516183551771, + "grad_norm": 0.19069145619869232, + "learning_rate": 0.00020416944689420654, + "loss": 0.9184, + "step": 24170 + }, + { + "epoch": 0.7710705060748111, + "grad_norm": 0.19214212894439697, + "learning_rate": 0.00020331458610371345, + "loss": 0.931, + "step": 24180 + }, + { + "epoch": 0.7713893937944449, + "grad_norm": 0.19216571748256683, + "learning_rate": 0.00020246330462923582, + "loss": 0.9262, + "step": 24190 + }, + { + "epoch": 0.7717082815140789, + "grad_norm": 0.18903692066669464, + "learning_rate": 0.00020161558748411925, + "loss": 0.9149, + "step": 24200 + }, + { + "epoch": 0.7720271692337128, + "grad_norm": 0.18422691524028778, + "learning_rate": 0.00020077141974445887, + "loss": 0.9198, + "step": 24210 + }, + { + "epoch": 0.7723460569533467, + "grad_norm": 0.190536767244339, + "learning_rate": 0.00019993078654883636, + "loss": 0.9135, + "step": 24220 + }, + { + "epoch": 0.7726649446729806, + "grad_norm": 0.18851052224636078, + "learning_rate": 0.00019909367309805842, + "loss": 0.8966, + "step": 24230 + }, + { + "epoch": 0.7729838323926146, + "grad_norm": 0.1889980584383011, + "learning_rate": 0.0001982600646548962, + "loss": 0.9211, + "step": 24240 + }, + { + "epoch": 0.7733027201122484, + "grad_norm": 0.1888222098350525, + "learning_rate": 0.0001974299465438259, + "loss": 0.9144, + "step": 24250 + }, + { + "epoch": 0.7736216078318824, + "grad_norm": 0.19497574865818024, + "learning_rate": 0.00019660330415077035, + "loss": 0.9109, + "step": 24260 + }, + { + "epoch": 0.7739404955515163, + "grad_norm": 0.19014975428581238, + "learning_rate": 0.00019578012292284172, + "loss": 0.9264, + "step": 24270 + }, + { + "epoch": 0.7742593832711502, + "grad_norm": 0.19474774599075317, + "learning_rate": 0.0001949603883680855, + "loss": 0.9307, + "step": 24280 + }, + { + "epoch": 0.7745782709907841, + "grad_norm": 0.19689294695854187, + "learning_rate": 0.0001941440860552251, + "loss": 0.9297, + "step": 24290 + }, + { + "epoch": 0.7748971587104181, + "grad_norm": 0.18944482505321503, + "learning_rate": 0.00019333120161340792, + "loss": 0.9221, + "step": 24300 + }, + { + "epoch": 0.7752160464300519, + "grad_norm": 0.18974262475967407, + "learning_rate": 0.00019252172073195239, + "loss": 0.925, + "step": 24310 + }, + { + "epoch": 0.7755349341496859, + "grad_norm": 0.19915218651294708, + "learning_rate": 0.00019171562916009603, + "loss": 0.9313, + "step": 24320 + }, + { + "epoch": 0.7758538218693198, + "grad_norm": 0.1882493942975998, + "learning_rate": 0.00019091291270674447, + "loss": 0.9154, + "step": 24330 + }, + { + "epoch": 0.7761727095889537, + "grad_norm": 0.18934135138988495, + "learning_rate": 0.00019011355724022166, + "loss": 0.9269, + "step": 24340 + }, + { + "epoch": 0.7764915973085876, + "grad_norm": 0.18505986034870148, + "learning_rate": 0.0001893175486880212, + "loss": 0.9286, + "step": 24350 + }, + { + "epoch": 0.7768104850282216, + "grad_norm": 0.187198668718338, + "learning_rate": 0.0001885248730365585, + "loss": 0.9193, + "step": 24360 + }, + { + "epoch": 0.7771293727478554, + "grad_norm": 0.18569988012313843, + "learning_rate": 0.00018773551633092397, + "loss": 0.9095, + "step": 24370 + }, + { + "epoch": 0.7774482604674894, + "grad_norm": 0.1854165643453598, + "learning_rate": 0.00018694946467463756, + "loss": 0.9035, + "step": 24380 + }, + { + "epoch": 0.7777671481871233, + "grad_norm": 0.18419300019741058, + "learning_rate": 0.00018616670422940394, + "loss": 0.9207, + "step": 24390 + }, + { + "epoch": 0.7780860359067572, + "grad_norm": 0.19491253793239594, + "learning_rate": 0.00018538722121486895, + "loss": 0.9235, + "step": 24400 + }, + { + "epoch": 0.7784049236263911, + "grad_norm": 0.18213653564453125, + "learning_rate": 0.00018461100190837707, + "loss": 0.9231, + "step": 24410 + }, + { + "epoch": 0.7787238113460251, + "grad_norm": 0.18353652954101562, + "learning_rate": 0.0001838380326447297, + "loss": 0.9278, + "step": 24420 + }, + { + "epoch": 0.7790426990656589, + "grad_norm": 0.18761824071407318, + "learning_rate": 0.00018306829981594458, + "loss": 0.9237, + "step": 24430 + }, + { + "epoch": 0.7793615867852929, + "grad_norm": 0.1892223060131073, + "learning_rate": 0.0001823017898710165, + "loss": 0.9195, + "step": 24440 + }, + { + "epoch": 0.7796804745049268, + "grad_norm": 0.1860656440258026, + "learning_rate": 0.00018153848931567836, + "loss": 0.9091, + "step": 24450 + }, + { + "epoch": 0.7799993622245607, + "grad_norm": 0.19069086015224457, + "learning_rate": 0.00018077838471216377, + "loss": 0.9207, + "step": 24460 + }, + { + "epoch": 0.7803182499441946, + "grad_norm": 0.18532173335552216, + "learning_rate": 0.00018002146267897054, + "loss": 0.9138, + "step": 24470 + }, + { + "epoch": 0.7806371376638286, + "grad_norm": 0.18655841052532196, + "learning_rate": 0.00017926770989062511, + "loss": 0.9038, + "step": 24480 + }, + { + "epoch": 0.7809560253834625, + "grad_norm": 0.18871451914310455, + "learning_rate": 0.0001785171130774477, + "loss": 0.9208, + "step": 24490 + }, + { + "epoch": 0.7812749131030964, + "grad_norm": 0.1932729184627533, + "learning_rate": 0.00017776965902531916, + "loss": 0.9235, + "step": 24500 + }, + { + "epoch": 0.7815938008227303, + "grad_norm": 0.19129279255867004, + "learning_rate": 0.00017702533457544784, + "loss": 0.9015, + "step": 24510 + }, + { + "epoch": 0.7819126885423643, + "grad_norm": 0.19334463775157928, + "learning_rate": 0.00017628412662413823, + "loss": 0.9229, + "step": 24520 + }, + { + "epoch": 0.7822315762619981, + "grad_norm": 0.18650083243846893, + "learning_rate": 0.0001755460221225603, + "loss": 0.9277, + "step": 24530 + }, + { + "epoch": 0.7825504639816321, + "grad_norm": 0.18478453159332275, + "learning_rate": 0.00017481100807651963, + "loss": 0.9089, + "step": 24540 + }, + { + "epoch": 0.782869351701266, + "grad_norm": 0.19256654381752014, + "learning_rate": 0.00017407907154622863, + "loss": 0.9176, + "step": 24550 + }, + { + "epoch": 0.7831882394208999, + "grad_norm": 0.1951526403427124, + "learning_rate": 0.00017335019964607887, + "loss": 0.9103, + "step": 24560 + }, + { + "epoch": 0.7835071271405338, + "grad_norm": 0.1941751390695572, + "learning_rate": 0.00017262437954441417, + "loss": 0.9187, + "step": 24570 + }, + { + "epoch": 0.7838260148601678, + "grad_norm": 0.1897512674331665, + "learning_rate": 0.00017190159846330476, + "loss": 0.915, + "step": 24580 + }, + { + "epoch": 0.7841449025798016, + "grad_norm": 0.18099430203437805, + "learning_rate": 0.00017118184367832215, + "loss": 0.92, + "step": 24590 + }, + { + "epoch": 0.7844637902994356, + "grad_norm": 0.18526895344257355, + "learning_rate": 0.00017046510251831525, + "loss": 0.9181, + "step": 24600 + }, + { + "epoch": 0.7847826780190695, + "grad_norm": 0.18779991567134857, + "learning_rate": 0.0001697513623651875, + "loss": 0.9108, + "step": 24610 + }, + { + "epoch": 0.7851015657387034, + "grad_norm": 0.1872919648885727, + "learning_rate": 0.00016904061065367424, + "loss": 0.9005, + "step": 24620 + }, + { + "epoch": 0.7854204534583373, + "grad_norm": 0.19210565090179443, + "learning_rate": 0.00016833283487112187, + "loss": 0.9142, + "step": 24630 + }, + { + "epoch": 0.7857393411779713, + "grad_norm": 0.19088362157344818, + "learning_rate": 0.00016762802255726757, + "loss": 0.9082, + "step": 24640 + }, + { + "epoch": 0.7860582288976051, + "grad_norm": 0.19318880140781403, + "learning_rate": 0.0001669261613040197, + "loss": 0.9098, + "step": 24650 + }, + { + "epoch": 0.7863771166172391, + "grad_norm": 0.19324028491973877, + "learning_rate": 0.00016622723875523958, + "loss": 0.9167, + "step": 24660 + }, + { + "epoch": 0.786696004336873, + "grad_norm": 0.18459977209568024, + "learning_rate": 0.0001655312426065239, + "loss": 0.9013, + "step": 24670 + }, + { + "epoch": 0.7870148920565069, + "grad_norm": 0.18851350247859955, + "learning_rate": 0.00016483816060498802, + "loss": 0.9235, + "step": 24680 + }, + { + "epoch": 0.7873337797761408, + "grad_norm": 0.19676366448402405, + "learning_rate": 0.00016414798054905036, + "loss": 0.9191, + "step": 24690 + }, + { + "epoch": 0.7876526674957748, + "grad_norm": 0.18468333780765533, + "learning_rate": 0.0001634606902882176, + "loss": 0.9007, + "step": 24700 + }, + { + "epoch": 0.7879715552154086, + "grad_norm": 0.18907414376735687, + "learning_rate": 0.00016277627772287072, + "loss": 0.9181, + "step": 24710 + }, + { + "epoch": 0.7882904429350426, + "grad_norm": 0.18885044753551483, + "learning_rate": 0.00016209473080405187, + "loss": 0.8997, + "step": 24720 + }, + { + "epoch": 0.7886093306546765, + "grad_norm": 0.18907248973846436, + "learning_rate": 0.0001614160375332526, + "loss": 0.9155, + "step": 24730 + }, + { + "epoch": 0.7889282183743104, + "grad_norm": 0.19450700283050537, + "learning_rate": 0.00016074018596220224, + "loss": 0.9155, + "step": 24740 + }, + { + "epoch": 0.7892471060939443, + "grad_norm": 0.18370944261550903, + "learning_rate": 0.00016006716419265783, + "loss": 0.9325, + "step": 24750 + }, + { + "epoch": 0.7895659938135783, + "grad_norm": 0.18763361871242523, + "learning_rate": 0.00015939696037619444, + "loss": 0.9066, + "step": 24760 + }, + { + "epoch": 0.7898848815332121, + "grad_norm": 0.19110409915447235, + "learning_rate": 0.00015872956271399674, + "loss": 0.8973, + "step": 24770 + }, + { + "epoch": 0.7902037692528461, + "grad_norm": 0.18324126303195953, + "learning_rate": 0.00015806495945665133, + "loss": 0.9194, + "step": 24780 + }, + { + "epoch": 0.79052265697248, + "grad_norm": 0.19040825963020325, + "learning_rate": 0.00015740313890393964, + "loss": 0.9321, + "step": 24790 + }, + { + "epoch": 0.7908415446921139, + "grad_norm": 0.1885160505771637, + "learning_rate": 0.00015674408940463216, + "loss": 0.9003, + "step": 24800 + }, + { + "epoch": 0.7911604324117478, + "grad_norm": 0.1887117624282837, + "learning_rate": 0.00015608779935628333, + "loss": 0.9166, + "step": 24810 + }, + { + "epoch": 0.7914793201313818, + "grad_norm": 0.19734545052051544, + "learning_rate": 0.0001554342572050271, + "loss": 0.9099, + "step": 24820 + }, + { + "epoch": 0.7917982078510156, + "grad_norm": 0.19855238497257233, + "learning_rate": 0.00015478345144537376, + "loss": 0.9131, + "step": 24830 + }, + { + "epoch": 0.7921170955706496, + "grad_norm": 0.1890731155872345, + "learning_rate": 0.0001541353706200072, + "loss": 0.9044, + "step": 24840 + }, + { + "epoch": 0.7924359832902835, + "grad_norm": 0.1917041540145874, + "learning_rate": 0.0001534900033195833, + "loss": 0.9151, + "step": 24850 + }, + { + "epoch": 0.7927548710099174, + "grad_norm": 0.19247248768806458, + "learning_rate": 0.00015284733818252897, + "loss": 0.9068, + "step": 24860 + }, + { + "epoch": 0.7930737587295513, + "grad_norm": 0.18687069416046143, + "learning_rate": 0.00015220736389484244, + "loss": 0.9018, + "step": 24870 + }, + { + "epoch": 0.7933926464491853, + "grad_norm": 0.1889362782239914, + "learning_rate": 0.00015157006918989363, + "loss": 0.9096, + "step": 24880 + }, + { + "epoch": 0.7937115341688191, + "grad_norm": 0.19133806228637695, + "learning_rate": 0.00015093544284822607, + "loss": 0.9034, + "step": 24890 + }, + { + "epoch": 0.7940304218884531, + "grad_norm": 0.1814614236354828, + "learning_rate": 0.0001503034736973594, + "loss": 0.9042, + "step": 24900 + }, + { + "epoch": 0.794349309608087, + "grad_norm": 0.1918705403804779, + "learning_rate": 0.00014967415061159254, + "loss": 0.9141, + "step": 24910 + }, + { + "epoch": 0.7946681973277209, + "grad_norm": 0.18677212297916412, + "learning_rate": 0.00014904746251180796, + "loss": 0.9097, + "step": 24920 + }, + { + "epoch": 0.7949870850473548, + "grad_norm": 0.19333551824092865, + "learning_rate": 0.00014842339836527651, + "loss": 0.9008, + "step": 24930 + }, + { + "epoch": 0.7953059727669888, + "grad_norm": 0.19339174032211304, + "learning_rate": 0.00014780194718546334, + "loss": 0.9083, + "step": 24940 + }, + { + "epoch": 0.7956248604866226, + "grad_norm": 0.19391214847564697, + "learning_rate": 0.00014718309803183436, + "loss": 0.9166, + "step": 24950 + }, + { + "epoch": 0.7959437482062566, + "grad_norm": 0.19230784475803375, + "learning_rate": 0.00014656684000966363, + "loss": 0.9076, + "step": 24960 + }, + { + "epoch": 0.7962626359258905, + "grad_norm": 0.1877783089876175, + "learning_rate": 0.00014595316226984173, + "loss": 0.9016, + "step": 24970 + }, + { + "epoch": 0.7965815236455244, + "grad_norm": 0.1867537945508957, + "learning_rate": 0.00014534205400868448, + "loss": 0.9091, + "step": 24980 + }, + { + "epoch": 0.7969004113651583, + "grad_norm": 0.18787339329719543, + "learning_rate": 0.000144733504467743, + "loss": 0.8847, + "step": 24990 + }, + { + "epoch": 0.7972192990847923, + "grad_norm": 0.19018739461898804, + "learning_rate": 0.00014412750293361419, + "loss": 0.9036, + "step": 25000 + }, + { + "epoch": 0.7975381868044261, + "grad_norm": 0.19432775676250458, + "learning_rate": 0.00014352403873775206, + "loss": 0.9233, + "step": 25010 + }, + { + "epoch": 0.7978570745240601, + "grad_norm": 0.19632703065872192, + "learning_rate": 0.0001429231012562802, + "loss": 0.901, + "step": 25020 + }, + { + "epoch": 0.798175962243694, + "grad_norm": 0.1939215213060379, + "learning_rate": 0.00014232467990980434, + "loss": 0.8983, + "step": 25030 + }, + { + "epoch": 0.7984948499633279, + "grad_norm": 0.2033773511648178, + "learning_rate": 0.0001417287641632264, + "loss": 0.9169, + "step": 25040 + }, + { + "epoch": 0.7988137376829618, + "grad_norm": 0.18326032161712646, + "learning_rate": 0.00014113534352555893, + "loss": 0.9006, + "step": 25050 + }, + { + "epoch": 0.7991326254025958, + "grad_norm": 0.18715116381645203, + "learning_rate": 0.00014054440754974036, + "loss": 0.8999, + "step": 25060 + }, + { + "epoch": 0.7994515131222296, + "grad_norm": 0.19295094907283783, + "learning_rate": 0.00013995594583245116, + "loss": 0.9173, + "step": 25070 + }, + { + "epoch": 0.7997704008418636, + "grad_norm": 0.19653460383415222, + "learning_rate": 0.0001393699480139307, + "loss": 0.9094, + "step": 25080 + }, + { + "epoch": 0.8000892885614975, + "grad_norm": 0.18987542390823364, + "learning_rate": 0.00013878640377779487, + "loss": 0.9136, + "step": 25090 + }, + { + "epoch": 0.8004081762811314, + "grad_norm": 0.1913256049156189, + "learning_rate": 0.00013820530285085425, + "loss": 0.893, + "step": 25100 + }, + { + "epoch": 0.8007270640007653, + "grad_norm": 0.1863279491662979, + "learning_rate": 0.0001376266350029336, + "loss": 0.9053, + "step": 25110 + }, + { + "epoch": 0.8010459517203993, + "grad_norm": 0.1903347223997116, + "learning_rate": 0.0001370503900466916, + "loss": 0.9071, + "step": 25120 + }, + { + "epoch": 0.8013648394400331, + "grad_norm": 0.19538167119026184, + "learning_rate": 0.00013647655783744143, + "loss": 0.8983, + "step": 25130 + }, + { + "epoch": 0.8016837271596671, + "grad_norm": 0.19056464731693268, + "learning_rate": 0.00013590512827297215, + "loss": 0.9075, + "step": 25140 + }, + { + "epoch": 0.802002614879301, + "grad_norm": 0.19379113614559174, + "learning_rate": 0.00013533609129337112, + "loss": 0.8932, + "step": 25150 + }, + { + "epoch": 0.8023215025989349, + "grad_norm": 0.19236436486244202, + "learning_rate": 0.00013476943688084665, + "loss": 0.9199, + "step": 25160 + }, + { + "epoch": 0.8026403903185688, + "grad_norm": 0.19388633966445923, + "learning_rate": 0.00013420515505955158, + "loss": 0.8861, + "step": 25170 + }, + { + "epoch": 0.8029592780382028, + "grad_norm": 0.18646042048931122, + "learning_rate": 0.000133643235895408, + "loss": 0.8961, + "step": 25180 + }, + { + "epoch": 0.8032781657578366, + "grad_norm": 0.19682526588439941, + "learning_rate": 0.00013308366949593199, + "loss": 0.8948, + "step": 25190 + }, + { + "epoch": 0.8035970534774706, + "grad_norm": 0.1909864842891693, + "learning_rate": 0.0001325264460100597, + "loss": 0.9045, + "step": 25200 + }, + { + "epoch": 0.8039159411971045, + "grad_norm": 0.19391486048698425, + "learning_rate": 0.0001319715556279738, + "loss": 0.913, + "step": 25210 + }, + { + "epoch": 0.8042348289167384, + "grad_norm": 0.18908917903900146, + "learning_rate": 0.00013141898858093086, + "loss": 0.9024, + "step": 25220 + }, + { + "epoch": 0.8045537166363723, + "grad_norm": 0.18832768499851227, + "learning_rate": 0.00013086873514108925, + "loss": 0.892, + "step": 25230 + }, + { + "epoch": 0.8048726043560063, + "grad_norm": 0.19684334099292755, + "learning_rate": 0.00013032078562133812, + "loss": 0.9172, + "step": 25240 + }, + { + "epoch": 0.8051914920756401, + "grad_norm": 0.18681035935878754, + "learning_rate": 0.0001297751303751266, + "loss": 0.9105, + "step": 25250 + }, + { + "epoch": 0.8055103797952741, + "grad_norm": 0.19750331342220306, + "learning_rate": 0.00012923175979629407, + "loss": 0.8844, + "step": 25260 + }, + { + "epoch": 0.805829267514908, + "grad_norm": 0.17914694547653198, + "learning_rate": 0.00012869066431890117, + "loss": 0.8821, + "step": 25270 + }, + { + "epoch": 0.8061481552345419, + "grad_norm": 0.18732082843780518, + "learning_rate": 0.00012815183441706112, + "loss": 0.9071, + "step": 25280 + }, + { + "epoch": 0.8064670429541758, + "grad_norm": 0.18838287889957428, + "learning_rate": 0.0001276152606047724, + "loss": 0.9071, + "step": 25290 + }, + { + "epoch": 0.8067859306738098, + "grad_norm": 0.18626269698143005, + "learning_rate": 0.0001270809334357514, + "loss": 0.8998, + "step": 25300 + }, + { + "epoch": 0.8071048183934437, + "grad_norm": 0.19195237755775452, + "learning_rate": 0.00012654884350326619, + "loss": 0.8856, + "step": 25310 + }, + { + "epoch": 0.8074237061130776, + "grad_norm": 0.19287879765033722, + "learning_rate": 0.00012601898143997108, + "loss": 0.9198, + "step": 25320 + }, + { + "epoch": 0.8077425938327115, + "grad_norm": 0.18623973429203033, + "learning_rate": 0.00012549133791774162, + "loss": 0.9028, + "step": 25330 + }, + { + "epoch": 0.8080614815523455, + "grad_norm": 0.1909293383359909, + "learning_rate": 0.0001249659036475103, + "loss": 0.9023, + "step": 25340 + }, + { + "epoch": 0.8083803692719793, + "grad_norm": 0.1917041689157486, + "learning_rate": 0.00012444266937910312, + "loss": 0.9037, + "step": 25350 + }, + { + "epoch": 0.8086992569916133, + "grad_norm": 0.1956179291009903, + "learning_rate": 0.0001239216259010767, + "loss": 0.897, + "step": 25360 + }, + { + "epoch": 0.8090181447112472, + "grad_norm": 0.18946874141693115, + "learning_rate": 0.00012340276404055616, + "loss": 0.8903, + "step": 25370 + }, + { + "epoch": 0.8093370324308811, + "grad_norm": 0.19109565019607544, + "learning_rate": 0.00012288607466307355, + "loss": 0.9091, + "step": 25380 + }, + { + "epoch": 0.809655920150515, + "grad_norm": 0.19188068807125092, + "learning_rate": 0.0001223715486724071, + "loss": 0.8943, + "step": 25390 + }, + { + "epoch": 0.809974807870149, + "grad_norm": 0.1854020357131958, + "learning_rate": 0.00012185917701042106, + "loss": 0.8957, + "step": 25400 + }, + { + "epoch": 0.8102936955897828, + "grad_norm": 0.18928898870944977, + "learning_rate": 0.0001213489506569063, + "loss": 0.8974, + "step": 25410 + }, + { + "epoch": 0.8106125833094168, + "grad_norm": 0.19741034507751465, + "learning_rate": 0.00012084086062942134, + "loss": 0.9008, + "step": 25420 + }, + { + "epoch": 0.8109314710290507, + "grad_norm": 0.1944274604320526, + "learning_rate": 0.00012033489798313444, + "loss": 0.8994, + "step": 25430 + }, + { + "epoch": 0.8112503587486846, + "grad_norm": 0.1925448775291443, + "learning_rate": 0.00011983105381066592, + "loss": 0.9098, + "step": 25440 + }, + { + "epoch": 0.8115692464683185, + "grad_norm": 0.19471345841884613, + "learning_rate": 0.00011932931924193155, + "loss": 0.8981, + "step": 25450 + }, + { + "epoch": 0.8118881341879525, + "grad_norm": 0.18608881533145905, + "learning_rate": 0.0001188296854439862, + "loss": 0.8864, + "step": 25460 + }, + { + "epoch": 0.8122070219075863, + "grad_norm": 0.19888868927955627, + "learning_rate": 0.00011833214362086844, + "loss": 0.9101, + "step": 25470 + }, + { + "epoch": 0.8125259096272203, + "grad_norm": 0.18246127665042877, + "learning_rate": 0.00011783668501344572, + "loss": 0.9138, + "step": 25480 + }, + { + "epoch": 0.8128447973468542, + "grad_norm": 0.19178058207035065, + "learning_rate": 0.00011734330089926006, + "loss": 0.9087, + "step": 25490 + }, + { + "epoch": 0.8131636850664881, + "grad_norm": 0.19574995338916779, + "learning_rate": 0.0001168519825923746, + "loss": 0.9075, + "step": 25500 + }, + { + "epoch": 0.813482572786122, + "grad_norm": 0.19114628434181213, + "learning_rate": 0.00011636272144322059, + "loss": 0.9029, + "step": 25510 + }, + { + "epoch": 0.813801460505756, + "grad_norm": 0.20036236941814423, + "learning_rate": 0.00011587550883844523, + "loss": 0.8975, + "step": 25520 + }, + { + "epoch": 0.8141203482253898, + "grad_norm": 0.19069448113441467, + "learning_rate": 0.00011539033620075986, + "loss": 0.9043, + "step": 25530 + }, + { + "epoch": 0.8144392359450238, + "grad_norm": 0.1888374239206314, + "learning_rate": 0.00011490719498878924, + "loss": 0.9047, + "step": 25540 + }, + { + "epoch": 0.8147581236646577, + "grad_norm": 0.19065211713314056, + "learning_rate": 0.00011442607669692085, + "loss": 0.9017, + "step": 25550 + }, + { + "epoch": 0.8150770113842916, + "grad_norm": 0.18727807700634003, + "learning_rate": 0.00011394697285515537, + "loss": 0.8943, + "step": 25560 + }, + { + "epoch": 0.8153958991039255, + "grad_norm": 0.18859420716762543, + "learning_rate": 0.0001134698750289575, + "loss": 0.9018, + "step": 25570 + }, + { + "epoch": 0.8157147868235595, + "grad_norm": 0.18913602828979492, + "learning_rate": 0.00011299477481910747, + "loss": 0.9019, + "step": 25580 + }, + { + "epoch": 0.8160336745431933, + "grad_norm": 0.19403930008411407, + "learning_rate": 0.00011252166386155319, + "loss": 0.9129, + "step": 25590 + }, + { + "epoch": 0.8163525622628273, + "grad_norm": 0.1911238133907318, + "learning_rate": 0.00011205053382726299, + "loss": 0.8979, + "step": 25600 + }, + { + "epoch": 0.8166714499824612, + "grad_norm": 0.19056203961372375, + "learning_rate": 0.00011158137642207893, + "loss": 0.8961, + "step": 25610 + }, + { + "epoch": 0.8169903377020951, + "grad_norm": 0.1901327222585678, + "learning_rate": 0.00011111418338657102, + "loss": 0.9004, + "step": 25620 + }, + { + "epoch": 0.817309225421729, + "grad_norm": 0.19726039469242096, + "learning_rate": 0.00011064894649589143, + "loss": 0.9107, + "step": 25630 + }, + { + "epoch": 0.817628113141363, + "grad_norm": 0.1873912811279297, + "learning_rate": 0.00011018565755962999, + "loss": 0.9057, + "step": 25640 + }, + { + "epoch": 0.8179470008609968, + "grad_norm": 0.1929081529378891, + "learning_rate": 0.00010972430842166995, + "loss": 0.883, + "step": 25650 + }, + { + "epoch": 0.8182658885806308, + "grad_norm": 0.19839973747730255, + "learning_rate": 0.0001092648909600443, + "loss": 0.9024, + "step": 25660 + }, + { + "epoch": 0.8185847763002647, + "grad_norm": 0.19346867501735687, + "learning_rate": 0.00010880739708679283, + "loss": 0.8958, + "step": 25670 + }, + { + "epoch": 0.8189036640198986, + "grad_norm": 0.1984575092792511, + "learning_rate": 0.00010835181874781979, + "loss": 0.9047, + "step": 25680 + }, + { + "epoch": 0.8192225517395325, + "grad_norm": 0.1977538913488388, + "learning_rate": 0.00010789814792275205, + "loss": 0.9038, + "step": 25690 + }, + { + "epoch": 0.8195414394591665, + "grad_norm": 0.2003737837076187, + "learning_rate": 0.00010744637662479786, + "loss": 0.91, + "step": 25700 + }, + { + "epoch": 0.8198603271788003, + "grad_norm": 0.19274483621120453, + "learning_rate": 0.00010699649690060641, + "loss": 0.8969, + "step": 25710 + }, + { + "epoch": 0.8201792148984343, + "grad_norm": 0.18908193707466125, + "learning_rate": 0.00010654850083012758, + "loss": 0.8835, + "step": 25720 + }, + { + "epoch": 0.8204981026180682, + "grad_norm": 0.19787780940532684, + "learning_rate": 0.00010610238052647271, + "loss": 0.8967, + "step": 25730 + }, + { + "epoch": 0.8208169903377021, + "grad_norm": 0.1946023851633072, + "learning_rate": 0.00010565812813577561, + "loss": 0.8948, + "step": 25740 + }, + { + "epoch": 0.821135878057336, + "grad_norm": 0.19036608934402466, + "learning_rate": 0.00010521573583705442, + "loss": 0.8982, + "step": 25750 + }, + { + "epoch": 0.82145476577697, + "grad_norm": 0.19042381644248962, + "learning_rate": 0.00010477519584207379, + "loss": 0.8739, + "step": 25760 + }, + { + "epoch": 0.8217736534966038, + "grad_norm": 0.19218648970127106, + "learning_rate": 0.00010433650039520788, + "loss": 0.895, + "step": 25770 + }, + { + "epoch": 0.8220925412162378, + "grad_norm": 0.19477754831314087, + "learning_rate": 0.00010389964177330376, + "loss": 0.8941, + "step": 25780 + }, + { + "epoch": 0.8224114289358717, + "grad_norm": 0.19564403593540192, + "learning_rate": 0.00010346461228554552, + "loss": 0.9025, + "step": 25790 + }, + { + "epoch": 0.8227303166555056, + "grad_norm": 0.19064311683177948, + "learning_rate": 0.00010303140427331876, + "loss": 0.8768, + "step": 25800 + }, + { + "epoch": 0.8230492043751395, + "grad_norm": 0.18618376553058624, + "learning_rate": 0.00010260001011007584, + "loss": 0.8988, + "step": 25810 + }, + { + "epoch": 0.8233680920947735, + "grad_norm": 0.18917080760002136, + "learning_rate": 0.00010217042220120158, + "loss": 0.9023, + "step": 25820 + }, + { + "epoch": 0.8236869798144073, + "grad_norm": 0.19517607986927032, + "learning_rate": 0.00010174263298387963, + "loss": 0.8916, + "step": 25830 + }, + { + "epoch": 0.8240058675340413, + "grad_norm": 0.18709541857242584, + "learning_rate": 0.00010131663492695925, + "loss": 0.8908, + "step": 25840 + }, + { + "epoch": 0.8243247552536752, + "grad_norm": 0.19261054694652557, + "learning_rate": 0.00010089242053082271, + "loss": 0.8996, + "step": 25850 + }, + { + "epoch": 0.824643642973309, + "grad_norm": 0.19508786499500275, + "learning_rate": 0.00010046998232725337, + "loss": 0.8903, + "step": 25860 + }, + { + "epoch": 0.824962530692943, + "grad_norm": 0.1922270953655243, + "learning_rate": 0.00010004931287930405, + "loss": 0.8815, + "step": 25870 + }, + { + "epoch": 0.825281418412577, + "grad_norm": 0.19108553230762482, + "learning_rate": 9.96304047811663e-05, + "loss": 0.8864, + "step": 25880 + }, + { + "epoch": 0.8256003061322108, + "grad_norm": 0.19563859701156616, + "learning_rate": 9.921325065803983e-05, + "loss": 0.8991, + "step": 25890 + }, + { + "epoch": 0.8259191938518448, + "grad_norm": 0.19630859792232513, + "learning_rate": 9.879784316600278e-05, + "loss": 0.8808, + "step": 25900 + }, + { + "epoch": 0.8262380815714787, + "grad_norm": 0.2010757029056549, + "learning_rate": 9.838417499188239e-05, + "loss": 0.897, + "step": 25910 + }, + { + "epoch": 0.8265569692911126, + "grad_norm": 0.1868877112865448, + "learning_rate": 9.79722388531263e-05, + "loss": 0.8982, + "step": 25920 + }, + { + "epoch": 0.8268758570107465, + "grad_norm": 0.19479702413082123, + "learning_rate": 9.756202749767429e-05, + "loss": 0.8909, + "step": 25930 + }, + { + "epoch": 0.8271947447303805, + "grad_norm": 0.19186285138130188, + "learning_rate": 9.715353370383069e-05, + "loss": 0.8999, + "step": 25940 + }, + { + "epoch": 0.8275136324500143, + "grad_norm": 0.1874547004699707, + "learning_rate": 9.674675028013712e-05, + "loss": 0.8698, + "step": 25950 + }, + { + "epoch": 0.8278325201696483, + "grad_norm": 0.1925581693649292, + "learning_rate": 9.634167006524597e-05, + "loss": 0.8985, + "step": 25960 + }, + { + "epoch": 0.8281514078892822, + "grad_norm": 0.19503287971019745, + "learning_rate": 9.593828592779434e-05, + "loss": 0.8946, + "step": 25970 + }, + { + "epoch": 0.828470295608916, + "grad_norm": 0.2057391107082367, + "learning_rate": 9.553659076627844e-05, + "loss": 0.9048, + "step": 25980 + }, + { + "epoch": 0.82878918332855, + "grad_norm": 0.1989460438489914, + "learning_rate": 9.513657750892853e-05, + "loss": 0.8884, + "step": 25990 + }, + { + "epoch": 0.829108071048184, + "grad_norm": 0.19758674502372742, + "learning_rate": 9.473823911358465e-05, + "loss": 0.8884, + "step": 26000 + }, + { + "epoch": 0.8294269587678178, + "grad_norm": 0.18588489294052124, + "learning_rate": 9.434156856757232e-05, + "loss": 0.885, + "step": 26010 + }, + { + "epoch": 0.8297458464874518, + "grad_norm": 0.19282850623130798, + "learning_rate": 9.394655888757933e-05, + "loss": 0.8843, + "step": 26020 + }, + { + "epoch": 0.8300647342070857, + "grad_norm": 0.19358274340629578, + "learning_rate": 9.355320311953263e-05, + "loss": 0.9003, + "step": 26030 + }, + { + "epoch": 0.8303836219267196, + "grad_norm": 0.19735553860664368, + "learning_rate": 9.316149433847609e-05, + "loss": 0.8926, + "step": 26040 + }, + { + "epoch": 0.8307025096463535, + "grad_norm": 0.1917579174041748, + "learning_rate": 9.27714256484484e-05, + "loss": 0.9033, + "step": 26050 + }, + { + "epoch": 0.8310213973659875, + "grad_norm": 0.19253985583782196, + "learning_rate": 9.238299018236176e-05, + "loss": 0.892, + "step": 26060 + }, + { + "epoch": 0.8313402850856213, + "grad_norm": 0.19036774337291718, + "learning_rate": 9.199618110188106e-05, + "loss": 0.8794, + "step": 26070 + }, + { + "epoch": 0.8316591728052553, + "grad_norm": 0.19188916683197021, + "learning_rate": 9.161099159730329e-05, + "loss": 0.891, + "step": 26080 + }, + { + "epoch": 0.8319780605248892, + "grad_norm": 0.19725832343101501, + "learning_rate": 9.122741488743787e-05, + "loss": 0.8884, + "step": 26090 + }, + { + "epoch": 0.832296948244523, + "grad_norm": 0.19478730857372284, + "learning_rate": 9.084544421948714e-05, + "loss": 0.8941, + "step": 26100 + }, + { + "epoch": 0.832615835964157, + "grad_norm": 0.1925089806318283, + "learning_rate": 9.046507286892751e-05, + "loss": 0.9235, + "step": 26110 + }, + { + "epoch": 0.832934723683791, + "grad_norm": 0.1941222846508026, + "learning_rate": 9.008629413939108e-05, + "loss": 0.8937, + "step": 26120 + }, + { + "epoch": 0.8332536114034248, + "grad_norm": 0.19237853586673737, + "learning_rate": 8.970910136254777e-05, + "loss": 0.8875, + "step": 26130 + }, + { + "epoch": 0.8335724991230588, + "grad_norm": 0.19570504128932953, + "learning_rate": 8.93334878979879e-05, + "loss": 0.8816, + "step": 26140 + }, + { + "epoch": 0.8338913868426927, + "grad_norm": 0.19162657856941223, + "learning_rate": 8.895944713310525e-05, + "loss": 0.8913, + "step": 26150 + }, + { + "epoch": 0.8342102745623267, + "grad_norm": 0.1932561993598938, + "learning_rate": 8.858697248298071e-05, + "loss": 0.9011, + "step": 26160 + }, + { + "epoch": 0.8345291622819605, + "grad_norm": 0.193185493350029, + "learning_rate": 8.821605739026645e-05, + "loss": 0.8667, + "step": 26170 + }, + { + "epoch": 0.8348480500015945, + "grad_norm": 0.2088388353586197, + "learning_rate": 8.784669532507018e-05, + "loss": 0.9125, + "step": 26180 + }, + { + "epoch": 0.8351669377212284, + "grad_norm": 0.19375212490558624, + "learning_rate": 8.747887978484048e-05, + "loss": 0.8872, + "step": 26190 + }, + { + "epoch": 0.8354858254408623, + "grad_norm": 0.19213901460170746, + "learning_rate": 8.71126042942522e-05, + "loss": 0.8768, + "step": 26200 + }, + { + "epoch": 0.8358047131604962, + "grad_norm": 0.19676388800144196, + "learning_rate": 8.674786240509246e-05, + "loss": 0.8803, + "step": 26210 + }, + { + "epoch": 0.8361236008801302, + "grad_norm": 0.18894055485725403, + "learning_rate": 8.638464769614718e-05, + "loss": 0.8887, + "step": 26220 + }, + { + "epoch": 0.836442488599764, + "grad_norm": 0.19717377424240112, + "learning_rate": 8.602295377308798e-05, + "loss": 0.8849, + "step": 26230 + }, + { + "epoch": 0.836761376319398, + "grad_norm": 0.18463239073753357, + "learning_rate": 8.566277426835967e-05, + "loss": 0.8856, + "step": 26240 + }, + { + "epoch": 0.8370802640390319, + "grad_norm": 0.19209441542625427, + "learning_rate": 8.53041028410681e-05, + "loss": 0.8865, + "step": 26250 + }, + { + "epoch": 0.8373991517586657, + "grad_norm": 0.19038350880146027, + "learning_rate": 8.494693317686852e-05, + "loss": 0.8662, + "step": 26260 + }, + { + "epoch": 0.8377180394782997, + "grad_norm": 0.19096122682094574, + "learning_rate": 8.459125898785451e-05, + "loss": 0.9065, + "step": 26270 + }, + { + "epoch": 0.8380369271979337, + "grad_norm": 0.1978101283311844, + "learning_rate": 8.423707401244714e-05, + "loss": 0.9053, + "step": 26280 + }, + { + "epoch": 0.8383558149175675, + "grad_norm": 0.18875358998775482, + "learning_rate": 8.388437201528488e-05, + "loss": 0.8865, + "step": 26290 + }, + { + "epoch": 0.8386747026372015, + "grad_norm": 0.1894250363111496, + "learning_rate": 8.353314678711372e-05, + "loss": 0.8963, + "step": 26300 + }, + { + "epoch": 0.8389935903568354, + "grad_norm": 0.1935695856809616, + "learning_rate": 8.318339214467789e-05, + "loss": 0.8895, + "step": 26310 + }, + { + "epoch": 0.8393124780764692, + "grad_norm": 0.18998850882053375, + "learning_rate": 8.283510193061105e-05, + "loss": 0.8861, + "step": 26320 + }, + { + "epoch": 0.8396313657961032, + "grad_norm": 0.20382626354694366, + "learning_rate": 8.248827001332778e-05, + "loss": 0.8913, + "step": 26330 + }, + { + "epoch": 0.8399502535157372, + "grad_norm": 0.1898353546857834, + "learning_rate": 8.214289028691584e-05, + "loss": 0.8857, + "step": 26340 + }, + { + "epoch": 0.840269141235371, + "grad_norm": 0.19221539795398712, + "learning_rate": 8.179895667102844e-05, + "loss": 0.8751, + "step": 26350 + }, + { + "epoch": 0.840588028955005, + "grad_norm": 0.18822281062602997, + "learning_rate": 8.145646311077731e-05, + "loss": 0.8997, + "step": 26360 + }, + { + "epoch": 0.8409069166746389, + "grad_norm": 0.19643518328666687, + "learning_rate": 8.11154035766261e-05, + "loss": 0.894, + "step": 26370 + }, + { + "epoch": 0.8412258043942727, + "grad_norm": 0.19378352165222168, + "learning_rate": 8.077577206428427e-05, + "loss": 0.8879, + "step": 26380 + }, + { + "epoch": 0.8415446921139067, + "grad_norm": 0.1851210743188858, + "learning_rate": 8.043756259460127e-05, + "loss": 0.8805, + "step": 26390 + }, + { + "epoch": 0.8418635798335407, + "grad_norm": 0.19853575527668, + "learning_rate": 8.010076921346141e-05, + "loss": 0.904, + "step": 26400 + }, + { + "epoch": 0.8421824675531745, + "grad_norm": 0.192692831158638, + "learning_rate": 7.976538599167896e-05, + "loss": 0.8952, + "step": 26410 + }, + { + "epoch": 0.8425013552728084, + "grad_norm": 0.19446724653244019, + "learning_rate": 7.943140702489378e-05, + "loss": 0.8703, + "step": 26420 + }, + { + "epoch": 0.8428202429924424, + "grad_norm": 0.1938653141260147, + "learning_rate": 7.909882643346739e-05, + "loss": 0.8889, + "step": 26430 + }, + { + "epoch": 0.8431391307120762, + "grad_norm": 0.1853872686624527, + "learning_rate": 7.876763836237944e-05, + "loss": 0.8928, + "step": 26440 + }, + { + "epoch": 0.8434580184317102, + "grad_norm": 0.19568127393722534, + "learning_rate": 7.843783698112465e-05, + "loss": 0.9062, + "step": 26450 + }, + { + "epoch": 0.8437769061513442, + "grad_norm": 0.1882636547088623, + "learning_rate": 7.810941648361018e-05, + "loss": 0.8969, + "step": 26460 + }, + { + "epoch": 0.844095793870978, + "grad_norm": 0.1955377161502838, + "learning_rate": 7.778237108805339e-05, + "loss": 0.8771, + "step": 26470 + }, + { + "epoch": 0.844414681590612, + "grad_norm": 0.18335965275764465, + "learning_rate": 7.745669503688002e-05, + "loss": 0.8825, + "step": 26480 + }, + { + "epoch": 0.8447335693102459, + "grad_norm": 0.19274048507213593, + "learning_rate": 7.71323825966229e-05, + "loss": 0.8819, + "step": 26490 + }, + { + "epoch": 0.8450524570298797, + "grad_norm": 0.18612460792064667, + "learning_rate": 7.680942805782095e-05, + "loss": 0.8862, + "step": 26500 + }, + { + "epoch": 0.8453713447495137, + "grad_norm": 0.1887853741645813, + "learning_rate": 7.648782573491877e-05, + "loss": 0.8825, + "step": 26510 + }, + { + "epoch": 0.8456902324691477, + "grad_norm": 0.19096797704696655, + "learning_rate": 7.616756996616643e-05, + "loss": 0.8766, + "step": 26520 + }, + { + "epoch": 0.8460091201887815, + "grad_norm": 0.19669249653816223, + "learning_rate": 7.58486551135198e-05, + "loss": 0.8821, + "step": 26530 + }, + { + "epoch": 0.8463280079084154, + "grad_norm": 0.1886143982410431, + "learning_rate": 7.553107556254135e-05, + "loss": 0.8786, + "step": 26540 + }, + { + "epoch": 0.8466468956280494, + "grad_norm": 0.19534634053707123, + "learning_rate": 7.521482572230134e-05, + "loss": 0.8883, + "step": 26550 + }, + { + "epoch": 0.8469657833476832, + "grad_norm": 0.19123771786689758, + "learning_rate": 7.48999000252793e-05, + "loss": 0.8819, + "step": 26560 + }, + { + "epoch": 0.8472846710673172, + "grad_norm": 0.18510787189006805, + "learning_rate": 7.458629292726607e-05, + "loss": 0.8966, + "step": 26570 + }, + { + "epoch": 0.8476035587869512, + "grad_norm": 0.19328685104846954, + "learning_rate": 7.427399890726616e-05, + "loss": 0.8799, + "step": 26580 + }, + { + "epoch": 0.847922446506585, + "grad_norm": 0.19416575133800507, + "learning_rate": 7.396301246740063e-05, + "loss": 0.8805, + "step": 26590 + }, + { + "epoch": 0.848241334226219, + "grad_norm": 0.1965588927268982, + "learning_rate": 7.36533281328102e-05, + "loss": 0.8907, + "step": 26600 + }, + { + "epoch": 0.8485602219458529, + "grad_norm": 0.2026282250881195, + "learning_rate": 7.334494045155892e-05, + "loss": 0.8804, + "step": 26610 + }, + { + "epoch": 0.8488791096654867, + "grad_norm": 0.19632813334465027, + "learning_rate": 7.303784399453824e-05, + "loss": 0.8882, + "step": 26620 + }, + { + "epoch": 0.8491979973851207, + "grad_norm": 0.19422686100006104, + "learning_rate": 7.273203335537129e-05, + "loss": 0.8857, + "step": 26630 + }, + { + "epoch": 0.8495168851047546, + "grad_norm": 0.1914503127336502, + "learning_rate": 7.242750315031787e-05, + "loss": 0.9041, + "step": 26640 + }, + { + "epoch": 0.8498357728243885, + "grad_norm": 0.18675798177719116, + "learning_rate": 7.21242480181795e-05, + "loss": 0.8853, + "step": 26650 + }, + { + "epoch": 0.8501546605440224, + "grad_norm": 0.19239214062690735, + "learning_rate": 7.182226262020522e-05, + "loss": 0.8753, + "step": 26660 + }, + { + "epoch": 0.8504735482636564, + "grad_norm": 0.19588054716587067, + "learning_rate": 7.15215416399974e-05, + "loss": 0.8887, + "step": 26670 + }, + { + "epoch": 0.8507924359832902, + "grad_norm": 0.1920136660337448, + "learning_rate": 7.122207978341839e-05, + "loss": 0.889, + "step": 26680 + }, + { + "epoch": 0.8511113237029242, + "grad_norm": 0.18894952535629272, + "learning_rate": 7.092387177849706e-05, + "loss": 0.88, + "step": 26690 + }, + { + "epoch": 0.8514302114225581, + "grad_norm": 0.19077442586421967, + "learning_rate": 7.062691237533617e-05, + "loss": 0.8782, + "step": 26700 + }, + { + "epoch": 0.851749099142192, + "grad_norm": 0.1909032016992569, + "learning_rate": 7.033119634601985e-05, + "loss": 0.8887, + "step": 26710 + }, + { + "epoch": 0.8520679868618259, + "grad_norm": 0.19129188358783722, + "learning_rate": 7.003671848452163e-05, + "loss": 0.885, + "step": 26720 + }, + { + "epoch": 0.8523868745814599, + "grad_norm": 0.19594432413578033, + "learning_rate": 6.974347360661275e-05, + "loss": 0.8924, + "step": 26730 + }, + { + "epoch": 0.8527057623010937, + "grad_norm": 0.1917877495288849, + "learning_rate": 6.945145654977087e-05, + "loss": 0.8972, + "step": 26740 + }, + { + "epoch": 0.8530246500207277, + "grad_norm": 0.19224712252616882, + "learning_rate": 6.916066217308926e-05, + "loss": 0.8944, + "step": 26750 + }, + { + "epoch": 0.8533435377403616, + "grad_norm": 0.19904139637947083, + "learning_rate": 6.887108535718623e-05, + "loss": 0.8813, + "step": 26760 + }, + { + "epoch": 0.8536624254599955, + "grad_norm": 0.1943054050207138, + "learning_rate": 6.858272100411499e-05, + "loss": 0.8757, + "step": 26770 + }, + { + "epoch": 0.8539813131796294, + "grad_norm": 0.19775620102882385, + "learning_rate": 6.829556403727401e-05, + "loss": 0.8983, + "step": 26780 + }, + { + "epoch": 0.8543002008992634, + "grad_norm": 0.19544585049152374, + "learning_rate": 6.800960940131751e-05, + "loss": 0.8895, + "step": 26790 + }, + { + "epoch": 0.8546190886188972, + "grad_norm": 0.19292794167995453, + "learning_rate": 6.772485206206656e-05, + "loss": 0.8663, + "step": 26800 + }, + { + "epoch": 0.8549379763385312, + "grad_norm": 0.18773417174816132, + "learning_rate": 6.74412870064204e-05, + "loss": 0.8881, + "step": 26810 + }, + { + "epoch": 0.8552568640581651, + "grad_norm": 0.20264096558094025, + "learning_rate": 6.71589092422682e-05, + "loss": 0.8956, + "step": 26820 + }, + { + "epoch": 0.855575751777799, + "grad_norm": 0.19748295843601227, + "learning_rate": 6.687771379840115e-05, + "loss": 0.901, + "step": 26830 + }, + { + "epoch": 0.8558946394974329, + "grad_norm": 0.1929382085800171, + "learning_rate": 6.659769572442513e-05, + "loss": 0.8889, + "step": 26840 + }, + { + "epoch": 0.8562135272170669, + "grad_norm": 0.1998470276594162, + "learning_rate": 6.631885009067319e-05, + "loss": 0.8796, + "step": 26850 + }, + { + "epoch": 0.8565324149367007, + "grad_norm": 0.19323571026325226, + "learning_rate": 6.60411719881191e-05, + "loss": 0.8787, + "step": 26860 + }, + { + "epoch": 0.8568513026563347, + "grad_norm": 0.19377054274082184, + "learning_rate": 6.576465652829075e-05, + "loss": 0.8903, + "step": 26870 + }, + { + "epoch": 0.8571701903759686, + "grad_norm": 0.19681768119335175, + "learning_rate": 6.548929884318418e-05, + "loss": 0.8818, + "step": 26880 + }, + { + "epoch": 0.8574890780956025, + "grad_norm": 0.19799382984638214, + "learning_rate": 6.521509408517782e-05, + "loss": 0.8918, + "step": 26890 + }, + { + "epoch": 0.8578079658152364, + "grad_norm": 0.19595491886138916, + "learning_rate": 6.494203742694715e-05, + "loss": 0.8946, + "step": 26900 + }, + { + "epoch": 0.8581268535348704, + "grad_norm": 0.19243298470973969, + "learning_rate": 6.46701240613798e-05, + "loss": 0.8933, + "step": 26910 + }, + { + "epoch": 0.8584457412545042, + "grad_norm": 0.19631756842136383, + "learning_rate": 6.439934920149081e-05, + "loss": 0.8797, + "step": 26920 + }, + { + "epoch": 0.8587646289741382, + "grad_norm": 0.18874439597129822, + "learning_rate": 6.412970808033838e-05, + "loss": 0.9002, + "step": 26930 + }, + { + "epoch": 0.8590835166937721, + "grad_norm": 0.19086116552352905, + "learning_rate": 6.386119595094003e-05, + "loss": 0.8796, + "step": 26940 + }, + { + "epoch": 0.859402404413406, + "grad_norm": 0.1910838931798935, + "learning_rate": 6.359380808618895e-05, + "loss": 0.8777, + "step": 26950 + }, + { + "epoch": 0.8597212921330399, + "grad_norm": 0.1921229064464569, + "learning_rate": 6.332753977877079e-05, + "loss": 0.8886, + "step": 26960 + }, + { + "epoch": 0.8600401798526739, + "grad_norm": 0.19134828448295593, + "learning_rate": 6.306238634108082e-05, + "loss": 0.8638, + "step": 26970 + }, + { + "epoch": 0.8603590675723078, + "grad_norm": 0.1976134330034256, + "learning_rate": 6.279834310514136e-05, + "loss": 0.8728, + "step": 26980 + }, + { + "epoch": 0.8606779552919417, + "grad_norm": 0.19609402120113373, + "learning_rate": 6.253540542251968e-05, + "loss": 0.8825, + "step": 26990 + }, + { + "epoch": 0.8609968430115756, + "grad_norm": 0.19391323626041412, + "learning_rate": 6.227356866424601e-05, + "loss": 0.8678, + "step": 27000 + }, + { + "epoch": 0.8613157307312096, + "grad_norm": 0.19331073760986328, + "learning_rate": 6.201282822073233e-05, + "loss": 0.8655, + "step": 27010 + }, + { + "epoch": 0.8616346184508434, + "grad_norm": 0.19262798130512238, + "learning_rate": 6.175317950169087e-05, + "loss": 0.8838, + "step": 27020 + }, + { + "epoch": 0.8619535061704774, + "grad_norm": 0.1916375756263733, + "learning_rate": 6.149461793605354e-05, + "loss": 0.8883, + "step": 27030 + }, + { + "epoch": 0.8622723938901113, + "grad_norm": 0.18959954380989075, + "learning_rate": 6.123713897189136e-05, + "loss": 0.9003, + "step": 27040 + }, + { + "epoch": 0.8625912816097452, + "grad_norm": 0.19482597708702087, + "learning_rate": 6.0980738076334414e-05, + "loss": 0.9003, + "step": 27050 + }, + { + "epoch": 0.8629101693293791, + "grad_norm": 0.1904841810464859, + "learning_rate": 6.0725410735491895e-05, + "loss": 0.8917, + "step": 27060 + }, + { + "epoch": 0.8632290570490131, + "grad_norm": 0.19442743062973022, + "learning_rate": 6.0471152454372806e-05, + "loss": 0.877, + "step": 27070 + }, + { + "epoch": 0.8635479447686469, + "grad_norm": 0.18793179094791412, + "learning_rate": 6.02179587568067e-05, + "loss": 0.8927, + "step": 27080 + }, + { + "epoch": 0.8638668324882809, + "grad_norm": 0.20632310211658478, + "learning_rate": 5.9965825185364964e-05, + "loss": 0.8945, + "step": 27090 + }, + { + "epoch": 0.8641857202079148, + "grad_norm": 0.18707668781280518, + "learning_rate": 5.971474730128228e-05, + "loss": 0.8852, + "step": 27100 + }, + { + "epoch": 0.8645046079275487, + "grad_norm": 0.18785780668258667, + "learning_rate": 5.946472068437854e-05, + "loss": 0.8763, + "step": 27110 + }, + { + "epoch": 0.8648234956471826, + "grad_norm": 0.19697052240371704, + "learning_rate": 5.9215740932980986e-05, + "loss": 0.8899, + "step": 27120 + }, + { + "epoch": 0.8651423833668166, + "grad_norm": 0.19185379147529602, + "learning_rate": 5.8967803663846734e-05, + "loss": 0.8746, + "step": 27130 + }, + { + "epoch": 0.8654612710864504, + "grad_norm": 0.19396957755088806, + "learning_rate": 5.8720904512085626e-05, + "loss": 0.8683, + "step": 27140 + }, + { + "epoch": 0.8657801588060844, + "grad_norm": 0.18885491788387299, + "learning_rate": 5.8475039131083355e-05, + "loss": 0.8838, + "step": 27150 + }, + { + "epoch": 0.8660990465257183, + "grad_norm": 0.19360066950321198, + "learning_rate": 5.823020319242495e-05, + "loss": 0.8922, + "step": 27160 + }, + { + "epoch": 0.8664179342453522, + "grad_norm": 0.19725437462329865, + "learning_rate": 5.7986392385818584e-05, + "loss": 0.8765, + "step": 27170 + }, + { + "epoch": 0.8667368219649861, + "grad_norm": 0.19328470528125763, + "learning_rate": 5.774360241901975e-05, + "loss": 0.8879, + "step": 27180 + }, + { + "epoch": 0.8670557096846201, + "grad_norm": 0.19106145203113556, + "learning_rate": 5.7501829017755564e-05, + "loss": 0.8834, + "step": 27190 + }, + { + "epoch": 0.8673745974042539, + "grad_norm": 0.19127941131591797, + "learning_rate": 5.7261067925649635e-05, + "loss": 0.8797, + "step": 27200 + }, + { + "epoch": 0.8676934851238879, + "grad_norm": 0.19829584658145905, + "learning_rate": 5.7021314904147045e-05, + "loss": 0.8799, + "step": 27210 + }, + { + "epoch": 0.8680123728435218, + "grad_norm": 0.19009840488433838, + "learning_rate": 5.678256573243984e-05, + "loss": 0.8787, + "step": 27220 + }, + { + "epoch": 0.8683312605631557, + "grad_norm": 0.1915004998445511, + "learning_rate": 5.6544816207392587e-05, + "loss": 0.8743, + "step": 27230 + }, + { + "epoch": 0.8686501482827896, + "grad_norm": 0.19615386426448822, + "learning_rate": 5.630806214346851e-05, + "loss": 0.8757, + "step": 27240 + }, + { + "epoch": 0.8689690360024236, + "grad_norm": 0.19712796807289124, + "learning_rate": 5.6072299372655695e-05, + "loss": 0.8872, + "step": 27250 + }, + { + "epoch": 0.8692879237220574, + "grad_norm": 0.19148242473602295, + "learning_rate": 5.58375237443938e-05, + "loss": 0.8766, + "step": 27260 + }, + { + "epoch": 0.8696068114416914, + "grad_norm": 0.19317524135112762, + "learning_rate": 5.5603731125500924e-05, + "loss": 0.8802, + "step": 27270 + }, + { + "epoch": 0.8699256991613253, + "grad_norm": 0.19434620440006256, + "learning_rate": 5.537091740010087e-05, + "loss": 0.8769, + "step": 27280 + }, + { + "epoch": 0.8702445868809592, + "grad_norm": 0.19037990272045135, + "learning_rate": 5.513907846955069e-05, + "loss": 0.8812, + "step": 27290 + }, + { + "epoch": 0.8705634746005931, + "grad_norm": 0.19795745611190796, + "learning_rate": 5.490821025236851e-05, + "loss": 0.8913, + "step": 27300 + }, + { + "epoch": 0.8708823623202271, + "grad_norm": 0.2026415467262268, + "learning_rate": 5.467830868416169e-05, + "loss": 0.9003, + "step": 27310 + }, + { + "epoch": 0.8712012500398609, + "grad_norm": 0.18853889405727386, + "learning_rate": 5.444936971755526e-05, + "loss": 0.8757, + "step": 27320 + }, + { + "epoch": 0.8715201377594949, + "grad_norm": 0.18458408117294312, + "learning_rate": 5.42213893221207e-05, + "loss": 0.8898, + "step": 27330 + }, + { + "epoch": 0.8718390254791288, + "grad_norm": 0.1918436586856842, + "learning_rate": 5.399436348430491e-05, + "loss": 0.8927, + "step": 27340 + }, + { + "epoch": 0.8721579131987627, + "grad_norm": 0.19048848748207092, + "learning_rate": 5.3768288207359675e-05, + "loss": 0.8763, + "step": 27350 + }, + { + "epoch": 0.8724768009183966, + "grad_norm": 0.19431082904338837, + "learning_rate": 5.354315951127119e-05, + "loss": 0.8899, + "step": 27360 + }, + { + "epoch": 0.8727956886380306, + "grad_norm": 0.18930363655090332, + "learning_rate": 5.331897343269001e-05, + "loss": 0.8735, + "step": 27370 + }, + { + "epoch": 0.8731145763576644, + "grad_norm": 0.19170422852039337, + "learning_rate": 5.309572602486132e-05, + "loss": 0.8802, + "step": 27380 + }, + { + "epoch": 0.8734334640772984, + "grad_norm": 0.1955621987581253, + "learning_rate": 5.287341335755546e-05, + "loss": 0.883, + "step": 27390 + }, + { + "epoch": 0.8737523517969323, + "grad_norm": 0.19437985122203827, + "learning_rate": 5.265203151699865e-05, + "loss": 0.8785, + "step": 27400 + }, + { + "epoch": 0.8740712395165662, + "grad_norm": 0.19519606232643127, + "learning_rate": 5.243157660580418e-05, + "loss": 0.8694, + "step": 27410 + }, + { + "epoch": 0.8743901272362001, + "grad_norm": 0.19466404616832733, + "learning_rate": 5.221204474290376e-05, + "loss": 0.8751, + "step": 27420 + }, + { + "epoch": 0.8747090149558341, + "grad_norm": 0.19154104590415955, + "learning_rate": 5.199343206347918e-05, + "loss": 0.8816, + "step": 27430 + }, + { + "epoch": 0.8750279026754679, + "grad_norm": 0.1949530988931656, + "learning_rate": 5.1775734718894304e-05, + "loss": 0.8758, + "step": 27440 + }, + { + "epoch": 0.8753467903951019, + "grad_norm": 0.19437164068222046, + "learning_rate": 5.155894887662728e-05, + "loss": 0.8815, + "step": 27450 + }, + { + "epoch": 0.8756656781147358, + "grad_norm": 0.19124570488929749, + "learning_rate": 5.134307072020311e-05, + "loss": 0.878, + "step": 27460 + }, + { + "epoch": 0.8759845658343697, + "grad_norm": 0.1909313052892685, + "learning_rate": 5.1128096449126425e-05, + "loss": 0.887, + "step": 27470 + }, + { + "epoch": 0.8763034535540036, + "grad_norm": 0.19552530348300934, + "learning_rate": 5.091402227881458e-05, + "loss": 0.877, + "step": 27480 + }, + { + "epoch": 0.8766223412736376, + "grad_norm": 0.19062495231628418, + "learning_rate": 5.070084444053106e-05, + "loss": 0.871, + "step": 27490 + }, + { + "epoch": 0.8769412289932714, + "grad_norm": 0.19744792580604553, + "learning_rate": 5.0488559181319116e-05, + "loss": 0.8901, + "step": 27500 + }, + { + "epoch": 0.8772601167129054, + "grad_norm": 0.19660893082618713, + "learning_rate": 5.027716276393563e-05, + "loss": 0.8743, + "step": 27510 + }, + { + "epoch": 0.8775790044325393, + "grad_norm": 0.19473454356193542, + "learning_rate": 5.00666514667855e-05, + "loss": 0.8898, + "step": 27520 + }, + { + "epoch": 0.8778978921521732, + "grad_norm": 0.1918601542711258, + "learning_rate": 4.9857021583855885e-05, + "loss": 0.8592, + "step": 27530 + }, + { + "epoch": 0.8782167798718071, + "grad_norm": 0.18904858827590942, + "learning_rate": 4.9648269424651144e-05, + "loss": 0.8803, + "step": 27540 + }, + { + "epoch": 0.8785356675914411, + "grad_norm": 0.19310516119003296, + "learning_rate": 4.9440391314127776e-05, + "loss": 0.8841, + "step": 27550 + }, + { + "epoch": 0.8788545553110749, + "grad_norm": 0.1876547634601593, + "learning_rate": 4.9233383592629755e-05, + "loss": 0.8776, + "step": 27560 + }, + { + "epoch": 0.8791734430307089, + "grad_norm": 0.1921987533569336, + "learning_rate": 4.90272426158241e-05, + "loss": 0.8773, + "step": 27570 + }, + { + "epoch": 0.8794923307503428, + "grad_norm": 0.19707876443862915, + "learning_rate": 4.88219647546367e-05, + "loss": 0.8824, + "step": 27580 + }, + { + "epoch": 0.8798112184699767, + "grad_norm": 0.1927400529384613, + "learning_rate": 4.861754639518844e-05, + "loss": 0.8784, + "step": 27590 + }, + { + "epoch": 0.8801301061896106, + "grad_norm": 0.19577376544475555, + "learning_rate": 4.841398393873158e-05, + "loss": 0.8793, + "step": 27600 + }, + { + "epoch": 0.8804489939092446, + "grad_norm": 0.1940997689962387, + "learning_rate": 4.821127380158639e-05, + "loss": 0.8756, + "step": 27610 + }, + { + "epoch": 0.8807678816288784, + "grad_norm": 0.19283920526504517, + "learning_rate": 4.800941241507807e-05, + "loss": 0.8822, + "step": 27620 + }, + { + "epoch": 0.8810867693485124, + "grad_norm": 0.19780674576759338, + "learning_rate": 4.7808396225473926e-05, + "loss": 0.8815, + "step": 27630 + }, + { + "epoch": 0.8814056570681463, + "grad_norm": 0.19866904616355896, + "learning_rate": 4.760822169392078e-05, + "loss": 0.8766, + "step": 27640 + }, + { + "epoch": 0.8817245447877802, + "grad_norm": 0.19428886473178864, + "learning_rate": 4.74088852963827e-05, + "loss": 0.874, + "step": 27650 + }, + { + "epoch": 0.8820434325074141, + "grad_norm": 0.19498443603515625, + "learning_rate": 4.7210383523578954e-05, + "loss": 0.8894, + "step": 27660 + }, + { + "epoch": 0.8823623202270481, + "grad_norm": 0.19383135437965393, + "learning_rate": 4.701271288092223e-05, + "loss": 0.8709, + "step": 27670 + }, + { + "epoch": 0.8826812079466819, + "grad_norm": 0.19539567828178406, + "learning_rate": 4.681586988845707e-05, + "loss": 0.8806, + "step": 27680 + }, + { + "epoch": 0.8830000956663159, + "grad_norm": 0.1919242888689041, + "learning_rate": 4.661985108079873e-05, + "loss": 0.8994, + "step": 27690 + }, + { + "epoch": 0.8833189833859498, + "grad_norm": 0.19845853745937347, + "learning_rate": 4.6424653007072e-05, + "loss": 0.8786, + "step": 27700 + }, + { + "epoch": 0.8836378711055837, + "grad_norm": 0.1964893341064453, + "learning_rate": 4.62302722308506e-05, + "loss": 0.8652, + "step": 27710 + }, + { + "epoch": 0.8839567588252176, + "grad_norm": 0.1927543431520462, + "learning_rate": 4.603670533009658e-05, + "loss": 0.8723, + "step": 27720 + }, + { + "epoch": 0.8842756465448516, + "grad_norm": 0.1906014084815979, + "learning_rate": 4.5843948897100144e-05, + "loss": 0.8771, + "step": 27730 + }, + { + "epoch": 0.8845945342644854, + "grad_norm": 0.19443483650684357, + "learning_rate": 4.565199953841964e-05, + "loss": 0.8573, + "step": 27740 + }, + { + "epoch": 0.8849134219841194, + "grad_norm": 0.18828009068965912, + "learning_rate": 4.5460853874821776e-05, + "loss": 0.8908, + "step": 27750 + }, + { + "epoch": 0.8852323097037533, + "grad_norm": 0.19496281445026398, + "learning_rate": 4.52705085412222e-05, + "loss": 0.8787, + "step": 27760 + }, + { + "epoch": 0.8855511974233872, + "grad_norm": 0.1892516016960144, + "learning_rate": 4.50809601866262e-05, + "loss": 0.8804, + "step": 27770 + }, + { + "epoch": 0.8858700851430211, + "grad_norm": 0.19120721518993378, + "learning_rate": 4.4892205474069755e-05, + "loss": 0.8658, + "step": 27780 + }, + { + "epoch": 0.8861889728626551, + "grad_norm": 0.1939810812473297, + "learning_rate": 4.4704241080560745e-05, + "loss": 0.8731, + "step": 27790 + }, + { + "epoch": 0.886507860582289, + "grad_norm": 0.1969224512577057, + "learning_rate": 4.4517063697020485e-05, + "loss": 0.8849, + "step": 27800 + }, + { + "epoch": 0.8868267483019229, + "grad_norm": 0.19289876520633698, + "learning_rate": 4.433067002822545e-05, + "loss": 0.8784, + "step": 27810 + }, + { + "epoch": 0.8871456360215568, + "grad_norm": 0.1962817758321762, + "learning_rate": 4.414505679274928e-05, + "loss": 0.8611, + "step": 27820 + }, + { + "epoch": 0.8874645237411908, + "grad_norm": 0.19316478073596954, + "learning_rate": 4.396022072290497e-05, + "loss": 0.8717, + "step": 27830 + }, + { + "epoch": 0.8877834114608246, + "grad_norm": 0.19402272999286652, + "learning_rate": 4.3776158564687404e-05, + "loss": 0.8757, + "step": 27840 + }, + { + "epoch": 0.8881022991804586, + "grad_norm": 0.19799882173538208, + "learning_rate": 4.359286707771605e-05, + "loss": 0.8913, + "step": 27850 + }, + { + "epoch": 0.8884211869000925, + "grad_norm": 0.19214606285095215, + "learning_rate": 4.341034303517787e-05, + "loss": 0.8773, + "step": 27860 + }, + { + "epoch": 0.8887400746197264, + "grad_norm": 0.18697836995124817, + "learning_rate": 4.322858322377056e-05, + "loss": 0.8758, + "step": 27870 + }, + { + "epoch": 0.8890589623393603, + "grad_norm": 0.20322303473949432, + "learning_rate": 4.3047584443645955e-05, + "loss": 0.8815, + "step": 27880 + }, + { + "epoch": 0.8893778500589943, + "grad_norm": 0.19178463518619537, + "learning_rate": 4.2867343508353714e-05, + "loss": 0.8632, + "step": 27890 + }, + { + "epoch": 0.8896967377786281, + "grad_norm": 0.18942740559577942, + "learning_rate": 4.2687857244785214e-05, + "loss": 0.8832, + "step": 27900 + }, + { + "epoch": 0.8900156254982621, + "grad_norm": 0.187669038772583, + "learning_rate": 4.2509122493117696e-05, + "loss": 0.8843, + "step": 27910 + }, + { + "epoch": 0.890334513217896, + "grad_norm": 0.19848766922950745, + "learning_rate": 4.2331136106758614e-05, + "loss": 0.8919, + "step": 27920 + }, + { + "epoch": 0.8906534009375299, + "grad_norm": 0.19923046231269836, + "learning_rate": 4.2153894952290276e-05, + "loss": 0.8728, + "step": 27930 + }, + { + "epoch": 0.8909722886571638, + "grad_norm": 0.19249597191810608, + "learning_rate": 4.1977395909414635e-05, + "loss": 0.8744, + "step": 27940 + }, + { + "epoch": 0.8912911763767978, + "grad_norm": 0.19503191113471985, + "learning_rate": 4.180163587089841e-05, + "loss": 0.8794, + "step": 27950 + }, + { + "epoch": 0.8916100640964316, + "grad_norm": 0.19658716022968292, + "learning_rate": 4.162661174251835e-05, + "loss": 0.8863, + "step": 27960 + }, + { + "epoch": 0.8919289518160656, + "grad_norm": 0.20051929354667664, + "learning_rate": 4.1452320443006753e-05, + "loss": 0.8899, + "step": 27970 + }, + { + "epoch": 0.8922478395356995, + "grad_norm": 0.19176195561885834, + "learning_rate": 4.1278758903997235e-05, + "loss": 0.8968, + "step": 27980 + }, + { + "epoch": 0.8925667272553334, + "grad_norm": 0.19284586608409882, + "learning_rate": 4.110592406997074e-05, + "loss": 0.8748, + "step": 27990 + }, + { + "epoch": 0.8928856149749673, + "grad_norm": 0.18788652122020721, + "learning_rate": 4.0933812898201676e-05, + "loss": 0.8764, + "step": 28000 + }, + { + "epoch": 0.8932045026946013, + "grad_norm": 0.19523321092128754, + "learning_rate": 4.0762422358704414e-05, + "loss": 0.8765, + "step": 28010 + }, + { + "epoch": 0.8935233904142351, + "grad_norm": 0.18856965005397797, + "learning_rate": 4.059174943417997e-05, + "loss": 0.8778, + "step": 28020 + }, + { + "epoch": 0.8938422781338691, + "grad_norm": 0.188978374004364, + "learning_rate": 4.0421791119962764e-05, + "loss": 0.8822, + "step": 28030 + }, + { + "epoch": 0.894161165853503, + "grad_norm": 0.18726089596748352, + "learning_rate": 4.0252544423967835e-05, + "loss": 0.8723, + "step": 28040 + }, + { + "epoch": 0.8944800535731369, + "grad_norm": 0.19660618901252747, + "learning_rate": 4.008400636663814e-05, + "loss": 0.8619, + "step": 28050 + }, + { + "epoch": 0.8947989412927708, + "grad_norm": 0.1990547776222229, + "learning_rate": 3.991617398089205e-05, + "loss": 0.8734, + "step": 28060 + }, + { + "epoch": 0.8951178290124048, + "grad_norm": 0.1974078267812729, + "learning_rate": 3.974904431207121e-05, + "loss": 0.8741, + "step": 28070 + }, + { + "epoch": 0.8954367167320386, + "grad_norm": 0.1837972104549408, + "learning_rate": 3.958261441788843e-05, + "loss": 0.8747, + "step": 28080 + }, + { + "epoch": 0.8957556044516726, + "grad_norm": 0.20282982289791107, + "learning_rate": 3.941688136837593e-05, + "loss": 0.871, + "step": 28090 + }, + { + "epoch": 0.8960744921713065, + "grad_norm": 0.19532689452171326, + "learning_rate": 3.925184224583376e-05, + "loss": 0.8797, + "step": 28100 + }, + { + "epoch": 0.8963933798909404, + "grad_norm": 0.19317756593227386, + "learning_rate": 3.908749414477842e-05, + "loss": 0.8684, + "step": 28110 + }, + { + "epoch": 0.8967122676105743, + "grad_norm": 0.19880421459674835, + "learning_rate": 3.8923834171891724e-05, + "loss": 0.8912, + "step": 28120 + }, + { + "epoch": 0.8970311553302083, + "grad_norm": 0.1922730654478073, + "learning_rate": 3.876085944596985e-05, + "loss": 0.8719, + "step": 28130 + }, + { + "epoch": 0.8973500430498421, + "grad_norm": 0.20053960382938385, + "learning_rate": 3.8598567097872634e-05, + "loss": 0.8934, + "step": 28140 + }, + { + "epoch": 0.8976689307694761, + "grad_norm": 0.1984051764011383, + "learning_rate": 3.843695427047303e-05, + "loss": 0.8794, + "step": 28150 + }, + { + "epoch": 0.89798781848911, + "grad_norm": 0.19294370710849762, + "learning_rate": 3.8276018118606846e-05, + "loss": 0.8787, + "step": 28160 + }, + { + "epoch": 0.8983067062087439, + "grad_norm": 0.19863317906856537, + "learning_rate": 3.8115755809022624e-05, + "loss": 0.8681, + "step": 28170 + }, + { + "epoch": 0.8986255939283778, + "grad_norm": 0.1922876238822937, + "learning_rate": 3.7956164520331766e-05, + "loss": 0.8711, + "step": 28180 + }, + { + "epoch": 0.8989444816480118, + "grad_norm": 0.1919499635696411, + "learning_rate": 3.7797241442958946e-05, + "loss": 0.8707, + "step": 28190 + }, + { + "epoch": 0.8992633693676456, + "grad_norm": 0.19040320813655853, + "learning_rate": 3.7638983779092486e-05, + "loss": 0.8747, + "step": 28200 + }, + { + "epoch": 0.8995822570872796, + "grad_norm": 0.19564911723136902, + "learning_rate": 3.748138874263523e-05, + "loss": 0.8796, + "step": 28210 + }, + { + "epoch": 0.8999011448069135, + "grad_norm": 0.19297778606414795, + "learning_rate": 3.732445355915546e-05, + "loss": 0.8868, + "step": 28220 + }, + { + "epoch": 0.9002200325265474, + "grad_norm": 0.18817225098609924, + "learning_rate": 3.716817546583803e-05, + "loss": 0.8635, + "step": 28230 + }, + { + "epoch": 0.9005389202461813, + "grad_norm": 0.19400273263454437, + "learning_rate": 3.7012551711435764e-05, + "loss": 0.884, + "step": 28240 + }, + { + "epoch": 0.9008578079658153, + "grad_norm": 0.19113945960998535, + "learning_rate": 3.6857579556221e-05, + "loss": 0.8822, + "step": 28250 + }, + { + "epoch": 0.9011766956854491, + "grad_norm": 0.19377689063549042, + "learning_rate": 3.670325627193734e-05, + "loss": 0.8593, + "step": 28260 + }, + { + "epoch": 0.9014955834050831, + "grad_norm": 0.19520173966884613, + "learning_rate": 3.654957914175167e-05, + "loss": 0.882, + "step": 28270 + }, + { + "epoch": 0.901814471124717, + "grad_norm": 0.1930304616689682, + "learning_rate": 3.639654546020629e-05, + "loss": 0.8676, + "step": 28280 + }, + { + "epoch": 0.9021333588443509, + "grad_norm": 0.18980911374092102, + "learning_rate": 3.6244152533171275e-05, + "loss": 0.8825, + "step": 28290 + }, + { + "epoch": 0.9024522465639848, + "grad_norm": 0.20174168050289154, + "learning_rate": 3.609239767779709e-05, + "loss": 0.902, + "step": 28300 + }, + { + "epoch": 0.9027711342836188, + "grad_norm": 0.18596351146697998, + "learning_rate": 3.594127822246733e-05, + "loss": 0.8835, + "step": 28310 + }, + { + "epoch": 0.9030900220032526, + "grad_norm": 0.1928795576095581, + "learning_rate": 3.579079150675168e-05, + "loss": 0.8707, + "step": 28320 + }, + { + "epoch": 0.9034089097228866, + "grad_norm": 0.18944554030895233, + "learning_rate": 3.564093488135911e-05, + "loss": 0.8735, + "step": 28330 + }, + { + "epoch": 0.9037277974425205, + "grad_norm": 0.18926158547401428, + "learning_rate": 3.54917057080912e-05, + "loss": 0.8601, + "step": 28340 + }, + { + "epoch": 0.9040466851621544, + "grad_norm": 0.19643589854240417, + "learning_rate": 3.534310135979569e-05, + "loss": 0.882, + "step": 28350 + }, + { + "epoch": 0.9043655728817883, + "grad_norm": 0.19318707287311554, + "learning_rate": 3.51951192203203e-05, + "loss": 0.8793, + "step": 28360 + }, + { + "epoch": 0.9046844606014223, + "grad_norm": 0.1926054209470749, + "learning_rate": 3.504775668446659e-05, + "loss": 0.8793, + "step": 28370 + }, + { + "epoch": 0.9050033483210561, + "grad_norm": 0.19345009326934814, + "learning_rate": 3.4901011157944126e-05, + "loss": 0.867, + "step": 28380 + }, + { + "epoch": 0.9053222360406901, + "grad_norm": 0.19208772480487823, + "learning_rate": 3.475488005732481e-05, + "loss": 0.8662, + "step": 28390 + }, + { + "epoch": 0.905641123760324, + "grad_norm": 0.19586001336574554, + "learning_rate": 3.460936080999741e-05, + "loss": 0.8716, + "step": 28400 + }, + { + "epoch": 0.9059600114799579, + "grad_norm": 0.20053620636463165, + "learning_rate": 3.4464450854122266e-05, + "loss": 0.8779, + "step": 28410 + }, + { + "epoch": 0.9062788991995918, + "grad_norm": 0.19501394033432007, + "learning_rate": 3.4320147638586154e-05, + "loss": 0.8713, + "step": 28420 + }, + { + "epoch": 0.9065977869192258, + "grad_norm": 0.19111500680446625, + "learning_rate": 3.4176448622957427e-05, + "loss": 0.8946, + "step": 28430 + }, + { + "epoch": 0.9069166746388596, + "grad_norm": 0.19476771354675293, + "learning_rate": 3.4033351277441255e-05, + "loss": 0.8855, + "step": 28440 + }, + { + "epoch": 0.9072355623584936, + "grad_norm": 0.19051986932754517, + "learning_rate": 3.38908530828351e-05, + "loss": 0.8852, + "step": 28450 + }, + { + "epoch": 0.9075544500781275, + "grad_norm": 0.1968327760696411, + "learning_rate": 3.374895153048438e-05, + "loss": 0.87, + "step": 28460 + }, + { + "epoch": 0.9078733377977614, + "grad_norm": 0.1960689276456833, + "learning_rate": 3.360764412223827e-05, + "loss": 0.8775, + "step": 28470 + }, + { + "epoch": 0.9081922255173953, + "grad_norm": 0.1949181854724884, + "learning_rate": 3.3466928370405766e-05, + "loss": 0.8774, + "step": 28480 + }, + { + "epoch": 0.9085111132370293, + "grad_norm": 0.20106466114521027, + "learning_rate": 3.332680179771184e-05, + "loss": 0.882, + "step": 28490 + }, + { + "epoch": 0.9088300009566631, + "grad_norm": 0.1960950791835785, + "learning_rate": 3.318726193725388e-05, + "loss": 0.8785, + "step": 28500 + }, + { + "epoch": 0.9091488886762971, + "grad_norm": 0.18996301293373108, + "learning_rate": 3.304830633245822e-05, + "loss": 0.8744, + "step": 28510 + }, + { + "epoch": 0.909467776395931, + "grad_norm": 0.18958501517772675, + "learning_rate": 3.290993253703689e-05, + "loss": 0.8837, + "step": 28520 + }, + { + "epoch": 0.9097866641155649, + "grad_norm": 0.1934012621641159, + "learning_rate": 3.277213811494463e-05, + "loss": 0.85, + "step": 28530 + }, + { + "epoch": 0.9101055518351988, + "grad_norm": 0.18933558464050293, + "learning_rate": 3.263492064033587e-05, + "loss": 0.8757, + "step": 28540 + }, + { + "epoch": 0.9104244395548328, + "grad_norm": 0.1904170960187912, + "learning_rate": 3.249827769752215e-05, + "loss": 0.8675, + "step": 28550 + }, + { + "epoch": 0.9107433272744666, + "grad_norm": 0.1956118792295456, + "learning_rate": 3.2362206880929494e-05, + "loss": 0.8857, + "step": 28560 + }, + { + "epoch": 0.9110622149941006, + "grad_norm": 0.1950976848602295, + "learning_rate": 3.222670579505612e-05, + "loss": 0.8793, + "step": 28570 + }, + { + "epoch": 0.9113811027137345, + "grad_norm": 0.19186055660247803, + "learning_rate": 3.2091772054430256e-05, + "loss": 0.88, + "step": 28580 + }, + { + "epoch": 0.9116999904333684, + "grad_norm": 0.1933923065662384, + "learning_rate": 3.195740328356815e-05, + "loss": 0.8882, + "step": 28590 + }, + { + "epoch": 0.9120188781530023, + "grad_norm": 0.20145314931869507, + "learning_rate": 3.182359711693219e-05, + "loss": 0.8918, + "step": 28600 + }, + { + "epoch": 0.9123377658726363, + "grad_norm": 0.20328111946582794, + "learning_rate": 3.169035119888938e-05, + "loss": 0.8868, + "step": 28610 + }, + { + "epoch": 0.9126566535922701, + "grad_norm": 0.18951085209846497, + "learning_rate": 3.155766318366973e-05, + "loss": 0.8737, + "step": 28620 + }, + { + "epoch": 0.9129755413119041, + "grad_norm": 0.18694986402988434, + "learning_rate": 3.142553073532508e-05, + "loss": 0.8656, + "step": 28630 + }, + { + "epoch": 0.913294429031538, + "grad_norm": 0.19038134813308716, + "learning_rate": 3.129395152768789e-05, + "loss": 0.8748, + "step": 28640 + }, + { + "epoch": 0.913613316751172, + "grad_norm": 0.19913242757320404, + "learning_rate": 3.116292324433033e-05, + "loss": 0.8684, + "step": 28650 + }, + { + "epoch": 0.9139322044708058, + "grad_norm": 0.20065714418888092, + "learning_rate": 3.103244357852349e-05, + "loss": 0.8734, + "step": 28660 + }, + { + "epoch": 0.9142510921904398, + "grad_norm": 0.19351083040237427, + "learning_rate": 3.090251023319679e-05, + "loss": 0.8794, + "step": 28670 + }, + { + "epoch": 0.9145699799100737, + "grad_norm": 0.1949431151151657, + "learning_rate": 3.0773120920897486e-05, + "loss": 0.8843, + "step": 28680 + }, + { + "epoch": 0.9148888676297076, + "grad_norm": 0.19440433382987976, + "learning_rate": 3.064427336375045e-05, + "loss": 0.8854, + "step": 28690 + }, + { + "epoch": 0.9152077553493415, + "grad_norm": 0.19945016503334045, + "learning_rate": 3.0515965293418096e-05, + "loss": 0.8745, + "step": 28700 + }, + { + "epoch": 0.9155266430689755, + "grad_norm": 0.19426259398460388, + "learning_rate": 3.038819445106034e-05, + "loss": 0.877, + "step": 28710 + }, + { + "epoch": 0.9158455307886093, + "grad_norm": 0.19654859602451324, + "learning_rate": 3.0260958587294935e-05, + "loss": 0.8796, + "step": 28720 + }, + { + "epoch": 0.9161644185082433, + "grad_norm": 0.19408033788204193, + "learning_rate": 3.013425546215782e-05, + "loss": 0.8922, + "step": 28730 + }, + { + "epoch": 0.9164833062278772, + "grad_norm": 0.2011575996875763, + "learning_rate": 3.0008082845063706e-05, + "loss": 0.8958, + "step": 28740 + }, + { + "epoch": 0.9168021939475111, + "grad_norm": 0.19144727289676666, + "learning_rate": 2.9882438514766802e-05, + "loss": 0.885, + "step": 28750 + }, + { + "epoch": 0.917121081667145, + "grad_norm": 0.191970556974411, + "learning_rate": 2.9757320259321707e-05, + "loss": 0.8791, + "step": 28760 + }, + { + "epoch": 0.917439969386779, + "grad_norm": 0.20373089611530304, + "learning_rate": 2.963272587604447e-05, + "loss": 0.8728, + "step": 28770 + }, + { + "epoch": 0.9177588571064128, + "grad_norm": 0.202840656042099, + "learning_rate": 2.9508653171473813e-05, + "loss": 0.8794, + "step": 28780 + }, + { + "epoch": 0.9180777448260468, + "grad_norm": 0.19466151297092438, + "learning_rate": 2.9385099961332523e-05, + "loss": 0.8728, + "step": 28790 + }, + { + "epoch": 0.9183966325456807, + "grad_norm": 0.19009439647197723, + "learning_rate": 2.926206407048898e-05, + "loss": 0.8707, + "step": 28800 + }, + { + "epoch": 0.9187155202653146, + "grad_norm": 0.19433505833148956, + "learning_rate": 2.913954333291889e-05, + "loss": 0.8721, + "step": 28810 + }, + { + "epoch": 0.9190344079849485, + "grad_norm": 0.19531868398189545, + "learning_rate": 2.901753559166712e-05, + "loss": 0.8554, + "step": 28820 + }, + { + "epoch": 0.9193532957045825, + "grad_norm": 0.1936861276626587, + "learning_rate": 2.8896038698809757e-05, + "loss": 0.8692, + "step": 28830 + }, + { + "epoch": 0.9196721834242163, + "grad_norm": 0.19250236451625824, + "learning_rate": 2.8775050515416274e-05, + "loss": 0.8772, + "step": 28840 + }, + { + "epoch": 0.9199910711438503, + "grad_norm": 0.18581518530845642, + "learning_rate": 2.865456891151188e-05, + "loss": 0.8771, + "step": 28850 + }, + { + "epoch": 0.9203099588634842, + "grad_norm": 0.1961974948644638, + "learning_rate": 2.8534591766040024e-05, + "loss": 0.8798, + "step": 28860 + }, + { + "epoch": 0.920628846583118, + "grad_norm": 0.19266340136528015, + "learning_rate": 2.8415116966825072e-05, + "loss": 0.8634, + "step": 28870 + }, + { + "epoch": 0.920947734302752, + "grad_norm": 0.19667044281959534, + "learning_rate": 2.8296142410535074e-05, + "loss": 0.8706, + "step": 28880 + }, + { + "epoch": 0.921266622022386, + "grad_norm": 0.19851966202259064, + "learning_rate": 2.8177666002644782e-05, + "loss": 0.8848, + "step": 28890 + }, + { + "epoch": 0.9215855097420198, + "grad_norm": 0.1962861716747284, + "learning_rate": 2.8059685657398748e-05, + "loss": 0.8833, + "step": 28900 + }, + { + "epoch": 0.9219043974616538, + "grad_norm": 0.19509483873844147, + "learning_rate": 2.7942199297774628e-05, + "loss": 0.8732, + "step": 28910 + }, + { + "epoch": 0.9222232851812877, + "grad_norm": 0.19322551786899567, + "learning_rate": 2.7825204855446604e-05, + "loss": 0.8804, + "step": 28920 + }, + { + "epoch": 0.9225421729009216, + "grad_norm": 0.19780372083187103, + "learning_rate": 2.7708700270748953e-05, + "loss": 0.884, + "step": 28930 + }, + { + "epoch": 0.9228610606205555, + "grad_norm": 0.19941799342632294, + "learning_rate": 2.759268349263983e-05, + "loss": 0.8738, + "step": 28940 + }, + { + "epoch": 0.9231799483401895, + "grad_norm": 0.18909300863742828, + "learning_rate": 2.7477152478665126e-05, + "loss": 0.8751, + "step": 28950 + }, + { + "epoch": 0.9234988360598233, + "grad_norm": 0.18900838494300842, + "learning_rate": 2.736210519492252e-05, + "loss": 0.8734, + "step": 28960 + }, + { + "epoch": 0.9238177237794573, + "grad_norm": 0.19971856474876404, + "learning_rate": 2.7247539616025663e-05, + "loss": 0.8702, + "step": 28970 + }, + { + "epoch": 0.9241366114990912, + "grad_norm": 0.20252341032028198, + "learning_rate": 2.7133453725068553e-05, + "loss": 0.8648, + "step": 28980 + }, + { + "epoch": 0.924455499218725, + "grad_norm": 0.19686760008335114, + "learning_rate": 2.701984551358999e-05, + "loss": 0.8739, + "step": 28990 + }, + { + "epoch": 0.924774386938359, + "grad_norm": 0.20540061593055725, + "learning_rate": 2.6906712981538236e-05, + "loss": 0.865, + "step": 29000 + }, + { + "epoch": 0.925093274657993, + "grad_norm": 0.1936776340007782, + "learning_rate": 2.6794054137235806e-05, + "loss": 0.8734, + "step": 29010 + }, + { + "epoch": 0.9254121623776268, + "grad_norm": 0.1934029906988144, + "learning_rate": 2.668186699734439e-05, + "loss": 0.8753, + "step": 29020 + }, + { + "epoch": 0.9257310500972608, + "grad_norm": 0.1935998946428299, + "learning_rate": 2.657014958682998e-05, + "loss": 0.8686, + "step": 29030 + }, + { + "epoch": 0.9260499378168947, + "grad_norm": 0.19146475195884705, + "learning_rate": 2.6458899938928035e-05, + "loss": 0.8644, + "step": 29040 + }, + { + "epoch": 0.9263688255365286, + "grad_norm": 0.19215962290763855, + "learning_rate": 2.634811609510889e-05, + "loss": 0.8693, + "step": 29050 + }, + { + "epoch": 0.9266877132561625, + "grad_norm": 0.19085708260536194, + "learning_rate": 2.6237796105043294e-05, + "loss": 0.8823, + "step": 29060 + }, + { + "epoch": 0.9270066009757965, + "grad_norm": 0.19523738324642181, + "learning_rate": 2.6127938026568044e-05, + "loss": 0.872, + "step": 29070 + }, + { + "epoch": 0.9273254886954303, + "grad_norm": 0.19307196140289307, + "learning_rate": 2.6018539925651806e-05, + "loss": 0.8551, + "step": 29080 + }, + { + "epoch": 0.9276443764150643, + "grad_norm": 0.19289293885231018, + "learning_rate": 2.5909599876361067e-05, + "loss": 0.8643, + "step": 29090 + }, + { + "epoch": 0.9279632641346982, + "grad_norm": 0.1898387372493744, + "learning_rate": 2.5801115960826234e-05, + "loss": 0.8827, + "step": 29100 + }, + { + "epoch": 0.928282151854332, + "grad_norm": 0.20208875834941864, + "learning_rate": 2.5693086269207865e-05, + "loss": 0.8995, + "step": 29110 + }, + { + "epoch": 0.928601039573966, + "grad_norm": 0.19447356462478638, + "learning_rate": 2.5585508899663037e-05, + "loss": 0.8855, + "step": 29120 + }, + { + "epoch": 0.9289199272936, + "grad_norm": 0.19338631629943848, + "learning_rate": 2.5478381958311885e-05, + "loss": 0.8704, + "step": 29130 + }, + { + "epoch": 0.9292388150132338, + "grad_norm": 0.19808538258075714, + "learning_rate": 2.5371703559204248e-05, + "loss": 0.8744, + "step": 29140 + }, + { + "epoch": 0.9295577027328678, + "grad_norm": 0.1872604638338089, + "learning_rate": 2.526547182428646e-05, + "loss": 0.8598, + "step": 29150 + }, + { + "epoch": 0.9298765904525017, + "grad_norm": 0.190872460603714, + "learning_rate": 2.5159684883368308e-05, + "loss": 0.856, + "step": 29160 + }, + { + "epoch": 0.9301954781721355, + "grad_norm": 0.19496072828769684, + "learning_rate": 2.505434087409009e-05, + "loss": 0.88, + "step": 29170 + }, + { + "epoch": 0.9305143658917695, + "grad_norm": 0.19592823088169098, + "learning_rate": 2.4949437941889832e-05, + "loss": 0.8721, + "step": 29180 + }, + { + "epoch": 0.9308332536114035, + "grad_norm": 0.19178549945354462, + "learning_rate": 2.4844974239970637e-05, + "loss": 0.8512, + "step": 29190 + }, + { + "epoch": 0.9311521413310373, + "grad_norm": 0.19600598514080048, + "learning_rate": 2.4740947929268202e-05, + "loss": 0.8758, + "step": 29200 + }, + { + "epoch": 0.9314710290506713, + "grad_norm": 0.1950349062681198, + "learning_rate": 2.4637357178418395e-05, + "loss": 0.8914, + "step": 29210 + }, + { + "epoch": 0.9317899167703052, + "grad_norm": 0.19837765395641327, + "learning_rate": 2.4534200163725028e-05, + "loss": 0.8767, + "step": 29220 + }, + { + "epoch": 0.932108804489939, + "grad_norm": 0.1905105859041214, + "learning_rate": 2.443147506912777e-05, + "loss": 0.8675, + "step": 29230 + }, + { + "epoch": 0.932427692209573, + "grad_norm": 0.20346423983573914, + "learning_rate": 2.4329180086170172e-05, + "loss": 0.865, + "step": 29240 + }, + { + "epoch": 0.932746579929207, + "grad_norm": 0.19541527330875397, + "learning_rate": 2.4227313413967803e-05, + "loss": 0.8722, + "step": 29250 + }, + { + "epoch": 0.9330654676488408, + "grad_norm": 0.19942022860050201, + "learning_rate": 2.412587325917658e-05, + "loss": 0.872, + "step": 29260 + }, + { + "epoch": 0.9333843553684747, + "grad_norm": 0.20535457134246826, + "learning_rate": 2.4024857835961166e-05, + "loss": 0.866, + "step": 29270 + }, + { + "epoch": 0.9337032430881087, + "grad_norm": 0.19743776321411133, + "learning_rate": 2.392426536596356e-05, + "loss": 0.8674, + "step": 29280 + }, + { + "epoch": 0.9340221308077425, + "grad_norm": 0.20449943840503693, + "learning_rate": 2.3824094078271775e-05, + "loss": 0.8671, + "step": 29290 + }, + { + "epoch": 0.9343410185273765, + "grad_norm": 0.18968933820724487, + "learning_rate": 2.3724342209388646e-05, + "loss": 0.8635, + "step": 29300 + }, + { + "epoch": 0.9346599062470105, + "grad_norm": 0.18928585946559906, + "learning_rate": 2.3625008003200812e-05, + "loss": 0.8791, + "step": 29310 + }, + { + "epoch": 0.9349787939666443, + "grad_norm": 0.18719816207885742, + "learning_rate": 2.352608971094778e-05, + "loss": 0.874, + "step": 29320 + }, + { + "epoch": 0.9352976816862782, + "grad_norm": 0.1976853758096695, + "learning_rate": 2.3427585591191153e-05, + "loss": 0.8902, + "step": 29330 + }, + { + "epoch": 0.9356165694059122, + "grad_norm": 0.19273672997951508, + "learning_rate": 2.3329493909783962e-05, + "loss": 0.8614, + "step": 29340 + }, + { + "epoch": 0.935935457125546, + "grad_norm": 0.196641743183136, + "learning_rate": 2.3231812939840138e-05, + "loss": 0.8632, + "step": 29350 + }, + { + "epoch": 0.93625434484518, + "grad_norm": 0.19882845878601074, + "learning_rate": 2.3134540961704098e-05, + "loss": 0.8514, + "step": 29360 + }, + { + "epoch": 0.936573232564814, + "grad_norm": 0.1976517140865326, + "learning_rate": 2.3037676262920523e-05, + "loss": 0.8798, + "step": 29370 + }, + { + "epoch": 0.9368921202844478, + "grad_norm": 0.19035455584526062, + "learning_rate": 2.2941217138204138e-05, + "loss": 0.8758, + "step": 29380 + }, + { + "epoch": 0.9372110080040817, + "grad_norm": 0.19680151343345642, + "learning_rate": 2.2845161889409744e-05, + "loss": 0.8729, + "step": 29390 + }, + { + "epoch": 0.9375298957237157, + "grad_norm": 0.18864302337169647, + "learning_rate": 2.2749508825502283e-05, + "loss": 0.8776, + "step": 29400 + }, + { + "epoch": 0.9378487834433495, + "grad_norm": 0.1876661777496338, + "learning_rate": 2.2654256262527105e-05, + "loss": 0.8727, + "step": 29410 + }, + { + "epoch": 0.9381676711629835, + "grad_norm": 0.1983334869146347, + "learning_rate": 2.2559402523580303e-05, + "loss": 0.8783, + "step": 29420 + }, + { + "epoch": 0.9384865588826174, + "grad_norm": 0.2038796991109848, + "learning_rate": 2.2464945938779194e-05, + "loss": 0.8678, + "step": 29430 + }, + { + "epoch": 0.9388054466022513, + "grad_norm": 0.19537989795207977, + "learning_rate": 2.2370884845232913e-05, + "loss": 0.8618, + "step": 29440 + }, + { + "epoch": 0.9391243343218852, + "grad_norm": 0.19477014243602753, + "learning_rate": 2.2277217587013156e-05, + "loss": 0.8654, + "step": 29450 + }, + { + "epoch": 0.9394432220415192, + "grad_norm": 0.19163036346435547, + "learning_rate": 2.2183942515125016e-05, + "loss": 0.877, + "step": 29460 + }, + { + "epoch": 0.9397621097611532, + "grad_norm": 0.19399482011795044, + "learning_rate": 2.2091057987477944e-05, + "loss": 0.8717, + "step": 29470 + }, + { + "epoch": 0.940080997480787, + "grad_norm": 0.19094112515449524, + "learning_rate": 2.1998562368856864e-05, + "loss": 0.8701, + "step": 29480 + }, + { + "epoch": 0.940399885200421, + "grad_norm": 0.193534255027771, + "learning_rate": 2.190645403089337e-05, + "loss": 0.8704, + "step": 29490 + }, + { + "epoch": 0.9407187729200549, + "grad_norm": 0.1996348351240158, + "learning_rate": 2.181473135203705e-05, + "loss": 0.8753, + "step": 29500 + }, + { + "epoch": 0.9410376606396887, + "grad_norm": 0.19400392472743988, + "learning_rate": 2.172339271752697e-05, + "loss": 0.8673, + "step": 29510 + }, + { + "epoch": 0.9413565483593227, + "grad_norm": 0.20478451251983643, + "learning_rate": 2.1632436519363203e-05, + "loss": 0.8657, + "step": 29520 + }, + { + "epoch": 0.9416754360789567, + "grad_norm": 0.20005236566066742, + "learning_rate": 2.1541861156278552e-05, + "loss": 0.8521, + "step": 29530 + }, + { + "epoch": 0.9419943237985905, + "grad_norm": 0.19527098536491394, + "learning_rate": 2.145166503371037e-05, + "loss": 0.8808, + "step": 29540 + }, + { + "epoch": 0.9423132115182244, + "grad_norm": 0.20347745716571808, + "learning_rate": 2.1361846563772446e-05, + "loss": 0.8769, + "step": 29550 + }, + { + "epoch": 0.9426320992378584, + "grad_norm": 0.19539164006710052, + "learning_rate": 2.127240416522709e-05, + "loss": 0.8669, + "step": 29560 + }, + { + "epoch": 0.9429509869574922, + "grad_norm": 0.19707153737545013, + "learning_rate": 2.1183336263457258e-05, + "loss": 0.8782, + "step": 29570 + }, + { + "epoch": 0.9432698746771262, + "grad_norm": 0.1947970688343048, + "learning_rate": 2.109464129043888e-05, + "loss": 0.8599, + "step": 29580 + }, + { + "epoch": 0.9435887623967602, + "grad_norm": 0.20084436237812042, + "learning_rate": 2.100631768471321e-05, + "loss": 0.8637, + "step": 29590 + }, + { + "epoch": 0.943907650116394, + "grad_norm": 0.19857023656368256, + "learning_rate": 2.0918363891359365e-05, + "loss": 0.8783, + "step": 29600 + }, + { + "epoch": 0.944226537836028, + "grad_norm": 0.19687621295452118, + "learning_rate": 2.0830778361966933e-05, + "loss": 0.8798, + "step": 29610 + }, + { + "epoch": 0.9445454255556619, + "grad_norm": 0.18758653104305267, + "learning_rate": 2.0743559554608734e-05, + "loss": 0.8792, + "step": 29620 + }, + { + "epoch": 0.9448643132752957, + "grad_norm": 0.2017526775598526, + "learning_rate": 2.0656705933813646e-05, + "loss": 0.8811, + "step": 29630 + }, + { + "epoch": 0.9451832009949297, + "grad_norm": 0.20141223073005676, + "learning_rate": 2.05702159705396e-05, + "loss": 0.8659, + "step": 29640 + }, + { + "epoch": 0.9455020887145636, + "grad_norm": 0.19546176493167877, + "learning_rate": 2.0484088142146646e-05, + "loss": 0.8589, + "step": 29650 + }, + { + "epoch": 0.9458209764341975, + "grad_norm": 0.19249585270881653, + "learning_rate": 2.039832093237016e-05, + "loss": 0.8678, + "step": 29660 + }, + { + "epoch": 0.9461398641538314, + "grad_norm": 0.19663062691688538, + "learning_rate": 2.0312912831294133e-05, + "loss": 0.8846, + "step": 29670 + }, + { + "epoch": 0.9464587518734654, + "grad_norm": 0.1930922567844391, + "learning_rate": 2.022786233532461e-05, + "loss": 0.8756, + "step": 29680 + }, + { + "epoch": 0.9467776395930992, + "grad_norm": 0.19463899731636047, + "learning_rate": 2.014316794716319e-05, + "loss": 0.8715, + "step": 29690 + }, + { + "epoch": 0.9470965273127332, + "grad_norm": 0.20353105664253235, + "learning_rate": 2.0058828175780694e-05, + "loss": 0.8674, + "step": 29700 + }, + { + "epoch": 0.9474154150323671, + "grad_norm": 0.19156238436698914, + "learning_rate": 1.9974841536390925e-05, + "loss": 0.8515, + "step": 29710 + }, + { + "epoch": 0.947734302752001, + "grad_norm": 0.19340883195400238, + "learning_rate": 1.989120655042449e-05, + "loss": 0.8643, + "step": 29720 + }, + { + "epoch": 0.9480531904716349, + "grad_norm": 0.2009674459695816, + "learning_rate": 1.9807921745502785e-05, + "loss": 0.8846, + "step": 29730 + }, + { + "epoch": 0.9483720781912689, + "grad_norm": 0.1952427476644516, + "learning_rate": 1.972498565541209e-05, + "loss": 0.8587, + "step": 29740 + }, + { + "epoch": 0.9486909659109027, + "grad_norm": 0.1934279054403305, + "learning_rate": 1.964239682007775e-05, + "loss": 0.8716, + "step": 29750 + }, + { + "epoch": 0.9490098536305367, + "grad_norm": 0.19285337626934052, + "learning_rate": 1.956015378553845e-05, + "loss": 0.8741, + "step": 29760 + }, + { + "epoch": 0.9493287413501706, + "grad_norm": 0.20002107322216034, + "learning_rate": 1.947825510392065e-05, + "loss": 0.8645, + "step": 29770 + }, + { + "epoch": 0.9496476290698045, + "grad_norm": 0.1990961879491806, + "learning_rate": 1.939669933341307e-05, + "loss": 0.8674, + "step": 29780 + }, + { + "epoch": 0.9499665167894384, + "grad_norm": 0.19558599591255188, + "learning_rate": 1.9315485038241333e-05, + "loss": 0.873, + "step": 29790 + }, + { + "epoch": 0.9502854045090724, + "grad_norm": 0.19908514618873596, + "learning_rate": 1.9234610788642647e-05, + "loss": 0.8719, + "step": 29800 + }, + { + "epoch": 0.9506042922287062, + "grad_norm": 0.19451043009757996, + "learning_rate": 1.9154075160840683e-05, + "loss": 0.8565, + "step": 29810 + }, + { + "epoch": 0.9509231799483402, + "grad_norm": 0.19632062315940857, + "learning_rate": 1.907387673702047e-05, + "loss": 0.8798, + "step": 29820 + }, + { + "epoch": 0.9512420676679741, + "grad_norm": 0.19675464928150177, + "learning_rate": 1.8994014105303468e-05, + "loss": 0.8812, + "step": 29830 + }, + { + "epoch": 0.951560955387608, + "grad_norm": 0.19777949154376984, + "learning_rate": 1.8914485859722682e-05, + "loss": 0.8797, + "step": 29840 + }, + { + "epoch": 0.9518798431072419, + "grad_norm": 0.1946035623550415, + "learning_rate": 1.8835290600197926e-05, + "loss": 0.8758, + "step": 29850 + }, + { + "epoch": 0.9521987308268759, + "grad_norm": 0.19789068400859833, + "learning_rate": 1.8756426932511182e-05, + "loss": 0.8627, + "step": 29860 + }, + { + "epoch": 0.9525176185465097, + "grad_norm": 0.19965241849422455, + "learning_rate": 1.8677893468282023e-05, + "loss": 0.8783, + "step": 29870 + }, + { + "epoch": 0.9528365062661437, + "grad_norm": 0.18935656547546387, + "learning_rate": 1.8599688824943227e-05, + "loss": 0.8654, + "step": 29880 + }, + { + "epoch": 0.9531553939857776, + "grad_norm": 0.20136713981628418, + "learning_rate": 1.852181162571638e-05, + "loss": 0.863, + "step": 29890 + }, + { + "epoch": 0.9534742817054115, + "grad_norm": 0.19391171634197235, + "learning_rate": 1.8444260499587665e-05, + "loss": 0.863, + "step": 29900 + }, + { + "epoch": 0.9537931694250454, + "grad_norm": 0.19194775819778442, + "learning_rate": 1.8367034081283716e-05, + "loss": 0.8753, + "step": 29910 + }, + { + "epoch": 0.9541120571446794, + "grad_norm": 0.19135677814483643, + "learning_rate": 1.829013101124761e-05, + "loss": 0.8741, + "step": 29920 + }, + { + "epoch": 0.9544309448643132, + "grad_norm": 0.19014357030391693, + "learning_rate": 1.8213549935614886e-05, + "loss": 0.865, + "step": 29930 + }, + { + "epoch": 0.9547498325839472, + "grad_norm": 0.19929145276546478, + "learning_rate": 1.8137289506189752e-05, + "loss": 0.8698, + "step": 29940 + }, + { + "epoch": 0.9550687203035811, + "grad_norm": 0.1992056667804718, + "learning_rate": 1.806134838042133e-05, + "loss": 0.8966, + "step": 29950 + }, + { + "epoch": 0.955387608023215, + "grad_norm": 0.1995675414800644, + "learning_rate": 1.7985725221380017e-05, + "loss": 0.8682, + "step": 29960 + }, + { + "epoch": 0.9557064957428489, + "grad_norm": 0.19337597489356995, + "learning_rate": 1.7910418697733964e-05, + "loss": 0.8775, + "step": 29970 + }, + { + "epoch": 0.9560253834624829, + "grad_norm": 0.19523291289806366, + "learning_rate": 1.7835427483725635e-05, + "loss": 0.8676, + "step": 29980 + }, + { + "epoch": 0.9563442711821167, + "grad_norm": 0.20105189085006714, + "learning_rate": 1.7760750259148444e-05, + "loss": 0.8863, + "step": 29990 + }, + { + "epoch": 0.9566631589017507, + "grad_norm": 0.20133532583713531, + "learning_rate": 1.7686385709323553e-05, + "loss": 0.8821, + "step": 30000 + }, + { + "epoch": 0.9569820466213846, + "grad_norm": 0.19689635932445526, + "learning_rate": 1.7612332525076688e-05, + "loss": 0.8852, + "step": 30010 + }, + { + "epoch": 0.9573009343410185, + "grad_norm": 0.1972358226776123, + "learning_rate": 1.7538589402715118e-05, + "loss": 0.8579, + "step": 30020 + }, + { + "epoch": 0.9576198220606524, + "grad_norm": 0.19419169425964355, + "learning_rate": 1.7465155044004687e-05, + "loss": 0.8679, + "step": 30030 + }, + { + "epoch": 0.9579387097802864, + "grad_norm": 0.1934502273797989, + "learning_rate": 1.7392028156146985e-05, + "loss": 0.8688, + "step": 30040 + }, + { + "epoch": 0.9582575974999202, + "grad_norm": 0.1983090192079544, + "learning_rate": 1.7319207451756553e-05, + "loss": 0.8622, + "step": 30050 + }, + { + "epoch": 0.9585764852195542, + "grad_norm": 0.19536088407039642, + "learning_rate": 1.724669164883824e-05, + "loss": 0.8875, + "step": 30060 + }, + { + "epoch": 0.9588953729391881, + "grad_norm": 0.19346052408218384, + "learning_rate": 1.7174479470764624e-05, + "loss": 0.8837, + "step": 30070 + }, + { + "epoch": 0.959214260658822, + "grad_norm": 0.19292378425598145, + "learning_rate": 1.7102569646253555e-05, + "loss": 0.8687, + "step": 30080 + }, + { + "epoch": 0.9595331483784559, + "grad_norm": 0.1938813477754593, + "learning_rate": 1.7030960909345757e-05, + "loss": 0.8707, + "step": 30090 + }, + { + "epoch": 0.9598520360980899, + "grad_norm": 0.19189944863319397, + "learning_rate": 1.6959651999382535e-05, + "loss": 0.8792, + "step": 30100 + }, + { + "epoch": 0.9601709238177237, + "grad_norm": 0.19455473124980927, + "learning_rate": 1.688864166098361e-05, + "loss": 0.876, + "step": 30110 + }, + { + "epoch": 0.9604898115373577, + "grad_norm": 0.19673947989940643, + "learning_rate": 1.6817928644024988e-05, + "loss": 0.8685, + "step": 30120 + }, + { + "epoch": 0.9608086992569916, + "grad_norm": 0.19511686265468597, + "learning_rate": 1.6747511703616973e-05, + "loss": 0.8592, + "step": 30130 + }, + { + "epoch": 0.9611275869766255, + "grad_norm": 0.19796845316886902, + "learning_rate": 1.6677389600082237e-05, + "loss": 0.8745, + "step": 30140 + }, + { + "epoch": 0.9614464746962594, + "grad_norm": 0.19610048830509186, + "learning_rate": 1.6607561098934e-05, + "loss": 0.8725, + "step": 30150 + }, + { + "epoch": 0.9617653624158934, + "grad_norm": 0.19613753259181976, + "learning_rate": 1.6538024970854306e-05, + "loss": 0.8635, + "step": 30160 + }, + { + "epoch": 0.9620842501355272, + "grad_norm": 0.19956007599830627, + "learning_rate": 1.646877999167236e-05, + "loss": 0.8612, + "step": 30170 + }, + { + "epoch": 0.9624031378551612, + "grad_norm": 0.19366030395030975, + "learning_rate": 1.639982494234301e-05, + "loss": 0.8742, + "step": 30180 + }, + { + "epoch": 0.9627220255747951, + "grad_norm": 0.1933685839176178, + "learning_rate": 1.633115860892524e-05, + "loss": 0.8786, + "step": 30190 + }, + { + "epoch": 0.963040913294429, + "grad_norm": 0.1933773159980774, + "learning_rate": 1.6262779782560837e-05, + "loss": 0.8671, + "step": 30200 + }, + { + "epoch": 0.9633598010140629, + "grad_norm": 0.1973232924938202, + "learning_rate": 1.6194687259453118e-05, + "loss": 0.8751, + "step": 30210 + }, + { + "epoch": 0.9636786887336969, + "grad_norm": 0.1968729943037033, + "learning_rate": 1.6126879840845686e-05, + "loss": 0.8847, + "step": 30220 + }, + { + "epoch": 0.9639975764533307, + "grad_norm": 0.20425496995449066, + "learning_rate": 1.6059356333001364e-05, + "loss": 0.8802, + "step": 30230 + }, + { + "epoch": 0.9643164641729647, + "grad_norm": 0.19971290230751038, + "learning_rate": 1.599211554718118e-05, + "loss": 0.8764, + "step": 30240 + }, + { + "epoch": 0.9646353518925986, + "grad_norm": 0.19143065810203552, + "learning_rate": 1.5925156299623422e-05, + "loss": 0.8513, + "step": 30250 + }, + { + "epoch": 0.9649542396122325, + "grad_norm": 0.20351962745189667, + "learning_rate": 1.5858477411522817e-05, + "loss": 0.8607, + "step": 30260 + }, + { + "epoch": 0.9652731273318664, + "grad_norm": 0.1943381279706955, + "learning_rate": 1.5792077709009763e-05, + "loss": 0.8688, + "step": 30270 + }, + { + "epoch": 0.9655920150515004, + "grad_norm": 0.20330402255058289, + "learning_rate": 1.5725956023129666e-05, + "loss": 0.8571, + "step": 30280 + }, + { + "epoch": 0.9659109027711343, + "grad_norm": 0.19449760019779205, + "learning_rate": 1.566011118982237e-05, + "loss": 0.8664, + "step": 30290 + }, + { + "epoch": 0.9662297904907682, + "grad_norm": 0.19076880812644958, + "learning_rate": 1.559454204990166e-05, + "loss": 0.8774, + "step": 30300 + }, + { + "epoch": 0.9665486782104021, + "grad_norm": 0.19695602357387543, + "learning_rate": 1.5529247449034844e-05, + "loss": 0.8672, + "step": 30310 + }, + { + "epoch": 0.9668675659300361, + "grad_norm": 0.1943412721157074, + "learning_rate": 1.5464226237722444e-05, + "loss": 0.8717, + "step": 30320 + }, + { + "epoch": 0.9671864536496699, + "grad_norm": 0.1951615959405899, + "learning_rate": 1.5399477271277955e-05, + "loss": 0.868, + "step": 30330 + }, + { + "epoch": 0.9675053413693039, + "grad_norm": 0.19740445911884308, + "learning_rate": 1.5334999409807695e-05, + "loss": 0.8923, + "step": 30340 + }, + { + "epoch": 0.9678242290889378, + "grad_norm": 0.2107999175786972, + "learning_rate": 1.5270791518190734e-05, + "loss": 0.8857, + "step": 30350 + }, + { + "epoch": 0.9681431168085717, + "grad_norm": 0.19357022643089294, + "learning_rate": 1.5206852466058905e-05, + "loss": 0.8742, + "step": 30360 + }, + { + "epoch": 0.9684620045282056, + "grad_norm": 0.19716483354568481, + "learning_rate": 1.5143181127776916e-05, + "loss": 0.8889, + "step": 30370 + }, + { + "epoch": 0.9687808922478396, + "grad_norm": 0.19570153951644897, + "learning_rate": 1.5079776382422542e-05, + "loss": 0.8634, + "step": 30380 + }, + { + "epoch": 0.9690997799674734, + "grad_norm": 0.19478130340576172, + "learning_rate": 1.5016637113766854e-05, + "loss": 0.8679, + "step": 30390 + }, + { + "epoch": 0.9694186676871074, + "grad_norm": 0.19353069365024567, + "learning_rate": 1.495376221025461e-05, + "loss": 0.8798, + "step": 30400 + }, + { + "epoch": 0.9697375554067413, + "grad_norm": 0.19400358200073242, + "learning_rate": 1.4891150564984654e-05, + "loss": 0.862, + "step": 30410 + }, + { + "epoch": 0.9700564431263752, + "grad_norm": 0.20103727281093597, + "learning_rate": 1.482880107569045e-05, + "loss": 0.8758, + "step": 30420 + }, + { + "epoch": 0.9703753308460091, + "grad_norm": 0.19329263269901276, + "learning_rate": 1.4766712644720671e-05, + "loss": 0.8734, + "step": 30430 + }, + { + "epoch": 0.9706942185656431, + "grad_norm": 0.19491326808929443, + "learning_rate": 1.4704884179019873e-05, + "loss": 0.8584, + "step": 30440 + }, + { + "epoch": 0.9710131062852769, + "grad_norm": 0.18893815577030182, + "learning_rate": 1.464331459010925e-05, + "loss": 0.8752, + "step": 30450 + }, + { + "epoch": 0.9713319940049109, + "grad_norm": 0.19456712901592255, + "learning_rate": 1.4582002794067476e-05, + "loss": 0.8795, + "step": 30460 + }, + { + "epoch": 0.9716508817245448, + "grad_norm": 0.19400033354759216, + "learning_rate": 1.4520947711511627e-05, + "loss": 0.8845, + "step": 30470 + }, + { + "epoch": 0.9719697694441787, + "grad_norm": 0.19873221218585968, + "learning_rate": 1.446014826757816e-05, + "loss": 0.8835, + "step": 30480 + }, + { + "epoch": 0.9722886571638126, + "grad_norm": 0.19608688354492188, + "learning_rate": 1.4399603391904017e-05, + "loss": 0.8795, + "step": 30490 + }, + { + "epoch": 0.9726075448834466, + "grad_norm": 0.18778324127197266, + "learning_rate": 1.4339312018607758e-05, + "loss": 0.8729, + "step": 30500 + }, + { + "epoch": 0.9729264326030804, + "grad_norm": 0.19760362803936005, + "learning_rate": 1.4279273086270809e-05, + "loss": 0.8744, + "step": 30510 + }, + { + "epoch": 0.9732453203227144, + "grad_norm": 0.19934231042861938, + "learning_rate": 1.4219485537918775e-05, + "loss": 0.8793, + "step": 30520 + }, + { + "epoch": 0.9735642080423483, + "grad_norm": 0.19571653008460999, + "learning_rate": 1.4159948321002827e-05, + "loss": 0.8726, + "step": 30530 + }, + { + "epoch": 0.9738830957619822, + "grad_norm": 0.19574718177318573, + "learning_rate": 1.4100660387381168e-05, + "loss": 0.8679, + "step": 30540 + }, + { + "epoch": 0.9742019834816161, + "grad_norm": 0.19988127052783966, + "learning_rate": 1.4041620693300609e-05, + "loss": 0.8799, + "step": 30550 + }, + { + "epoch": 0.9745208712012501, + "grad_norm": 0.19439704716205597, + "learning_rate": 1.3982828199378157e-05, + "loss": 0.8787, + "step": 30560 + }, + { + "epoch": 0.9748397589208839, + "grad_norm": 0.195739284157753, + "learning_rate": 1.3924281870582728e-05, + "loss": 0.8751, + "step": 30570 + }, + { + "epoch": 0.9751586466405179, + "grad_norm": 0.19495446979999542, + "learning_rate": 1.3865980676216942e-05, + "loss": 0.871, + "step": 30580 + }, + { + "epoch": 0.9754775343601518, + "grad_norm": 0.1931915432214737, + "learning_rate": 1.3807923589898957e-05, + "loss": 0.8798, + "step": 30590 + }, + { + "epoch": 0.9757964220797857, + "grad_norm": 0.2003222554922104, + "learning_rate": 1.3750109589544415e-05, + "loss": 0.8967, + "step": 30600 + }, + { + "epoch": 0.9761153097994196, + "grad_norm": 0.18771512806415558, + "learning_rate": 1.3692537657348436e-05, + "loss": 0.8726, + "step": 30610 + }, + { + "epoch": 0.9764341975190536, + "grad_norm": 0.19001594185829163, + "learning_rate": 1.3635206779767707e-05, + "loss": 0.8707, + "step": 30620 + }, + { + "epoch": 0.9767530852386874, + "grad_norm": 0.19214805960655212, + "learning_rate": 1.3578115947502643e-05, + "loss": 0.8654, + "step": 30630 + }, + { + "epoch": 0.9770719729583214, + "grad_norm": 0.20203489065170288, + "learning_rate": 1.3521264155479603e-05, + "loss": 0.8795, + "step": 30640 + }, + { + "epoch": 0.9773908606779553, + "grad_norm": 0.19535228610038757, + "learning_rate": 1.3464650402833215e-05, + "loss": 0.8786, + "step": 30650 + }, + { + "epoch": 0.9777097483975892, + "grad_norm": 0.19746150076389313, + "learning_rate": 1.3408273692888739e-05, + "loss": 0.8657, + "step": 30660 + }, + { + "epoch": 0.9780286361172231, + "grad_norm": 0.19501303136348724, + "learning_rate": 1.3352133033144537e-05, + "loss": 0.8683, + "step": 30670 + }, + { + "epoch": 0.9783475238368571, + "grad_norm": 0.19745995104312897, + "learning_rate": 1.3296227435254582e-05, + "loss": 0.8707, + "step": 30680 + }, + { + "epoch": 0.9786664115564909, + "grad_norm": 0.19865138828754425, + "learning_rate": 1.3240555915011072e-05, + "loss": 0.878, + "step": 30690 + }, + { + "epoch": 0.9789852992761249, + "grad_norm": 0.192633718252182, + "learning_rate": 1.31851174923271e-05, + "loss": 0.8738, + "step": 30700 + }, + { + "epoch": 0.9793041869957588, + "grad_norm": 0.1950562447309494, + "learning_rate": 1.3129911191219392e-05, + "loss": 0.8836, + "step": 30710 + }, + { + "epoch": 0.9796230747153927, + "grad_norm": 0.19444888830184937, + "learning_rate": 1.307493603979115e-05, + "loss": 0.866, + "step": 30720 + }, + { + "epoch": 0.9799419624350266, + "grad_norm": 0.19572365283966064, + "learning_rate": 1.3020191070214908e-05, + "loss": 0.8579, + "step": 30730 + }, + { + "epoch": 0.9802608501546606, + "grad_norm": 0.19500453770160675, + "learning_rate": 1.2965675318715508e-05, + "loss": 0.8549, + "step": 30740 + }, + { + "epoch": 0.9805797378742944, + "grad_norm": 0.19590529799461365, + "learning_rate": 1.2911387825553141e-05, + "loss": 0.8733, + "step": 30750 + }, + { + "epoch": 0.9808986255939284, + "grad_norm": 0.1961866170167923, + "learning_rate": 1.2857327635006445e-05, + "loss": 0.8761, + "step": 30760 + }, + { + "epoch": 0.9812175133135623, + "grad_norm": 0.19800586998462677, + "learning_rate": 1.2803493795355673e-05, + "loss": 0.8644, + "step": 30770 + }, + { + "epoch": 0.9815364010331962, + "grad_norm": 0.19354160130023956, + "learning_rate": 1.2749885358865947e-05, + "loss": 0.8724, + "step": 30780 + }, + { + "epoch": 0.9818552887528301, + "grad_norm": 0.20072521269321442, + "learning_rate": 1.269650138177057e-05, + "loss": 0.8646, + "step": 30790 + }, + { + "epoch": 0.9821741764724641, + "grad_norm": 0.19742512702941895, + "learning_rate": 1.2643340924254416e-05, + "loss": 0.8601, + "step": 30800 + }, + { + "epoch": 0.9824930641920979, + "grad_norm": 0.1960420161485672, + "learning_rate": 1.2590403050437372e-05, + "loss": 0.8592, + "step": 30810 + }, + { + "epoch": 0.9828119519117319, + "grad_norm": 0.1924024224281311, + "learning_rate": 1.2537686828357875e-05, + "loss": 0.8603, + "step": 30820 + }, + { + "epoch": 0.9831308396313658, + "grad_norm": 0.1885261833667755, + "learning_rate": 1.2485191329956501e-05, + "loss": 0.8623, + "step": 30830 + }, + { + "epoch": 0.9834497273509997, + "grad_norm": 0.1895126849412918, + "learning_rate": 1.2432915631059624e-05, + "loss": 0.8588, + "step": 30840 + }, + { + "epoch": 0.9837686150706336, + "grad_norm": 0.19329601526260376, + "learning_rate": 1.2380858811363149e-05, + "loss": 0.8686, + "step": 30850 + }, + { + "epoch": 0.9840875027902676, + "grad_norm": 0.20007088780403137, + "learning_rate": 1.2329019954416307e-05, + "loss": 0.865, + "step": 30860 + }, + { + "epoch": 0.9844063905099014, + "grad_norm": 0.19103839993476868, + "learning_rate": 1.2277398147605525e-05, + "loss": 0.8638, + "step": 30870 + }, + { + "epoch": 0.9847252782295354, + "grad_norm": 0.1993708461523056, + "learning_rate": 1.2225992482138354e-05, + "loss": 0.8641, + "step": 30880 + }, + { + "epoch": 0.9850441659491693, + "grad_norm": 0.1940104365348816, + "learning_rate": 1.2174802053027486e-05, + "loss": 0.8703, + "step": 30890 + }, + { + "epoch": 0.9853630536688032, + "grad_norm": 0.1930721402168274, + "learning_rate": 1.2123825959074799e-05, + "loss": 0.8716, + "step": 30900 + }, + { + "epoch": 0.9856819413884371, + "grad_norm": 0.1939040571451187, + "learning_rate": 1.2073063302855502e-05, + "loss": 0.8594, + "step": 30910 + }, + { + "epoch": 0.9860008291080711, + "grad_norm": 0.19301022589206696, + "learning_rate": 1.2022513190702337e-05, + "loss": 0.8726, + "step": 30920 + }, + { + "epoch": 0.9863197168277049, + "grad_norm": 0.19931401312351227, + "learning_rate": 1.1972174732689848e-05, + "loss": 0.8657, + "step": 30930 + }, + { + "epoch": 0.9866386045473389, + "grad_norm": 0.1907082498073578, + "learning_rate": 1.1922047042618712e-05, + "loss": 0.8565, + "step": 30940 + }, + { + "epoch": 0.9869574922669728, + "grad_norm": 0.19912324845790863, + "learning_rate": 1.1872129238000134e-05, + "loss": 0.8668, + "step": 30950 + }, + { + "epoch": 0.9872763799866067, + "grad_norm": 0.19937056303024292, + "learning_rate": 1.1822420440040315e-05, + "loss": 0.8742, + "step": 30960 + }, + { + "epoch": 0.9875952677062406, + "grad_norm": 0.19725482165813446, + "learning_rate": 1.1772919773624978e-05, + "loss": 0.8697, + "step": 30970 + }, + { + "epoch": 0.9879141554258746, + "grad_norm": 0.20150548219680786, + "learning_rate": 1.1723626367303966e-05, + "loss": 0.8748, + "step": 30980 + }, + { + "epoch": 0.9882330431455084, + "grad_norm": 0.200142964720726, + "learning_rate": 1.1674539353275899e-05, + "loss": 0.8869, + "step": 30990 + }, + { + "epoch": 0.9885519308651424, + "grad_norm": 0.18772722780704498, + "learning_rate": 1.1625657867372895e-05, + "loss": 0.8625, + "step": 31000 + }, + { + "epoch": 0.9888708185847763, + "grad_norm": 0.2008049041032791, + "learning_rate": 1.1576981049045353e-05, + "loss": 0.8983, + "step": 31010 + }, + { + "epoch": 0.9891897063044102, + "grad_norm": 0.1990879476070404, + "learning_rate": 1.1528508041346812e-05, + "loss": 0.868, + "step": 31020 + }, + { + "epoch": 0.9895085940240441, + "grad_norm": 0.19177481532096863, + "learning_rate": 1.1480237990918854e-05, + "loss": 0.869, + "step": 31030 + }, + { + "epoch": 0.9898274817436781, + "grad_norm": 0.19938461482524872, + "learning_rate": 1.1432170047976097e-05, + "loss": 0.8819, + "step": 31040 + }, + { + "epoch": 0.9901463694633119, + "grad_norm": 0.20128324627876282, + "learning_rate": 1.1384303366291208e-05, + "loss": 0.8721, + "step": 31050 + }, + { + "epoch": 0.9904652571829459, + "grad_norm": 0.19463203847408295, + "learning_rate": 1.1336637103180043e-05, + "loss": 0.863, + "step": 31060 + }, + { + "epoch": 0.9907841449025798, + "grad_norm": 0.20132602751255035, + "learning_rate": 1.1289170419486774e-05, + "loss": 0.8694, + "step": 31070 + }, + { + "epoch": 0.9911030326222137, + "grad_norm": 0.19993562996387482, + "learning_rate": 1.1241902479569133e-05, + "loss": 0.8651, + "step": 31080 + }, + { + "epoch": 0.9914219203418476, + "grad_norm": 0.20050421357154846, + "learning_rate": 1.1194832451283707e-05, + "loss": 0.8667, + "step": 31090 + }, + { + "epoch": 0.9917408080614816, + "grad_norm": 0.19323430955410004, + "learning_rate": 1.1147959505971274e-05, + "loss": 0.8682, + "step": 31100 + }, + { + "epoch": 0.9920596957811154, + "grad_norm": 0.19004856050014496, + "learning_rate": 1.110128281844223e-05, + "loss": 0.8526, + "step": 31110 + }, + { + "epoch": 0.9923785835007494, + "grad_norm": 0.19814559817314148, + "learning_rate": 1.1054801566962041e-05, + "loss": 0.8697, + "step": 31120 + }, + { + "epoch": 0.9926974712203833, + "grad_norm": 0.1958136260509491, + "learning_rate": 1.1008514933236803e-05, + "loss": 0.8659, + "step": 31130 + }, + { + "epoch": 0.9930163589400173, + "grad_norm": 0.19568458199501038, + "learning_rate": 1.096242210239881e-05, + "loss": 0.8782, + "step": 31140 + }, + { + "epoch": 0.9933352466596511, + "grad_norm": 0.20434001088142395, + "learning_rate": 1.0916522262992225e-05, + "loss": 0.868, + "step": 31150 + }, + { + "epoch": 0.9936541343792851, + "grad_norm": 0.19457891583442688, + "learning_rate": 1.0870814606958793e-05, + "loss": 0.8653, + "step": 31160 + }, + { + "epoch": 0.993973022098919, + "grad_norm": 0.19215627014636993, + "learning_rate": 1.0825298329623609e-05, + "loss": 0.8612, + "step": 31170 + }, + { + "epoch": 0.9942919098185529, + "grad_norm": 0.19397249817848206, + "learning_rate": 1.0779972629680952e-05, + "loss": 0.8594, + "step": 31180 + }, + { + "epoch": 0.9946107975381868, + "grad_norm": 0.19824834167957306, + "learning_rate": 1.0734836709180184e-05, + "loss": 0.8608, + "step": 31190 + }, + { + "epoch": 0.9949296852578208, + "grad_norm": 0.20180141925811768, + "learning_rate": 1.0689889773511703e-05, + "loss": 0.8641, + "step": 31200 + }, + { + "epoch": 0.9952485729774546, + "grad_norm": 0.2057270109653473, + "learning_rate": 1.0645131031392937e-05, + "loss": 0.8876, + "step": 31210 + }, + { + "epoch": 0.9955674606970886, + "grad_norm": 0.19196845591068268, + "learning_rate": 1.060055969485445e-05, + "loss": 0.8536, + "step": 31220 + }, + { + "epoch": 0.9958863484167225, + "grad_norm": 0.18629442155361176, + "learning_rate": 1.0556174979226025e-05, + "loss": 0.8596, + "step": 31230 + }, + { + "epoch": 0.9962052361363564, + "grad_norm": 0.1941034346818924, + "learning_rate": 1.0511976103122883e-05, + "loss": 0.856, + "step": 31240 + }, + { + "epoch": 0.9965241238559903, + "grad_norm": 0.20251859724521637, + "learning_rate": 1.0467962288431913e-05, + "loss": 0.8726, + "step": 31250 + }, + { + "epoch": 0.9968430115756243, + "grad_norm": 0.19077450037002563, + "learning_rate": 1.0424132760297974e-05, + "loss": 0.8549, + "step": 31260 + }, + { + "epoch": 0.9971618992952581, + "grad_norm": 0.2010996788740158, + "learning_rate": 1.0380486747110261e-05, + "loss": 0.8776, + "step": 31270 + }, + { + "epoch": 0.9974807870148921, + "grad_norm": 0.2025349736213684, + "learning_rate": 1.033702348048871e-05, + "loss": 0.8617, + "step": 31280 + }, + { + "epoch": 0.997799674734526, + "grad_norm": 0.19157281517982483, + "learning_rate": 1.0293742195270484e-05, + "loss": 0.8576, + "step": 31290 + }, + { + "epoch": 0.9981185624541599, + "grad_norm": 0.20192226767539978, + "learning_rate": 1.0250642129496489e-05, + "loss": 0.8726, + "step": 31300 + }, + { + "epoch": 0.9984374501737938, + "grad_norm": 0.19540460407733917, + "learning_rate": 1.0207722524397968e-05, + "loss": 0.8525, + "step": 31310 + }, + { + "epoch": 0.9987563378934278, + "grad_norm": 0.19221796095371246, + "learning_rate": 1.0164982624383143e-05, + "loss": 0.8681, + "step": 31320 + }, + { + "epoch": 0.9990752256130616, + "grad_norm": 0.1946989744901657, + "learning_rate": 1.0122421677023911e-05, + "loss": 0.8784, + "step": 31330 + }, + { + "epoch": 0.9993941133326956, + "grad_norm": 0.1920672059059143, + "learning_rate": 1.0080038933042593e-05, + "loss": 0.8539, + "step": 31340 + }, + { + "epoch": 0.9997130010523295, + "grad_norm": 0.19885636866092682, + "learning_rate": 1.003783364629875e-05, + "loss": 0.8791, + "step": 31350 + } + ], + "logging_steps": 10, + "max_steps": 31359, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1770770611838413e+18, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null +}