{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.6255767963085037, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0065919578114700065, "grad_norm": 10.419755935668945, "learning_rate": 7.017543859649123e-06, "loss": 1.3206, "step": 10 }, { "epoch": 0.013183915622940013, "grad_norm": 7.308764457702637, "learning_rate": 1.4035087719298246e-05, "loss": 1.0588, "step": 20 }, { "epoch": 0.01977587343441002, "grad_norm": 3.7246360778808594, "learning_rate": 2.105263157894737e-05, "loss": 0.4529, "step": 30 }, { "epoch": 0.026367831245880026, "grad_norm": 1.944443702697754, "learning_rate": 2.8070175438596492e-05, "loss": 0.3158, "step": 40 }, { "epoch": 0.03295978905735003, "grad_norm": 1.4747883081436157, "learning_rate": 3.508771929824561e-05, "loss": 0.2119, "step": 50 }, { "epoch": 0.03955174686882004, "grad_norm": 1.336323857307434, "learning_rate": 4.210526315789474e-05, "loss": 0.1738, "step": 60 }, { "epoch": 0.04614370468029005, "grad_norm": 1.3995416164398193, "learning_rate": 4.912280701754386e-05, "loss": 0.1525, "step": 70 }, { "epoch": 0.05273566249176005, "grad_norm": 1.2422436475753784, "learning_rate": 5.6140350877192984e-05, "loss": 0.1339, "step": 80 }, { "epoch": 0.05932762030323006, "grad_norm": 1.134950876235962, "learning_rate": 6.31578947368421e-05, "loss": 0.1051, "step": 90 }, { "epoch": 0.06591957811470006, "grad_norm": 1.2677438259124756, "learning_rate": 7.017543859649122e-05, "loss": 0.1054, "step": 100 }, { "epoch": 0.07251153592617007, "grad_norm": 0.9868601560592651, "learning_rate": 7.719298245614036e-05, "loss": 0.0977, "step": 110 }, { "epoch": 0.07910349373764008, "grad_norm": 1.3295618295669556, "learning_rate": 8.421052631578948e-05, "loss": 0.0948, "step": 120 }, { "epoch": 0.08569545154911008, "grad_norm": 0.7764300107955933, "learning_rate": 9.12280701754386e-05, "loss": 0.0714, "step": 130 }, { "epoch": 0.0922874093605801, "grad_norm": 1.17441987991333, "learning_rate": 9.824561403508771e-05, "loss": 0.0742, "step": 140 }, { "epoch": 0.09887936717205009, "grad_norm": 1.4673572778701782, "learning_rate": 0.00010526315789473685, "loss": 0.0782, "step": 150 }, { "epoch": 0.1054713249835201, "grad_norm": 0.6098851561546326, "learning_rate": 0.00011228070175438597, "loss": 0.0778, "step": 160 }, { "epoch": 0.11206328279499012, "grad_norm": 0.921156644821167, "learning_rate": 0.00011929824561403509, "loss": 0.0761, "step": 170 }, { "epoch": 0.11865524060646011, "grad_norm": 0.9673042297363281, "learning_rate": 0.0001263157894736842, "loss": 0.0723, "step": 180 }, { "epoch": 0.12524719841793014, "grad_norm": 0.6645211577415466, "learning_rate": 0.00013333333333333334, "loss": 0.0672, "step": 190 }, { "epoch": 0.13183915622940012, "grad_norm": 1.5125294923782349, "learning_rate": 0.00014035087719298245, "loss": 0.0778, "step": 200 }, { "epoch": 0.13843111404087013, "grad_norm": 0.8468068242073059, "learning_rate": 0.00014736842105263158, "loss": 0.0674, "step": 210 }, { "epoch": 0.14502307185234015, "grad_norm": 0.9815834164619446, "learning_rate": 0.0001543859649122807, "loss": 0.068, "step": 220 }, { "epoch": 0.15161502966381016, "grad_norm": 0.7477302551269531, "learning_rate": 0.00016140350877192982, "loss": 0.0611, "step": 230 }, { "epoch": 0.15820698747528017, "grad_norm": 0.815642237663269, "learning_rate": 0.00016842105263157895, "loss": 0.0631, "step": 240 }, { "epoch": 0.16479894528675015, "grad_norm": 0.5081785321235657, "learning_rate": 0.00017543859649122806, "loss": 0.0607, "step": 250 }, { "epoch": 0.17139090309822017, "grad_norm": 0.718622624874115, "learning_rate": 0.0001824561403508772, "loss": 0.063, "step": 260 }, { "epoch": 0.17798286090969018, "grad_norm": 0.6826403141021729, "learning_rate": 0.00018947368421052632, "loss": 0.0611, "step": 270 }, { "epoch": 0.1845748187211602, "grad_norm": 0.730829656124115, "learning_rate": 0.00019649122807017543, "loss": 0.0636, "step": 280 }, { "epoch": 0.1911667765326302, "grad_norm": 0.976826012134552, "learning_rate": 0.00019999957723376656, "loss": 0.05, "step": 290 }, { "epoch": 0.19775873434410018, "grad_norm": 0.943250298500061, "learning_rate": 0.00019999619512534684, "loss": 0.0578, "step": 300 }, { "epoch": 0.2043506921555702, "grad_norm": 0.8867354393005371, "learning_rate": 0.00019998943102289446, "loss": 0.0623, "step": 310 }, { "epoch": 0.2109426499670402, "grad_norm": 0.4728466272354126, "learning_rate": 0.0001999792851551796, "loss": 0.0527, "step": 320 }, { "epoch": 0.21753460777851022, "grad_norm": 1.1633834838867188, "learning_rate": 0.00019996575786534806, "loss": 0.0568, "step": 330 }, { "epoch": 0.22412656558998023, "grad_norm": 1.2754548788070679, "learning_rate": 0.00019994884961090933, "loss": 0.0542, "step": 340 }, { "epoch": 0.23071852340145024, "grad_norm": 1.3917484283447266, "learning_rate": 0.0001999285609637213, "loss": 0.0627, "step": 350 }, { "epoch": 0.23731048121292023, "grad_norm": 0.7138040065765381, "learning_rate": 0.00019990489260997095, "loss": 0.0548, "step": 360 }, { "epoch": 0.24390243902439024, "grad_norm": 0.48101598024368286, "learning_rate": 0.00019987784535015107, "loss": 0.039, "step": 370 }, { "epoch": 0.2504943968358603, "grad_norm": 0.6185160279273987, "learning_rate": 0.00019984742009903313, "loss": 0.0516, "step": 380 }, { "epoch": 0.25708635464733026, "grad_norm": 0.5346279144287109, "learning_rate": 0.00019981361788563647, "loss": 0.0594, "step": 390 }, { "epoch": 0.26367831245880025, "grad_norm": 0.644879937171936, "learning_rate": 0.00019977643985319345, "loss": 0.0415, "step": 400 }, { "epoch": 0.2702702702702703, "grad_norm": 0.8102220296859741, "learning_rate": 0.00019973588725911076, "loss": 0.0466, "step": 410 }, { "epoch": 0.27686222808174027, "grad_norm": 0.6942636370658875, "learning_rate": 0.00019969196147492685, "loss": 0.0498, "step": 420 }, { "epoch": 0.2834541858932103, "grad_norm": 0.9616207480430603, "learning_rate": 0.0001996446639862657, "loss": 0.0489, "step": 430 }, { "epoch": 0.2900461437046803, "grad_norm": 0.7776756882667542, "learning_rate": 0.00019959399639278636, "loss": 0.0481, "step": 440 }, { "epoch": 0.2966381015161503, "grad_norm": 1.2910106182098389, "learning_rate": 0.0001995399604081291, "loss": 0.045, "step": 450 }, { "epoch": 0.3032300593276203, "grad_norm": 0.5743927955627441, "learning_rate": 0.00019948255785985717, "loss": 0.0419, "step": 460 }, { "epoch": 0.3098220171390903, "grad_norm": 0.6184401512145996, "learning_rate": 0.0001994217906893952, "loss": 0.0445, "step": 470 }, { "epoch": 0.31641397495056034, "grad_norm": 0.7128023505210876, "learning_rate": 0.00019935766095196348, "loss": 0.038, "step": 480 }, { "epoch": 0.3230059327620303, "grad_norm": 0.5777243375778198, "learning_rate": 0.00019929017081650845, "loss": 0.0417, "step": 490 }, { "epoch": 0.3295978905735003, "grad_norm": 0.6238442659378052, "learning_rate": 0.00019921932256562928, "loss": 0.0343, "step": 500 }, { "epoch": 0.33618984838497035, "grad_norm": 0.5910335183143616, "learning_rate": 0.00019914511859550076, "loss": 0.0428, "step": 510 }, { "epoch": 0.34278180619644033, "grad_norm": 0.5180881023406982, "learning_rate": 0.00019906756141579226, "loss": 0.034, "step": 520 }, { "epoch": 0.34937376400791037, "grad_norm": 0.34267082810401917, "learning_rate": 0.00019898665364958274, "loss": 0.0391, "step": 530 }, { "epoch": 0.35596572181938035, "grad_norm": 0.4261837303638458, "learning_rate": 0.00019890239803327213, "loss": 0.0453, "step": 540 }, { "epoch": 0.36255767963085034, "grad_norm": 0.5537737011909485, "learning_rate": 0.0001988147974164888, "loss": 0.0323, "step": 550 }, { "epoch": 0.3691496374423204, "grad_norm": 0.419604629278183, "learning_rate": 0.0001987238547619931, "loss": 0.037, "step": 560 }, { "epoch": 0.37574159525379036, "grad_norm": 0.8267730474472046, "learning_rate": 0.00019862957314557722, "loss": 0.0452, "step": 570 }, { "epoch": 0.3823335530652604, "grad_norm": 0.4021684229373932, "learning_rate": 0.0001985319557559611, "loss": 0.042, "step": 580 }, { "epoch": 0.3889255108767304, "grad_norm": 0.5992162227630615, "learning_rate": 0.0001984310058946847, "loss": 0.0323, "step": 590 }, { "epoch": 0.39551746868820037, "grad_norm": 0.5908779501914978, "learning_rate": 0.00019832672697599616, "loss": 0.0391, "step": 600 }, { "epoch": 0.4021094264996704, "grad_norm": 0.6762053370475769, "learning_rate": 0.0001982191225267365, "loss": 0.0444, "step": 610 }, { "epoch": 0.4087013843111404, "grad_norm": 0.6451637148857117, "learning_rate": 0.00019810819618622025, "loss": 0.0405, "step": 620 }, { "epoch": 0.41529334212261043, "grad_norm": 0.40719446539878845, "learning_rate": 0.00019799395170611237, "loss": 0.032, "step": 630 }, { "epoch": 0.4218852999340804, "grad_norm": 0.3678116202354431, "learning_rate": 0.00019787639295030135, "loss": 0.0295, "step": 640 }, { "epoch": 0.42847725774555045, "grad_norm": 0.5194733738899231, "learning_rate": 0.00019775552389476864, "loss": 0.0336, "step": 650 }, { "epoch": 0.43506921555702044, "grad_norm": 0.3977694511413574, "learning_rate": 0.00019763134862745395, "loss": 0.0339, "step": 660 }, { "epoch": 0.4416611733684904, "grad_norm": 0.4669642150402069, "learning_rate": 0.00019750387134811725, "loss": 0.0256, "step": 670 }, { "epoch": 0.44825313117996046, "grad_norm": 0.36751696467399597, "learning_rate": 0.00019737309636819658, "loss": 0.0437, "step": 680 }, { "epoch": 0.45484508899143045, "grad_norm": 0.8601805567741394, "learning_rate": 0.00019723902811066222, "loss": 0.0311, "step": 690 }, { "epoch": 0.4614370468029005, "grad_norm": 0.3772714138031006, "learning_rate": 0.00019710167110986718, "loss": 0.0334, "step": 700 }, { "epoch": 0.46802900461437047, "grad_norm": 0.5447872281074524, "learning_rate": 0.00019696103001139373, "loss": 0.0379, "step": 710 }, { "epoch": 0.47462096242584045, "grad_norm": 0.3882730007171631, "learning_rate": 0.00019681710957189651, "loss": 0.0314, "step": 720 }, { "epoch": 0.4812129202373105, "grad_norm": 0.3971300423145294, "learning_rate": 0.00019666991465894134, "loss": 0.0349, "step": 730 }, { "epoch": 0.4878048780487805, "grad_norm": 0.41290801763534546, "learning_rate": 0.00019651945025084082, "loss": 0.026, "step": 740 }, { "epoch": 0.4943968358602505, "grad_norm": 0.5641320943832397, "learning_rate": 0.00019636572143648588, "loss": 0.0415, "step": 750 }, { "epoch": 0.5009887936717206, "grad_norm": 0.537247896194458, "learning_rate": 0.00019620873341517366, "loss": 0.0336, "step": 760 }, { "epoch": 0.5075807514831905, "grad_norm": 0.3074246644973755, "learning_rate": 0.00019604849149643173, "loss": 0.0353, "step": 770 }, { "epoch": 0.5141727092946605, "grad_norm": 0.5712188482284546, "learning_rate": 0.0001958850010998384, "loss": 0.0356, "step": 780 }, { "epoch": 0.5207646671061306, "grad_norm": 0.7442903518676758, "learning_rate": 0.00019571826775483954, "loss": 0.0329, "step": 790 }, { "epoch": 0.5273566249176005, "grad_norm": 0.3076368570327759, "learning_rate": 0.00019554829710056147, "loss": 0.0366, "step": 800 }, { "epoch": 0.5339485827290705, "grad_norm": 0.4598945081233978, "learning_rate": 0.00019537509488562032, "loss": 0.0271, "step": 810 }, { "epoch": 0.5405405405405406, "grad_norm": 0.6214160919189453, "learning_rate": 0.0001951986669679275, "loss": 0.0414, "step": 820 }, { "epoch": 0.5471324983520105, "grad_norm": 0.5321261882781982, "learning_rate": 0.00019501901931449176, "loss": 0.0322, "step": 830 }, { "epoch": 0.5537244561634805, "grad_norm": 0.5436400771141052, "learning_rate": 0.00019483615800121716, "loss": 0.0261, "step": 840 }, { "epoch": 0.5603164139749506, "grad_norm": 0.39354392886161804, "learning_rate": 0.00019465008921269774, "loss": 0.0299, "step": 850 }, { "epoch": 0.5669083717864206, "grad_norm": 0.5768340229988098, "learning_rate": 0.00019446081924200825, "loss": 0.034, "step": 860 }, { "epoch": 0.5735003295978905, "grad_norm": 0.43069860339164734, "learning_rate": 0.0001942683544904914, "loss": 0.0286, "step": 870 }, { "epoch": 0.5800922874093606, "grad_norm": 0.6545283794403076, "learning_rate": 0.0001940727014675412, "loss": 0.0353, "step": 880 }, { "epoch": 0.5866842452208306, "grad_norm": 0.4092136323451996, "learning_rate": 0.00019387386679038307, "loss": 0.0284, "step": 890 }, { "epoch": 0.5932762030323006, "grad_norm": 0.38882556557655334, "learning_rate": 0.0001936718571838498, "loss": 0.0294, "step": 900 }, { "epoch": 0.5998681608437706, "grad_norm": 0.6306661367416382, "learning_rate": 0.0001934666794801541, "loss": 0.0331, "step": 910 }, { "epoch": 0.6064601186552406, "grad_norm": 0.35673242807388306, "learning_rate": 0.00019325834061865776, "loss": 0.0356, "step": 920 }, { "epoch": 0.6130520764667106, "grad_norm": 0.5133143067359924, "learning_rate": 0.00019304684764563675, "loss": 0.0349, "step": 930 }, { "epoch": 0.6196440342781806, "grad_norm": 0.32806074619293213, "learning_rate": 0.00019283220771404297, "loss": 0.0333, "step": 940 }, { "epoch": 0.6262359920896506, "grad_norm": 0.4773825705051422, "learning_rate": 0.00019261442808326225, "loss": 0.0302, "step": 950 }, { "epoch": 0.6328279499011207, "grad_norm": 0.7042701244354248, "learning_rate": 0.000192393516118869, "loss": 0.0375, "step": 960 }, { "epoch": 0.6394199077125906, "grad_norm": 0.40759798884391785, "learning_rate": 0.00019216947929237696, "loss": 0.0273, "step": 970 }, { "epoch": 0.6460118655240606, "grad_norm": 0.2918468713760376, "learning_rate": 0.0001919423251809865, "loss": 0.0321, "step": 980 }, { "epoch": 0.6526038233355307, "grad_norm": 0.3963848352432251, "learning_rate": 0.00019171206146732848, "loss": 0.0272, "step": 990 }, { "epoch": 0.6591957811470006, "grad_norm": 0.4451894462108612, "learning_rate": 0.00019147869593920424, "loss": 0.0348, "step": 1000 }, { "epoch": 0.6657877389584707, "grad_norm": 0.5182775855064392, "learning_rate": 0.00019124223648932232, "loss": 0.037, "step": 1010 }, { "epoch": 0.6723796967699407, "grad_norm": 0.6909258961677551, "learning_rate": 0.00019100269111503148, "loss": 0.0307, "step": 1020 }, { "epoch": 0.6789716545814107, "grad_norm": 0.4598976969718933, "learning_rate": 0.0001907600679180503, "loss": 0.0308, "step": 1030 }, { "epoch": 0.6855636123928807, "grad_norm": 0.5611470937728882, "learning_rate": 0.00019051437510419296, "loss": 0.0303, "step": 1040 }, { "epoch": 0.6921555702043507, "grad_norm": 0.2982472777366638, "learning_rate": 0.00019026562098309202, "loss": 0.0273, "step": 1050 }, { "epoch": 0.6987475280158207, "grad_norm": 0.4202234745025635, "learning_rate": 0.00019001381396791707, "loss": 0.0348, "step": 1060 }, { "epoch": 0.7053394858272907, "grad_norm": 0.3558894991874695, "learning_rate": 0.0001897589625750904, "loss": 0.0233, "step": 1070 }, { "epoch": 0.7119314436387607, "grad_norm": 0.4467407464981079, "learning_rate": 0.00018950107542399885, "loss": 0.0284, "step": 1080 }, { "epoch": 0.7185234014502307, "grad_norm": 1.036746859550476, "learning_rate": 0.00018924016123670235, "loss": 0.0969, "step": 1090 }, { "epoch": 0.7251153592617007, "grad_norm": 0.5457211136817932, "learning_rate": 0.00018897622883763888, "loss": 0.0622, "step": 1100 }, { "epoch": 0.7317073170731707, "grad_norm": 0.675666093826294, "learning_rate": 0.0001887092871533261, "loss": 0.0585, "step": 1110 }, { "epoch": 0.7382992748846408, "grad_norm": 1.0533791780471802, "learning_rate": 0.0001884393452120594, "loss": 0.0619, "step": 1120 }, { "epoch": 0.7448912326961108, "grad_norm": 0.5425913333892822, "learning_rate": 0.0001881664121436064, "loss": 0.0619, "step": 1130 }, { "epoch": 0.7514831905075807, "grad_norm": 0.46993035078048706, "learning_rate": 0.0001878904971788985, "loss": 0.0614, "step": 1140 }, { "epoch": 0.7580751483190508, "grad_norm": 0.5931456089019775, "learning_rate": 0.00018761160964971837, "loss": 0.0636, "step": 1150 }, { "epoch": 0.7646671061305208, "grad_norm": 0.42014068365097046, "learning_rate": 0.00018732975898838445, "loss": 0.0637, "step": 1160 }, { "epoch": 0.7712590639419907, "grad_norm": 0.5799768567085266, "learning_rate": 0.00018704495472743207, "loss": 0.062, "step": 1170 }, { "epoch": 0.7778510217534608, "grad_norm": 0.9316193461418152, "learning_rate": 0.0001867572064992908, "loss": 0.0674, "step": 1180 }, { "epoch": 0.7844429795649308, "grad_norm": 0.9099822044372559, "learning_rate": 0.00018646652403595885, "loss": 0.0656, "step": 1190 }, { "epoch": 0.7910349373764007, "grad_norm": 0.5124749541282654, "learning_rate": 0.00018617291716867382, "loss": 0.0801, "step": 1200 }, { "epoch": 0.7976268951878708, "grad_norm": 0.6862666010856628, "learning_rate": 0.00018587639582758031, "loss": 0.0668, "step": 1210 }, { "epoch": 0.8042188529993408, "grad_norm": 0.6310227513313293, "learning_rate": 0.00018557697004139393, "loss": 0.0663, "step": 1220 }, { "epoch": 0.8108108108108109, "grad_norm": 0.6013323068618774, "learning_rate": 0.00018527464993706226, "loss": 0.065, "step": 1230 }, { "epoch": 0.8174027686222808, "grad_norm": 0.5435302257537842, "learning_rate": 0.00018496944573942218, "loss": 0.0569, "step": 1240 }, { "epoch": 0.8239947264337508, "grad_norm": 0.6998788118362427, "learning_rate": 0.0001846613677708542, "loss": 0.0656, "step": 1250 }, { "epoch": 0.8305866842452209, "grad_norm": 0.6783934235572815, "learning_rate": 0.0001843504264509333, "loss": 0.0524, "step": 1260 }, { "epoch": 0.8371786420566908, "grad_norm": 0.6428571343421936, "learning_rate": 0.00018403663229607652, "loss": 0.0537, "step": 1270 }, { "epoch": 0.8437705998681608, "grad_norm": 0.6518540382385254, "learning_rate": 0.00018371999591918723, "loss": 0.0679, "step": 1280 }, { "epoch": 0.8503625576796309, "grad_norm": 0.5521682500839233, "learning_rate": 0.0001834005280292963, "loss": 0.0609, "step": 1290 }, { "epoch": 0.8569545154911009, "grad_norm": 0.5179183483123779, "learning_rate": 0.00018307823943119976, "loss": 0.0623, "step": 1300 }, { "epoch": 0.8635464733025708, "grad_norm": 0.46778643131256104, "learning_rate": 0.00018275314102509356, "loss": 0.0543, "step": 1310 }, { "epoch": 0.8701384311140409, "grad_norm": 0.5073345303535461, "learning_rate": 0.00018242524380620472, "loss": 0.0498, "step": 1320 }, { "epoch": 0.8767303889255109, "grad_norm": 0.6397154331207275, "learning_rate": 0.00018209455886441958, "loss": 0.0638, "step": 1330 }, { "epoch": 0.8833223467369808, "grad_norm": 0.5524711012840271, "learning_rate": 0.00018176109738390865, "loss": 0.0697, "step": 1340 }, { "epoch": 0.8899143045484509, "grad_norm": 0.5283142328262329, "learning_rate": 0.00018142487064274848, "loss": 0.0535, "step": 1350 }, { "epoch": 0.8965062623599209, "grad_norm": 0.6182568669319153, "learning_rate": 0.00018108589001254, "loss": 0.075, "step": 1360 }, { "epoch": 0.9030982201713909, "grad_norm": 0.6030199527740479, "learning_rate": 0.0001807441669580242, "loss": 0.0507, "step": 1370 }, { "epoch": 0.9096901779828609, "grad_norm": 0.8080623149871826, "learning_rate": 0.00018039971303669407, "loss": 0.0573, "step": 1380 }, { "epoch": 0.9162821357943309, "grad_norm": 0.666598379611969, "learning_rate": 0.00018005253989840397, "loss": 0.0564, "step": 1390 }, { "epoch": 0.922874093605801, "grad_norm": 0.4886372983455658, "learning_rate": 0.00017970265928497542, "loss": 0.058, "step": 1400 }, { "epoch": 0.9294660514172709, "grad_norm": 0.5167548656463623, "learning_rate": 0.00017935008302980023, "loss": 0.0651, "step": 1410 }, { "epoch": 0.9360580092287409, "grad_norm": 0.48855462670326233, "learning_rate": 0.00017899482305743997, "loss": 0.0539, "step": 1420 }, { "epoch": 0.942649967040211, "grad_norm": 0.42956027388572693, "learning_rate": 0.00017863689138322282, "loss": 0.0517, "step": 1430 }, { "epoch": 0.9492419248516809, "grad_norm": 0.8002837896347046, "learning_rate": 0.00017827630011283739, "loss": 0.0417, "step": 1440 }, { "epoch": 0.955833882663151, "grad_norm": 0.6120493412017822, "learning_rate": 0.00017791306144192284, "loss": 0.0558, "step": 1450 }, { "epoch": 0.962425840474621, "grad_norm": 0.4339682161808014, "learning_rate": 0.00017754718765565681, "loss": 0.0502, "step": 1460 }, { "epoch": 0.9690177982860909, "grad_norm": 0.34096962213516235, "learning_rate": 0.00017717869112833978, "loss": 0.0362, "step": 1470 }, { "epoch": 0.975609756097561, "grad_norm": 0.2626149356365204, "learning_rate": 0.00017680758432297644, "loss": 0.0505, "step": 1480 }, { "epoch": 0.982201713909031, "grad_norm": 0.46172085404396057, "learning_rate": 0.00017643387979085444, "loss": 0.0481, "step": 1490 }, { "epoch": 0.988793671720501, "grad_norm": 0.7232904434204102, "learning_rate": 0.00017605759017111963, "loss": 0.0609, "step": 1500 }, { "epoch": 0.995385629531971, "grad_norm": 0.594722330570221, "learning_rate": 0.00017567872819034875, "loss": 0.0457, "step": 1510 }, { "epoch": 1.0019775873434411, "grad_norm": 0.5150778889656067, "learning_rate": 0.00017529730666211886, "loss": 0.0508, "step": 1520 }, { "epoch": 1.008569545154911, "grad_norm": 0.7054618000984192, "learning_rate": 0.00017491333848657418, "loss": 0.0584, "step": 1530 }, { "epoch": 1.015161502966381, "grad_norm": 0.7400113940238953, "learning_rate": 0.00017452683664998954, "loss": 0.0479, "step": 1540 }, { "epoch": 1.0217534607778511, "grad_norm": 0.42146751284599304, "learning_rate": 0.00017413781422433145, "loss": 0.0593, "step": 1550 }, { "epoch": 1.028345418589321, "grad_norm": 0.43894773721694946, "learning_rate": 0.00017374628436681567, "loss": 0.0516, "step": 1560 }, { "epoch": 1.034937376400791, "grad_norm": 0.6406469941139221, "learning_rate": 0.0001733522603194625, "loss": 0.0426, "step": 1570 }, { "epoch": 1.0415293342122611, "grad_norm": 0.4033351540565491, "learning_rate": 0.00017295575540864877, "loss": 0.0508, "step": 1580 }, { "epoch": 1.048121292023731, "grad_norm": 0.4245661199092865, "learning_rate": 0.0001725567830446571, "loss": 0.0525, "step": 1590 }, { "epoch": 1.054713249835201, "grad_norm": 0.4368399381637573, "learning_rate": 0.0001721553567212225, "loss": 0.0525, "step": 1600 }, { "epoch": 1.0613052076466711, "grad_norm": 0.4312591552734375, "learning_rate": 0.00017175149001507581, "loss": 0.0501, "step": 1610 }, { "epoch": 1.067897165458141, "grad_norm": 0.43050646781921387, "learning_rate": 0.0001713451965854847, "loss": 0.0511, "step": 1620 }, { "epoch": 1.074489123269611, "grad_norm": 0.28264015913009644, "learning_rate": 0.0001709364901737915, "loss": 0.0407, "step": 1630 }, { "epoch": 1.0810810810810811, "grad_norm": 0.44433021545410156, "learning_rate": 0.00017052538460294865, "loss": 0.0434, "step": 1640 }, { "epoch": 1.087673038892551, "grad_norm": 0.40889009833335876, "learning_rate": 0.00017011189377705104, "loss": 0.0363, "step": 1650 }, { "epoch": 1.094264996704021, "grad_norm": 0.5670244693756104, "learning_rate": 0.00016969603168086583, "loss": 0.0526, "step": 1660 }, { "epoch": 1.1008569545154911, "grad_norm": 0.4179556965827942, "learning_rate": 0.00016927781237935946, "loss": 0.0444, "step": 1670 }, { "epoch": 1.107448912326961, "grad_norm": 0.49149006605148315, "learning_rate": 0.00016885725001722197, "loss": 0.0503, "step": 1680 }, { "epoch": 1.1140408701384312, "grad_norm": 0.5782753229141235, "learning_rate": 0.00016843435881838846, "loss": 0.0644, "step": 1690 }, { "epoch": 1.1206328279499012, "grad_norm": 0.3774556815624237, "learning_rate": 0.00016800915308555833, "loss": 0.0434, "step": 1700 }, { "epoch": 1.127224785761371, "grad_norm": 0.4292038679122925, "learning_rate": 0.00016758164719971123, "loss": 0.0369, "step": 1710 }, { "epoch": 1.133816743572841, "grad_norm": 0.4183276891708374, "learning_rate": 0.00016715185561962085, "loss": 0.0456, "step": 1720 }, { "epoch": 1.1404087013843112, "grad_norm": 0.5465393662452698, "learning_rate": 0.00016671979288136588, "loss": 0.0467, "step": 1730 }, { "epoch": 1.147000659195781, "grad_norm": 0.5326853394508362, "learning_rate": 0.0001662854735978383, "loss": 0.0541, "step": 1740 }, { "epoch": 1.1535926170072512, "grad_norm": 0.4676070809364319, "learning_rate": 0.00016584891245824933, "loss": 0.0409, "step": 1750 }, { "epoch": 1.1601845748187212, "grad_norm": 0.3939380645751953, "learning_rate": 0.00016541012422763242, "loss": 0.0405, "step": 1760 }, { "epoch": 1.166776532630191, "grad_norm": 0.4636197090148926, "learning_rate": 0.00016496912374634398, "loss": 0.047, "step": 1770 }, { "epoch": 1.1733684904416612, "grad_norm": 0.3863416612148285, "learning_rate": 0.0001645259259295615, "loss": 0.0411, "step": 1780 }, { "epoch": 1.1799604482531312, "grad_norm": 0.4277307093143463, "learning_rate": 0.00016408054576677906, "loss": 0.0436, "step": 1790 }, { "epoch": 1.186552406064601, "grad_norm": 0.3793676793575287, "learning_rate": 0.00016363299832130026, "loss": 0.0444, "step": 1800 }, { "epoch": 1.1931443638760713, "grad_norm": 0.4446391761302948, "learning_rate": 0.00016318329872972898, "loss": 0.0362, "step": 1810 }, { "epoch": 1.1997363216875412, "grad_norm": 0.44605711102485657, "learning_rate": 0.00016273146220145725, "loss": 0.032, "step": 1820 }, { "epoch": 1.2063282794990111, "grad_norm": 0.46649622917175293, "learning_rate": 0.00016227750401815092, "loss": 0.0367, "step": 1830 }, { "epoch": 1.2129202373104813, "grad_norm": 0.44259113073349, "learning_rate": 0.00016182143953323287, "loss": 0.0387, "step": 1840 }, { "epoch": 1.2195121951219512, "grad_norm": 0.519391655921936, "learning_rate": 0.0001613632841713637, "loss": 0.0384, "step": 1850 }, { "epoch": 1.2261041529334213, "grad_norm": 0.5217844843864441, "learning_rate": 0.00016090305342791989, "loss": 0.0366, "step": 1860 }, { "epoch": 1.2326961107448913, "grad_norm": 0.6881841421127319, "learning_rate": 0.00016044076286847, "loss": 0.0393, "step": 1870 }, { "epoch": 1.2392880685563612, "grad_norm": 0.39065277576446533, "learning_rate": 0.0001599764281282481, "loss": 0.0373, "step": 1880 }, { "epoch": 1.2458800263678311, "grad_norm": 0.5099205374717712, "learning_rate": 0.00015951006491162493, "loss": 0.0467, "step": 1890 }, { "epoch": 1.2524719841793013, "grad_norm": 0.6664186716079712, "learning_rate": 0.00015904168899157672, "loss": 0.0582, "step": 1900 }, { "epoch": 1.2590639419907712, "grad_norm": 0.5066046714782715, "learning_rate": 0.00015857131620915194, "loss": 0.0369, "step": 1910 }, { "epoch": 1.2656558998022414, "grad_norm": 0.3705768585205078, "learning_rate": 0.0001580989624729353, "loss": 0.0361, "step": 1920 }, { "epoch": 1.2722478576137113, "grad_norm": 0.5163792371749878, "learning_rate": 0.00015762464375850983, "loss": 0.0411, "step": 1930 }, { "epoch": 1.2788398154251812, "grad_norm": 0.4548538029193878, "learning_rate": 0.0001571483761079165, "loss": 0.0365, "step": 1940 }, { "epoch": 1.2854317732366514, "grad_norm": 0.3885802626609802, "learning_rate": 0.00015667017562911176, "loss": 0.0411, "step": 1950 }, { "epoch": 1.2920237310481213, "grad_norm": 0.4474758803844452, "learning_rate": 0.00015619005849542262, "loss": 0.0427, "step": 1960 }, { "epoch": 1.2986156888595912, "grad_norm": 0.33950087428092957, "learning_rate": 0.00015570804094499976, "loss": 0.0421, "step": 1970 }, { "epoch": 1.3052076466710614, "grad_norm": 0.33460333943367004, "learning_rate": 0.0001552241392802682, "loss": 0.0411, "step": 1980 }, { "epoch": 1.3117996044825313, "grad_norm": 0.5268707871437073, "learning_rate": 0.00015473836986737611, "loss": 0.0315, "step": 1990 }, { "epoch": 1.3183915622940012, "grad_norm": 0.42631861567497253, "learning_rate": 0.00015425074913564117, "loss": 0.0409, "step": 2000 }, { "epoch": 1.3249835201054714, "grad_norm": 0.3418896496295929, "learning_rate": 0.00015376129357699494, "loss": 0.0402, "step": 2010 }, { "epoch": 1.3315754779169413, "grad_norm": 0.43932780623435974, "learning_rate": 0.000153270019745425, "loss": 0.0408, "step": 2020 }, { "epoch": 1.3381674357284115, "grad_norm": 0.653154730796814, "learning_rate": 0.00015277694425641528, "loss": 0.0395, "step": 2030 }, { "epoch": 1.3447593935398814, "grad_norm": 0.49582821130752563, "learning_rate": 0.0001522820837863839, "loss": 0.0356, "step": 2040 }, { "epoch": 1.3513513513513513, "grad_norm": 0.24573160707950592, "learning_rate": 0.0001517854550721192, "loss": 0.0347, "step": 2050 }, { "epoch": 1.3579433091628212, "grad_norm": 0.2804608643054962, "learning_rate": 0.0001512870749102138, "loss": 0.04, "step": 2060 }, { "epoch": 1.3645352669742914, "grad_norm": 0.3183498680591583, "learning_rate": 0.00015078696015649639, "loss": 0.0388, "step": 2070 }, { "epoch": 1.3711272247857613, "grad_norm": 0.4484601318836212, "learning_rate": 0.0001502851277254616, "loss": 0.0376, "step": 2080 }, { "epoch": 1.3777191825972315, "grad_norm": 0.3878334164619446, "learning_rate": 0.00014978159458969815, "loss": 0.0349, "step": 2090 }, { "epoch": 1.3843111404087014, "grad_norm": 0.5986606478691101, "learning_rate": 0.0001492763777793146, "loss": 0.0333, "step": 2100 }, { "epoch": 1.3909030982201713, "grad_norm": 0.509537935256958, "learning_rate": 0.00014876949438136347, "loss": 0.0344, "step": 2110 }, { "epoch": 1.3974950560316415, "grad_norm": 0.5156903862953186, "learning_rate": 0.00014826096153926332, "loss": 0.0511, "step": 2120 }, { "epoch": 1.4040870138431114, "grad_norm": 0.5031445622444153, "learning_rate": 0.0001477507964522189, "loss": 0.0427, "step": 2130 }, { "epoch": 1.4106789716545813, "grad_norm": 0.603550910949707, "learning_rate": 0.00014723901637463956, "loss": 0.0379, "step": 2140 }, { "epoch": 1.4172709294660515, "grad_norm": 0.38619402050971985, "learning_rate": 0.0001467256386155556, "loss": 0.0365, "step": 2150 }, { "epoch": 1.4238628872775214, "grad_norm": 0.4471161365509033, "learning_rate": 0.00014621068053803272, "loss": 0.037, "step": 2160 }, { "epoch": 1.4304548450889913, "grad_norm": 0.5192623734474182, "learning_rate": 0.00014569415955858518, "loss": 0.0374, "step": 2170 }, { "epoch": 1.4370468029004615, "grad_norm": 0.3612404465675354, "learning_rate": 0.00014517609314658632, "loss": 0.0334, "step": 2180 }, { "epoch": 1.4436387607119314, "grad_norm": 0.5519164800643921, "learning_rate": 0.00014465649882367793, "loss": 0.036, "step": 2190 }, { "epoch": 1.4502307185234016, "grad_norm": 0.3743966221809387, "learning_rate": 0.00014413539416317767, "loss": 0.0316, "step": 2200 }, { "epoch": 1.4568226763348715, "grad_norm": 0.44635990262031555, "learning_rate": 0.00014361279678948467, "loss": 0.0419, "step": 2210 }, { "epoch": 1.4634146341463414, "grad_norm": 0.42854979634284973, "learning_rate": 0.0001430887243774834, "loss": 0.0332, "step": 2220 }, { "epoch": 1.4700065919578114, "grad_norm": 0.349597692489624, "learning_rate": 0.000142563194651946, "loss": 0.0379, "step": 2230 }, { "epoch": 1.4765985497692815, "grad_norm": 0.7277832627296448, "learning_rate": 0.00014203622538693262, "loss": 0.0357, "step": 2240 }, { "epoch": 1.4831905075807514, "grad_norm": 0.664358377456665, "learning_rate": 0.0001415078344051906, "loss": 0.0392, "step": 2250 }, { "epoch": 1.4897824653922216, "grad_norm": 0.7072487473487854, "learning_rate": 0.00014097803957755132, "loss": 0.0459, "step": 2260 }, { "epoch": 1.4963744232036915, "grad_norm": 0.426103413105011, "learning_rate": 0.000140446858822326, "loss": 0.0413, "step": 2270 }, { "epoch": 1.5029663810151614, "grad_norm": 0.9970699548721313, "learning_rate": 0.00013991431010469967, "loss": 0.0409, "step": 2280 }, { "epoch": 1.5095583388266314, "grad_norm": 0.4669429063796997, "learning_rate": 0.0001393804114361235, "loss": 0.0334, "step": 2290 }, { "epoch": 1.5161502966381015, "grad_norm": 0.8753983974456787, "learning_rate": 0.0001388451808737056, "loss": 0.0425, "step": 2300 }, { "epoch": 1.5227422544495717, "grad_norm": 0.3841276466846466, "learning_rate": 0.00013830863651960042, "loss": 0.0352, "step": 2310 }, { "epoch": 1.5293342122610416, "grad_norm": 0.6196696758270264, "learning_rate": 0.0001377707965203965, "loss": 0.0368, "step": 2320 }, { "epoch": 1.5359261700725115, "grad_norm": 0.3803444802761078, "learning_rate": 0.00013723167906650263, "loss": 0.0381, "step": 2330 }, { "epoch": 1.5425181278839815, "grad_norm": 0.4198368191719055, "learning_rate": 0.00013669130239153264, "loss": 0.0414, "step": 2340 }, { "epoch": 1.5491100856954514, "grad_norm": 0.5054610967636108, "learning_rate": 0.0001361496847716889, "loss": 0.0345, "step": 2350 }, { "epoch": 1.5557020435069215, "grad_norm": 0.46151942014694214, "learning_rate": 0.00013560684452514394, "loss": 0.0325, "step": 2360 }, { "epoch": 1.5622940013183917, "grad_norm": 0.5142484903335571, "learning_rate": 0.00013506280001142104, "loss": 0.034, "step": 2370 }, { "epoch": 1.5688859591298616, "grad_norm": 0.46030113101005554, "learning_rate": 0.0001345175696307733, "loss": 0.0271, "step": 2380 }, { "epoch": 1.5754779169413315, "grad_norm": 0.7596203088760376, "learning_rate": 0.00013397117182356136, "loss": 0.0389, "step": 2390 }, { "epoch": 1.5820698747528015, "grad_norm": 0.5491079688072205, "learning_rate": 0.00013342362506962952, "loss": 0.0348, "step": 2400 }, { "epoch": 1.5886618325642716, "grad_norm": 0.7162690162658691, "learning_rate": 0.00013287494788768095, "loss": 0.0377, "step": 2410 }, { "epoch": 1.5952537903757416, "grad_norm": 1.0456359386444092, "learning_rate": 0.00013232515883465125, "loss": 0.0364, "step": 2420 }, { "epoch": 1.6018457481872117, "grad_norm": 0.5690692067146301, "learning_rate": 0.00013177427650508093, "loss": 0.0338, "step": 2430 }, { "epoch": 1.6084377059986816, "grad_norm": 0.46750396490097046, "learning_rate": 0.0001312223195304864, "loss": 0.0357, "step": 2440 }, { "epoch": 1.6150296638101516, "grad_norm": 0.3841594159603119, "learning_rate": 0.00013066930657872986, "loss": 0.033, "step": 2450 }, { "epoch": 1.6216216216216215, "grad_norm": 0.46823808550834656, "learning_rate": 0.00013011525635338796, "loss": 0.0306, "step": 2460 }, { "epoch": 1.6282135794330916, "grad_norm": 0.5914480686187744, "learning_rate": 0.00012956018759311934, "loss": 0.0322, "step": 2470 }, { "epoch": 1.6348055372445618, "grad_norm": 0.2972632944583893, "learning_rate": 0.0001290041190710306, "loss": 0.0436, "step": 2480 }, { "epoch": 1.6413974950560317, "grad_norm": 0.3952536880970001, "learning_rate": 0.0001284470695940415, "loss": 0.0386, "step": 2490 }, { "epoch": 1.6479894528675016, "grad_norm": 0.42823290824890137, "learning_rate": 0.00012788905800224904, "loss": 0.0396, "step": 2500 }, { "epoch": 1.6545814106789716, "grad_norm": 0.39351433515548706, "learning_rate": 0.00012733010316828998, "loss": 0.0307, "step": 2510 }, { "epoch": 1.6611733684904415, "grad_norm": 0.23917658627033234, "learning_rate": 0.00012677022399670279, "loss": 0.0248, "step": 2520 }, { "epoch": 1.6677653263019117, "grad_norm": 0.3799072802066803, "learning_rate": 0.00012620943942328802, "loss": 0.0313, "step": 2530 }, { "epoch": 1.6743572841133818, "grad_norm": 0.36729612946510315, "learning_rate": 0.00012564776841446826, "loss": 0.0392, "step": 2540 }, { "epoch": 1.6809492419248517, "grad_norm": 0.45756933093070984, "learning_rate": 0.00012508522996664622, "loss": 0.0333, "step": 2550 }, { "epoch": 1.6875411997363217, "grad_norm": 0.6483779549598694, "learning_rate": 0.00012452184310556257, "loss": 0.0398, "step": 2560 }, { "epoch": 1.6941331575477916, "grad_norm": 0.3594001829624176, "learning_rate": 0.0001239576268856523, "loss": 0.0418, "step": 2570 }, { "epoch": 1.7007251153592617, "grad_norm": 0.7351908683776855, "learning_rate": 0.00012339260038940044, "loss": 0.0401, "step": 2580 }, { "epoch": 1.7073170731707317, "grad_norm": 0.6257510781288147, "learning_rate": 0.0001228267827266964, "loss": 0.0323, "step": 2590 }, { "epoch": 1.7139090309822018, "grad_norm": 0.2904309034347534, "learning_rate": 0.00012226019303418793, "loss": 0.0353, "step": 2600 }, { "epoch": 1.7205009887936717, "grad_norm": 0.403761625289917, "learning_rate": 0.00012169285047463372, "loss": 0.0303, "step": 2610 }, { "epoch": 1.7270929466051417, "grad_norm": 0.5491513013839722, "learning_rate": 0.00012112477423625539, "loss": 0.03, "step": 2620 }, { "epoch": 1.7336849044166116, "grad_norm": 0.6333141922950745, "learning_rate": 0.00012055598353208845, "loss": 0.0323, "step": 2630 }, { "epoch": 1.7402768622280818, "grad_norm": 0.5011347532272339, "learning_rate": 0.00011998649759933253, "loss": 0.0306, "step": 2640 }, { "epoch": 1.746868820039552, "grad_norm": 0.48753035068511963, "learning_rate": 0.00011941633569870073, "loss": 0.0251, "step": 2650 }, { "epoch": 1.7534607778510218, "grad_norm": 0.3967491388320923, "learning_rate": 0.0001188455171137682, "loss": 0.0285, "step": 2660 }, { "epoch": 1.7600527356624918, "grad_norm": 0.5975573062896729, "learning_rate": 0.00011827406115032005, "loss": 0.0355, "step": 2670 }, { "epoch": 1.7666446934739617, "grad_norm": 0.3887605369091034, "learning_rate": 0.00011770198713569814, "loss": 0.0293, "step": 2680 }, { "epoch": 1.7732366512854316, "grad_norm": 0.2963058650493622, "learning_rate": 0.00011712931441814776, "loss": 0.0229, "step": 2690 }, { "epoch": 1.7798286090969018, "grad_norm": 0.2195950299501419, "learning_rate": 0.00011655606236616302, "loss": 0.0285, "step": 2700 }, { "epoch": 1.786420566908372, "grad_norm": 0.3120424151420593, "learning_rate": 0.00011598225036783168, "loss": 0.0261, "step": 2710 }, { "epoch": 1.7930125247198418, "grad_norm": 0.23726621270179749, "learning_rate": 0.0001154078978301798, "loss": 0.03, "step": 2720 }, { "epoch": 1.7996044825313118, "grad_norm": 0.3860594928264618, "learning_rate": 0.00011483302417851503, "loss": 0.0279, "step": 2730 }, { "epoch": 1.8061964403427817, "grad_norm": 0.3965723514556885, "learning_rate": 0.00011425764885576968, "loss": 0.0291, "step": 2740 }, { "epoch": 1.8127883981542519, "grad_norm": 0.415278822183609, "learning_rate": 0.00011368179132184331, "loss": 0.0313, "step": 2750 }, { "epoch": 1.8193803559657218, "grad_norm": 0.43956270813941956, "learning_rate": 0.0001131054710529444, "loss": 0.0334, "step": 2760 }, { "epoch": 1.825972313777192, "grad_norm": 0.509265124797821, "learning_rate": 0.00011252870754093168, "loss": 0.0285, "step": 2770 }, { "epoch": 1.8325642715886619, "grad_norm": 0.373096227645874, "learning_rate": 0.00011195152029265492, "loss": 0.0323, "step": 2780 }, { "epoch": 1.8391562294001318, "grad_norm": 0.5025624632835388, "learning_rate": 0.00011137392882929515, "loss": 0.0266, "step": 2790 }, { "epoch": 1.8457481872116017, "grad_norm": 0.6312963962554932, "learning_rate": 0.00011079595268570451, "loss": 0.033, "step": 2800 }, { "epoch": 1.8523401450230719, "grad_norm": 0.525477409362793, "learning_rate": 0.00011021761140974547, "loss": 0.0323, "step": 2810 }, { "epoch": 1.858932102834542, "grad_norm": 0.5473213791847229, "learning_rate": 0.00010963892456162966, "loss": 0.0297, "step": 2820 }, { "epoch": 1.865524060646012, "grad_norm": 0.3462737202644348, "learning_rate": 0.00010905991171325645, "loss": 0.035, "step": 2830 }, { "epoch": 1.8721160184574819, "grad_norm": 0.41637200117111206, "learning_rate": 0.00010848059244755093, "loss": 0.0342, "step": 2840 }, { "epoch": 1.8787079762689518, "grad_norm": 0.4165598154067993, "learning_rate": 0.00010790098635780164, "loss": 0.0282, "step": 2850 }, { "epoch": 1.8852999340804217, "grad_norm": 0.5277098417282104, "learning_rate": 0.00010732111304699772, "loss": 0.0324, "step": 2860 }, { "epoch": 1.8918918918918919, "grad_norm": 0.6012358069419861, "learning_rate": 0.00010674099212716627, "loss": 0.0326, "step": 2870 }, { "epoch": 1.898483849703362, "grad_norm": 0.4933426082134247, "learning_rate": 0.00010616064321870862, "loss": 0.0261, "step": 2880 }, { "epoch": 1.905075807514832, "grad_norm": 0.5412283539772034, "learning_rate": 0.00010558008594973709, "loss": 0.0394, "step": 2890 }, { "epoch": 1.911667765326302, "grad_norm": 0.33404308557510376, "learning_rate": 0.00010499933995541096, "loss": 0.0324, "step": 2900 }, { "epoch": 1.9182597231377718, "grad_norm": 0.33016228675842285, "learning_rate": 0.00010441842487727247, "loss": 0.0291, "step": 2910 }, { "epoch": 1.924851680949242, "grad_norm": 0.3352147936820984, "learning_rate": 0.0001038373603625824, "loss": 0.0284, "step": 2920 }, { "epoch": 1.931443638760712, "grad_norm": 0.6947084665298462, "learning_rate": 0.00010325616606365571, "loss": 0.0338, "step": 2930 }, { "epoch": 1.938035596572182, "grad_norm": 0.5351916551589966, "learning_rate": 0.00010267486163719684, "loss": 0.037, "step": 2940 }, { "epoch": 1.944627554383652, "grad_norm": 0.3044882118701935, "learning_rate": 0.00010209346674363487, "loss": 0.0434, "step": 2950 }, { "epoch": 1.951219512195122, "grad_norm": 0.3456181287765503, "learning_rate": 0.00010151200104645855, "loss": 0.0313, "step": 2960 }, { "epoch": 1.9578114700065918, "grad_norm": 0.5266302824020386, "learning_rate": 0.00010093048421155136, "loss": 0.0315, "step": 2970 }, { "epoch": 1.964403427818062, "grad_norm": 0.43212220072746277, "learning_rate": 0.00010034893590652625, "loss": 0.0338, "step": 2980 }, { "epoch": 1.9709953856295321, "grad_norm": 0.48578178882598877, "learning_rate": 9.976737580006057e-05, "loss": 0.0351, "step": 2990 }, { "epoch": 1.977587343441002, "grad_norm": 0.38649991154670715, "learning_rate": 9.91858235612308e-05, "loss": 0.0376, "step": 3000 }, { "epoch": 1.984179301252472, "grad_norm": 0.392405241727829, "learning_rate": 9.860429885884736e-05, "loss": 0.0303, "step": 3010 }, { "epoch": 1.990771259063942, "grad_norm": 0.4340122640132904, "learning_rate": 9.802282136078927e-05, "loss": 0.031, "step": 3020 }, { "epoch": 1.9973632168754119, "grad_norm": 0.41526365280151367, "learning_rate": 9.74414107333391e-05, "loss": 0.036, "step": 3030 }, { "epoch": 2.0039551746868822, "grad_norm": 0.497365266084671, "learning_rate": 9.686008664051784e-05, "loss": 0.0329, "step": 3040 }, { "epoch": 2.010547132498352, "grad_norm": 0.29831039905548096, "learning_rate": 9.627886874341958e-05, "loss": 0.0387, "step": 3050 }, { "epoch": 2.017139090309822, "grad_norm": 0.5178902745246887, "learning_rate": 9.569777669954694e-05, "loss": 0.0333, "step": 3060 }, { "epoch": 2.023731048121292, "grad_norm": 0.42481425404548645, "learning_rate": 9.511683016214598e-05, "loss": 0.0285, "step": 3070 }, { "epoch": 2.030323005932762, "grad_norm": 0.38960328698158264, "learning_rate": 9.45360487795414e-05, "loss": 0.0304, "step": 3080 }, { "epoch": 2.036914963744232, "grad_norm": 0.2346969097852707, "learning_rate": 9.395545219447244e-05, "loss": 0.039, "step": 3090 }, { "epoch": 2.0435069215557022, "grad_norm": 0.3628898561000824, "learning_rate": 9.337506004342796e-05, "loss": 0.032, "step": 3100 }, { "epoch": 2.050098879367172, "grad_norm": 0.32154813408851624, "learning_rate": 9.279489195598281e-05, "loss": 0.0287, "step": 3110 }, { "epoch": 2.056690837178642, "grad_norm": 0.5084890723228455, "learning_rate": 9.221496755413375e-05, "loss": 0.0284, "step": 3120 }, { "epoch": 2.063282794990112, "grad_norm": 0.7246992588043213, "learning_rate": 9.163530645163562e-05, "loss": 0.0289, "step": 3130 }, { "epoch": 2.069874752801582, "grad_norm": 0.3673562705516815, "learning_rate": 9.105592825333829e-05, "loss": 0.025, "step": 3140 }, { "epoch": 2.076466710613052, "grad_norm": 0.25608980655670166, "learning_rate": 9.047685255452347e-05, "loss": 0.0302, "step": 3150 }, { "epoch": 2.0830586684245223, "grad_norm": 0.4780118763446808, "learning_rate": 8.989809894024186e-05, "loss": 0.0276, "step": 3160 }, { "epoch": 2.089650626235992, "grad_norm": 0.2582448124885559, "learning_rate": 8.931968698465096e-05, "loss": 0.0316, "step": 3170 }, { "epoch": 2.096242584047462, "grad_norm": 0.4123593270778656, "learning_rate": 8.874163625035295e-05, "loss": 0.0296, "step": 3180 }, { "epoch": 2.102834541858932, "grad_norm": 0.3350667357444763, "learning_rate": 8.8163966287733e-05, "loss": 0.024, "step": 3190 }, { "epoch": 2.109426499670402, "grad_norm": 0.518301784992218, "learning_rate": 8.758669663429818e-05, "loss": 0.0346, "step": 3200 }, { "epoch": 2.1160184574818723, "grad_norm": 0.25243598222732544, "learning_rate": 8.700984681401657e-05, "loss": 0.0266, "step": 3210 }, { "epoch": 2.1226104152933423, "grad_norm": 0.40096205472946167, "learning_rate": 8.643343633665697e-05, "loss": 0.0271, "step": 3220 }, { "epoch": 2.129202373104812, "grad_norm": 0.4391838014125824, "learning_rate": 8.585748469712913e-05, "loss": 0.0251, "step": 3230 }, { "epoch": 2.135794330916282, "grad_norm": 0.4758305251598358, "learning_rate": 8.528201137482426e-05, "loss": 0.0217, "step": 3240 }, { "epoch": 2.142386288727752, "grad_norm": 0.31953004002571106, "learning_rate": 8.470703583295634e-05, "loss": 0.0312, "step": 3250 }, { "epoch": 2.148978246539222, "grad_norm": 0.5065709352493286, "learning_rate": 8.413257751790382e-05, "loss": 0.0274, "step": 3260 }, { "epoch": 2.1555702043506924, "grad_norm": 0.44011905789375305, "learning_rate": 8.355865585855185e-05, "loss": 0.0284, "step": 3270 }, { "epoch": 2.1621621621621623, "grad_norm": 0.5650854706764221, "learning_rate": 8.298529026563525e-05, "loss": 0.0305, "step": 3280 }, { "epoch": 2.168754119973632, "grad_norm": 0.5405374765396118, "learning_rate": 8.241250013108204e-05, "loss": 0.0273, "step": 3290 }, { "epoch": 2.175346077785102, "grad_norm": 0.4947541654109955, "learning_rate": 8.184030482735744e-05, "loss": 0.0328, "step": 3300 }, { "epoch": 2.181938035596572, "grad_norm": 0.4064697027206421, "learning_rate": 8.126872370680889e-05, "loss": 0.0235, "step": 3310 }, { "epoch": 2.188529993408042, "grad_norm": 0.19350074231624603, "learning_rate": 8.069777610101117e-05, "loss": 0.0215, "step": 3320 }, { "epoch": 2.1951219512195124, "grad_norm": 0.22878113389015198, "learning_rate": 8.012748132011308e-05, "loss": 0.0226, "step": 3330 }, { "epoch": 2.2017139090309823, "grad_norm": 0.3672931492328644, "learning_rate": 7.955785865218399e-05, "loss": 0.027, "step": 3340 }, { "epoch": 2.2083058668424522, "grad_norm": 0.20331457257270813, "learning_rate": 7.898892736256147e-05, "loss": 0.0293, "step": 3350 }, { "epoch": 2.214897824653922, "grad_norm": 0.4426509439945221, "learning_rate": 7.842070669319994e-05, "loss": 0.0281, "step": 3360 }, { "epoch": 2.221489782465392, "grad_norm": 0.39120975136756897, "learning_rate": 7.785321586201983e-05, "loss": 0.0271, "step": 3370 }, { "epoch": 2.2280817402768625, "grad_norm": 0.3522750437259674, "learning_rate": 7.728647406225736e-05, "loss": 0.0257, "step": 3380 }, { "epoch": 2.2346736980883324, "grad_norm": 0.382185697555542, "learning_rate": 7.672050046181576e-05, "loss": 0.0227, "step": 3390 }, { "epoch": 2.2412656558998023, "grad_norm": 0.5848444104194641, "learning_rate": 7.615531420261669e-05, "loss": 0.0274, "step": 3400 }, { "epoch": 2.2478576137112722, "grad_norm": 0.39687180519104004, "learning_rate": 7.559093439995306e-05, "loss": 0.0306, "step": 3410 }, { "epoch": 2.254449571522742, "grad_norm": 0.4460280239582062, "learning_rate": 7.502738014184243e-05, "loss": 0.0319, "step": 3420 }, { "epoch": 2.261041529334212, "grad_norm": 0.23736679553985596, "learning_rate": 7.446467048838131e-05, "loss": 0.0223, "step": 3430 }, { "epoch": 2.267633487145682, "grad_norm": 0.4469570219516754, "learning_rate": 7.390282447110078e-05, "loss": 0.0367, "step": 3440 }, { "epoch": 2.2742254449571524, "grad_norm": 0.5411940813064575, "learning_rate": 7.334186109232264e-05, "loss": 0.0239, "step": 3450 }, { "epoch": 2.2808174027686223, "grad_norm": 0.5765545964241028, "learning_rate": 7.278179932451673e-05, "loss": 0.0253, "step": 3460 }, { "epoch": 2.2874093605800923, "grad_norm": 0.541907548904419, "learning_rate": 7.222265810965935e-05, "loss": 0.0331, "step": 3470 }, { "epoch": 2.294001318391562, "grad_norm": 0.2490926831960678, "learning_rate": 7.166445635859257e-05, "loss": 0.0222, "step": 3480 }, { "epoch": 2.300593276203032, "grad_norm": 0.3459300100803375, "learning_rate": 7.110721295038459e-05, "loss": 0.031, "step": 3490 }, { "epoch": 2.3071852340145025, "grad_norm": 0.2858792543411255, "learning_rate": 7.055094673169137e-05, "loss": 0.0227, "step": 3500 }, { "epoch": 2.3137771918259724, "grad_norm": 0.23397736251354218, "learning_rate": 6.9995676516119e-05, "loss": 0.0324, "step": 3510 }, { "epoch": 2.3203691496374423, "grad_norm": 0.41427797079086304, "learning_rate": 6.944142108358764e-05, "loss": 0.0274, "step": 3520 }, { "epoch": 2.3269611074489123, "grad_norm": 0.3530319631099701, "learning_rate": 6.888819917969618e-05, "loss": 0.0259, "step": 3530 }, { "epoch": 2.333553065260382, "grad_norm": 0.6433699727058411, "learning_rate": 6.833602951508827e-05, "loss": 0.0264, "step": 3540 }, { "epoch": 2.3401450230718526, "grad_norm": 0.41642090678215027, "learning_rate": 6.778493076481964e-05, "loss": 0.0319, "step": 3550 }, { "epoch": 2.3467369808833225, "grad_norm": 0.5816253423690796, "learning_rate": 6.723492156772633e-05, "loss": 0.0282, "step": 3560 }, { "epoch": 2.3533289386947924, "grad_norm": 0.4890478551387787, "learning_rate": 6.668602052579424e-05, "loss": 0.032, "step": 3570 }, { "epoch": 2.3599208965062624, "grad_norm": 0.4867573380470276, "learning_rate": 6.613824620353032e-05, "loss": 0.0284, "step": 3580 }, { "epoch": 2.3665128543177323, "grad_norm": 0.2637820541858673, "learning_rate": 6.559161712733437e-05, "loss": 0.0258, "step": 3590 }, { "epoch": 2.373104812129202, "grad_norm": 0.2814655601978302, "learning_rate": 6.504615178487246e-05, "loss": 0.0288, "step": 3600 }, { "epoch": 2.379696769940672, "grad_norm": 0.3690190315246582, "learning_rate": 6.450186862445197e-05, "loss": 0.0296, "step": 3610 }, { "epoch": 2.3862887277521425, "grad_norm": 0.47703316807746887, "learning_rate": 6.395878605439725e-05, "loss": 0.0284, "step": 3620 }, { "epoch": 2.3928806855636124, "grad_norm": 0.42962104082107544, "learning_rate": 6.341692244242727e-05, "loss": 0.0279, "step": 3630 }, { "epoch": 2.3994726433750824, "grad_norm": 0.38502004742622375, "learning_rate": 6.287629611503446e-05, "loss": 0.02, "step": 3640 }, { "epoch": 2.4060646011865523, "grad_norm": 0.39698663353919983, "learning_rate": 6.23369253568645e-05, "loss": 0.026, "step": 3650 }, { "epoch": 2.4126565589980222, "grad_norm": 0.34033870697021484, "learning_rate": 6.179882841009845e-05, "loss": 0.0226, "step": 3660 }, { "epoch": 2.4192485168094926, "grad_norm": 0.2257242500782013, "learning_rate": 6.126202347383537e-05, "loss": 0.0249, "step": 3670 }, { "epoch": 2.4258404746209625, "grad_norm": 0.44853436946868896, "learning_rate": 6.0726528703476935e-05, "loss": 0.0268, "step": 3680 }, { "epoch": 2.4324324324324325, "grad_norm": 0.3105967044830322, "learning_rate": 6.0192362210113454e-05, "loss": 0.0268, "step": 3690 }, { "epoch": 2.4390243902439024, "grad_norm": 0.19689629971981049, "learning_rate": 5.9659542059911266e-05, "loss": 0.0241, "step": 3700 }, { "epoch": 2.4456163480553723, "grad_norm": 0.3638206422328949, "learning_rate": 5.912808627350166e-05, "loss": 0.0286, "step": 3710 }, { "epoch": 2.4522083058668427, "grad_norm": 0.24674852192401886, "learning_rate": 5.8598012825371594e-05, "loss": 0.0279, "step": 3720 }, { "epoch": 2.4588002636783126, "grad_norm": 0.46596574783325195, "learning_rate": 5.8069339643255526e-05, "loss": 0.0277, "step": 3730 }, { "epoch": 2.4653922214897825, "grad_norm": 0.3376254737377167, "learning_rate": 5.7542084607529254e-05, "loss": 0.026, "step": 3740 }, { "epoch": 2.4719841793012525, "grad_norm": 0.3310098946094513, "learning_rate": 5.701626555060523e-05, "loss": 0.0237, "step": 3750 }, { "epoch": 2.4785761371127224, "grad_norm": 0.276920348405838, "learning_rate": 5.6491900256329063e-05, "loss": 0.0216, "step": 3760 }, { "epoch": 2.4851680949241923, "grad_norm": 0.26669642329216003, "learning_rate": 5.596900645937859e-05, "loss": 0.0218, "step": 3770 }, { "epoch": 2.4917600527356623, "grad_norm": 0.34589439630508423, "learning_rate": 5.5447601844663786e-05, "loss": 0.0231, "step": 3780 }, { "epoch": 2.4983520105471326, "grad_norm": 0.36905866861343384, "learning_rate": 5.4927704046728426e-05, "loss": 0.0243, "step": 3790 }, { "epoch": 2.5049439683586026, "grad_norm": 0.36546382308006287, "learning_rate": 5.440933064915414e-05, "loss": 0.0285, "step": 3800 }, { "epoch": 2.5115359261700725, "grad_norm": 0.6371796727180481, "learning_rate": 5.389249918396535e-05, "loss": 0.0293, "step": 3810 }, { "epoch": 2.5181278839815424, "grad_norm": 0.3491004705429077, "learning_rate": 5.3377227131036426e-05, "loss": 0.0262, "step": 3820 }, { "epoch": 2.5247198417930123, "grad_norm": 0.5807764530181885, "learning_rate": 5.28635319175006e-05, "loss": 0.0241, "step": 3830 }, { "epoch": 2.5313117996044827, "grad_norm": 0.33311110734939575, "learning_rate": 5.2351430917160327e-05, "loss": 0.0298, "step": 3840 }, { "epoch": 2.5379037574159526, "grad_norm": 0.42765146493911743, "learning_rate": 5.184094144989988e-05, "loss": 0.0215, "step": 3850 }, { "epoch": 2.5444957152274226, "grad_norm": 0.4080013036727905, "learning_rate": 5.1332080781099565e-05, "loss": 0.0267, "step": 3860 }, { "epoch": 2.5510876730388925, "grad_norm": 0.3769298195838928, "learning_rate": 5.082486612105164e-05, "loss": 0.025, "step": 3870 }, { "epoch": 2.5576796308503624, "grad_norm": 0.36813291907310486, "learning_rate": 5.031931462437829e-05, "loss": 0.0218, "step": 3880 }, { "epoch": 2.564271588661833, "grad_norm": 0.5158575177192688, "learning_rate": 4.981544338945163e-05, "loss": 0.0235, "step": 3890 }, { "epoch": 2.5708635464733027, "grad_norm": 0.26838409900665283, "learning_rate": 4.9313269457815124e-05, "loss": 0.0242, "step": 3900 }, { "epoch": 2.5774555042847727, "grad_norm": 0.3350141644477844, "learning_rate": 4.8812809813607366e-05, "loss": 0.0223, "step": 3910 }, { "epoch": 2.5840474620962426, "grad_norm": 0.43595895171165466, "learning_rate": 4.831408138298774e-05, "loss": 0.0178, "step": 3920 }, { "epoch": 2.5906394199077125, "grad_norm": 0.5088134407997131, "learning_rate": 4.7817101033563785e-05, "loss": 0.0257, "step": 3930 }, { "epoch": 2.5972313777191824, "grad_norm": 0.3459358811378479, "learning_rate": 4.732188557382078e-05, "loss": 0.0326, "step": 3940 }, { "epoch": 2.6038233355306524, "grad_norm": 0.5011377334594727, "learning_rate": 4.682845175255326e-05, "loss": 0.0263, "step": 3950 }, { "epoch": 2.6104152933421227, "grad_norm": 0.3368130624294281, "learning_rate": 4.633681625829869e-05, "loss": 0.0274, "step": 3960 }, { "epoch": 2.6170072511535927, "grad_norm": 0.3485671281814575, "learning_rate": 4.584699571877275e-05, "loss": 0.0237, "step": 3970 }, { "epoch": 2.6235992089650626, "grad_norm": 0.22069136798381805, "learning_rate": 4.535900670030715e-05, "loss": 0.0199, "step": 3980 }, { "epoch": 2.6301911667765325, "grad_norm": 0.21239358186721802, "learning_rate": 4.487286570728944e-05, "loss": 0.0198, "step": 3990 }, { "epoch": 2.6367831245880025, "grad_norm": 0.3069987893104553, "learning_rate": 4.438858918160452e-05, "loss": 0.0236, "step": 4000 }, { "epoch": 2.643375082399473, "grad_norm": 0.21374328434467316, "learning_rate": 4.390619350207882e-05, "loss": 0.0231, "step": 4010 }, { "epoch": 2.6499670402109428, "grad_norm": 0.2509225010871887, "learning_rate": 4.342569498392615e-05, "loss": 0.0192, "step": 4020 }, { "epoch": 2.6565589980224127, "grad_norm": 0.3323153257369995, "learning_rate": 4.294710987819612e-05, "loss": 0.028, "step": 4030 }, { "epoch": 2.6631509558338826, "grad_norm": 0.28733029961586, "learning_rate": 4.24704543712243e-05, "loss": 0.0261, "step": 4040 }, { "epoch": 2.6697429136453525, "grad_norm": 0.4628933370113373, "learning_rate": 4.199574458408487e-05, "loss": 0.0275, "step": 4050 }, { "epoch": 2.676334871456823, "grad_norm": 0.27761244773864746, "learning_rate": 4.1522996572045345e-05, "loss": 0.0278, "step": 4060 }, { "epoch": 2.682926829268293, "grad_norm": 0.31634223461151123, "learning_rate": 4.1052226324023724e-05, "loss": 0.0243, "step": 4070 }, { "epoch": 2.6895187870797628, "grad_norm": 0.2185506969690323, "learning_rate": 4.0583449762047464e-05, "loss": 0.0252, "step": 4080 }, { "epoch": 2.6961107448912327, "grad_norm": 0.43438541889190674, "learning_rate": 4.011668274071514e-05, "loss": 0.0213, "step": 4090 }, { "epoch": 2.7027027027027026, "grad_norm": 0.33553093671798706, "learning_rate": 3.965194104666029e-05, "loss": 0.0207, "step": 4100 }, { "epoch": 2.7092946605141726, "grad_norm": 0.21318501234054565, "learning_rate": 3.91892403980173e-05, "loss": 0.0225, "step": 4110 }, { "epoch": 2.7158866183256425, "grad_norm": 0.2645362615585327, "learning_rate": 3.872859644388989e-05, "loss": 0.0219, "step": 4120 }, { "epoch": 2.722478576137113, "grad_norm": 0.21204468607902527, "learning_rate": 3.827002476382193e-05, "loss": 0.0222, "step": 4130 }, { "epoch": 2.729070533948583, "grad_norm": 0.20982545614242554, "learning_rate": 3.781354086727038e-05, "loss": 0.0233, "step": 4140 }, { "epoch": 2.7356624917600527, "grad_norm": 0.41608384251594543, "learning_rate": 3.735916019308078e-05, "loss": 0.0288, "step": 4150 }, { "epoch": 2.7422544495715226, "grad_norm": 0.5119547843933105, "learning_rate": 3.690689810896518e-05, "loss": 0.0231, "step": 4160 }, { "epoch": 2.7488464073829926, "grad_norm": 0.3134307861328125, "learning_rate": 3.645676991098227e-05, "loss": 0.0216, "step": 4170 }, { "epoch": 2.755438365194463, "grad_norm": 0.26220589876174927, "learning_rate": 3.6008790823020043e-05, "loss": 0.022, "step": 4180 }, { "epoch": 2.762030323005933, "grad_norm": 0.41492775082588196, "learning_rate": 3.5562975996281064e-05, "loss": 0.0201, "step": 4190 }, { "epoch": 2.768622280817403, "grad_norm": 0.29042404890060425, "learning_rate": 3.511934050876986e-05, "loss": 0.0172, "step": 4200 }, { "epoch": 2.7752142386288727, "grad_norm": 0.20791085064411163, "learning_rate": 3.4677899364783e-05, "loss": 0.0232, "step": 4210 }, { "epoch": 2.7818061964403427, "grad_norm": 0.218453049659729, "learning_rate": 3.4238667494401786e-05, "loss": 0.0188, "step": 4220 }, { "epoch": 2.788398154251813, "grad_norm": 0.3837571144104004, "learning_rate": 3.380165975298697e-05, "loss": 0.0229, "step": 4230 }, { "epoch": 2.794990112063283, "grad_norm": 0.39384233951568604, "learning_rate": 3.336689092067673e-05, "loss": 0.0195, "step": 4240 }, { "epoch": 2.801582069874753, "grad_norm": 0.42460018396377563, "learning_rate": 3.2934375701886566e-05, "loss": 0.0306, "step": 4250 }, { "epoch": 2.808174027686223, "grad_norm": 0.21645621955394745, "learning_rate": 3.2504128724811835e-05, "loss": 0.0184, "step": 4260 }, { "epoch": 2.8147659854976927, "grad_norm": 0.3433361053466797, "learning_rate": 3.207616454093337e-05, "loss": 0.0177, "step": 4270 }, { "epoch": 2.8213579433091627, "grad_norm": 0.4108859598636627, "learning_rate": 3.1650497624525046e-05, "loss": 0.0246, "step": 4280 }, { "epoch": 2.8279499011206326, "grad_norm": 0.45586344599723816, "learning_rate": 3.122714237216431e-05, "loss": 0.0325, "step": 4290 }, { "epoch": 2.834541858932103, "grad_norm": 0.2443486601114273, "learning_rate": 3.080611310224539e-05, "loss": 0.0191, "step": 4300 }, { "epoch": 2.841133816743573, "grad_norm": 0.3153115510940552, "learning_rate": 3.038742405449485e-05, "loss": 0.0206, "step": 4310 }, { "epoch": 2.847725774555043, "grad_norm": 0.2398747354745865, "learning_rate": 2.997108938949006e-05, "loss": 0.0224, "step": 4320 }, { "epoch": 2.8543177323665128, "grad_norm": 0.3573615252971649, "learning_rate": 2.9557123188180358e-05, "loss": 0.0245, "step": 4330 }, { "epoch": 2.8609096901779827, "grad_norm": 0.5803711414337158, "learning_rate": 2.9145539451410675e-05, "loss": 0.0197, "step": 4340 }, { "epoch": 2.867501647989453, "grad_norm": 0.3499305248260498, "learning_rate": 2.8736352099448028e-05, "loss": 0.019, "step": 4350 }, { "epoch": 2.874093605800923, "grad_norm": 0.31976768374443054, "learning_rate": 2.8329574971510886e-05, "loss": 0.0219, "step": 4360 }, { "epoch": 2.880685563612393, "grad_norm": 0.44260144233703613, "learning_rate": 2.7925221825300852e-05, "loss": 0.0229, "step": 4370 }, { "epoch": 2.887277521423863, "grad_norm": 0.31007763743400574, "learning_rate": 2.7523306336537568e-05, "loss": 0.0174, "step": 4380 }, { "epoch": 2.8938694792353328, "grad_norm": 0.3109738826751709, "learning_rate": 2.712384209849603e-05, "loss": 0.0224, "step": 4390 }, { "epoch": 2.900461437046803, "grad_norm": 0.2678034007549286, "learning_rate": 2.672684262154709e-05, "loss": 0.0218, "step": 4400 }, { "epoch": 2.9070533948582726, "grad_norm": 0.31142377853393555, "learning_rate": 2.6332321332700172e-05, "loss": 0.0145, "step": 4410 }, { "epoch": 2.913645352669743, "grad_norm": 0.19287629425525665, "learning_rate": 2.59402915751494e-05, "loss": 0.0174, "step": 4420 }, { "epoch": 2.920237310481213, "grad_norm": 0.5407594442367554, "learning_rate": 2.5550766607822342e-05, "loss": 0.0201, "step": 4430 }, { "epoch": 2.926829268292683, "grad_norm": 0.2303568720817566, "learning_rate": 2.516375960493136e-05, "loss": 0.0194, "step": 4440 }, { "epoch": 2.933421226104153, "grad_norm": 0.2476644515991211, "learning_rate": 2.4779283655528195e-05, "loss": 0.0206, "step": 4450 }, { "epoch": 2.9400131839156227, "grad_norm": 0.2864633798599243, "learning_rate": 2.439735176306135e-05, "loss": 0.0217, "step": 4460 }, { "epoch": 2.946605141727093, "grad_norm": 0.5562433004379272, "learning_rate": 2.4017976844936084e-05, "loss": 0.0226, "step": 4470 }, { "epoch": 2.953197099538563, "grad_norm": 0.3683149516582489, "learning_rate": 2.364117173207766e-05, "loss": 0.021, "step": 4480 }, { "epoch": 2.959789057350033, "grad_norm": 0.5093694925308228, "learning_rate": 2.326694916849751e-05, "loss": 0.0194, "step": 4490 }, { "epoch": 2.966381015161503, "grad_norm": 0.16352635622024536, "learning_rate": 2.2895321810861837e-05, "loss": 0.0255, "step": 4500 }, { "epoch": 2.972972972972973, "grad_norm": 0.32750794291496277, "learning_rate": 2.252630222806399e-05, "loss": 0.0162, "step": 4510 }, { "epoch": 2.979564930784443, "grad_norm": 0.9548207521438599, "learning_rate": 2.215990290079918e-05, "loss": 0.0262, "step": 4520 }, { "epoch": 2.986156888595913, "grad_norm": 0.33009737730026245, "learning_rate": 2.1796136221142184e-05, "loss": 0.0201, "step": 4530 }, { "epoch": 2.992748846407383, "grad_norm": 0.30861908197402954, "learning_rate": 2.1435014492128547e-05, "loss": 0.0186, "step": 4540 }, { "epoch": 2.999340804218853, "grad_norm": 0.37097203731536865, "learning_rate": 2.1076549927338397e-05, "loss": 0.0233, "step": 4550 }, { "epoch": 3.005932762030323, "grad_norm": 0.29048582911491394, "learning_rate": 2.0720754650483076e-05, "loss": 0.0229, "step": 4560 }, { "epoch": 3.012524719841793, "grad_norm": 0.3153735399246216, "learning_rate": 2.036764069499555e-05, "loss": 0.0229, "step": 4570 }, { "epoch": 3.019116677653263, "grad_norm": 0.3723648488521576, "learning_rate": 2.001722000362317e-05, "loss": 0.025, "step": 4580 }, { "epoch": 3.025708635464733, "grad_norm": 0.3493257164955139, "learning_rate": 1.9669504428023644e-05, "loss": 0.0161, "step": 4590 }, { "epoch": 3.032300593276203, "grad_norm": 0.4804138243198395, "learning_rate": 1.9324505728364527e-05, "loss": 0.0199, "step": 4600 }, { "epoch": 3.038892551087673, "grad_norm": 0.40249496698379517, "learning_rate": 1.898223557292519e-05, "loss": 0.0239, "step": 4610 }, { "epoch": 3.045484508899143, "grad_norm": 0.4595593810081482, "learning_rate": 1.86427055377023e-05, "loss": 0.0255, "step": 4620 }, { "epoch": 3.052076466710613, "grad_norm": 0.22908352315425873, "learning_rate": 1.830592710601835e-05, "loss": 0.0165, "step": 4630 }, { "epoch": 3.058668424522083, "grad_norm": 0.2505341172218323, "learning_rate": 1.797191166813319e-05, "loss": 0.0186, "step": 4640 }, { "epoch": 3.065260382333553, "grad_norm": 0.20365647971630096, "learning_rate": 1.764067052085877e-05, "loss": 0.021, "step": 4650 }, { "epoch": 3.071852340145023, "grad_norm": 0.2220371514558792, "learning_rate": 1.7312214867177246e-05, "loss": 0.0252, "step": 4660 }, { "epoch": 3.078444297956493, "grad_norm": 0.2889988124370575, "learning_rate": 1.6986555815861882e-05, "loss": 0.0318, "step": 4670 }, { "epoch": 3.085036255767963, "grad_norm": 0.4432000517845154, "learning_rate": 1.666370438110141e-05, "loss": 0.0163, "step": 4680 }, { "epoch": 3.0916282135794333, "grad_norm": 0.44843411445617676, "learning_rate": 1.6343671482127597e-05, "loss": 0.0265, "step": 4690 }, { "epoch": 3.098220171390903, "grad_norm": 0.2467808723449707, "learning_rate": 1.6026467942845804e-05, "loss": 0.019, "step": 4700 }, { "epoch": 3.104812129202373, "grad_norm": 0.26013877987861633, "learning_rate": 1.5712104491468983e-05, "loss": 0.024, "step": 4710 }, { "epoch": 3.111404087013843, "grad_norm": 0.32995596528053284, "learning_rate": 1.5400591760154826e-05, "loss": 0.0202, "step": 4720 }, { "epoch": 3.117996044825313, "grad_norm": 0.3786914050579071, "learning_rate": 1.5091940284646245e-05, "loss": 0.0183, "step": 4730 }, { "epoch": 3.124588002636783, "grad_norm": 0.46880772709846497, "learning_rate": 1.478616050391488e-05, "loss": 0.0222, "step": 4740 }, { "epoch": 3.1311799604482533, "grad_norm": 0.24594926834106445, "learning_rate": 1.4483262759808136e-05, "loss": 0.0216, "step": 4750 }, { "epoch": 3.1377719182597232, "grad_norm": 0.40315452218055725, "learning_rate": 1.4183257296699493e-05, "loss": 0.0171, "step": 4760 }, { "epoch": 3.144363876071193, "grad_norm": 0.4592204988002777, "learning_rate": 1.3886154261141826e-05, "loss": 0.0186, "step": 4770 }, { "epoch": 3.150955833882663, "grad_norm": 0.413077175617218, "learning_rate": 1.3591963701524401e-05, "loss": 0.0296, "step": 4780 }, { "epoch": 3.157547791694133, "grad_norm": 0.2753660976886749, "learning_rate": 1.330069556773299e-05, "loss": 0.0258, "step": 4790 }, { "epoch": 3.164139749505603, "grad_norm": 0.33898794651031494, "learning_rate": 1.3012359710813304e-05, "loss": 0.0199, "step": 4800 }, { "epoch": 3.1707317073170733, "grad_norm": 0.2542150914669037, "learning_rate": 1.2726965882637853e-05, "loss": 0.0187, "step": 4810 }, { "epoch": 3.1773236651285433, "grad_norm": 0.2193000614643097, "learning_rate": 1.2444523735576197e-05, "loss": 0.0236, "step": 4820 }, { "epoch": 3.183915622940013, "grad_norm": 0.20990586280822754, "learning_rate": 1.216504282216826e-05, "loss": 0.017, "step": 4830 }, { "epoch": 3.190507580751483, "grad_norm": 0.30340689420700073, "learning_rate": 1.1888532594801583e-05, "loss": 0.0175, "step": 4840 }, { "epoch": 3.197099538562953, "grad_norm": 0.21258412301540375, "learning_rate": 1.1615002405391351e-05, "loss": 0.0182, "step": 4850 }, { "epoch": 3.2036914963744234, "grad_norm": 0.21666236221790314, "learning_rate": 1.134446150506423e-05, "loss": 0.0201, "step": 4860 }, { "epoch": 3.2102834541858933, "grad_norm": 0.44433897733688354, "learning_rate": 1.1076919043845513e-05, "loss": 0.0228, "step": 4870 }, { "epoch": 3.2168754119973633, "grad_norm": 0.35092124342918396, "learning_rate": 1.0812384070349535e-05, "loss": 0.02, "step": 4880 }, { "epoch": 3.223467369808833, "grad_norm": 0.33713802695274353, "learning_rate": 1.055086553147373e-05, "loss": 0.0193, "step": 4890 }, { "epoch": 3.230059327620303, "grad_norm": 0.45195287466049194, "learning_rate": 1.0292372272096029e-05, "loss": 0.0154, "step": 4900 }, { "epoch": 3.236651285431773, "grad_norm": 0.2565387487411499, "learning_rate": 1.0036913034775674e-05, "loss": 0.0289, "step": 4910 }, { "epoch": 3.2432432432432434, "grad_norm": 0.33470308780670166, "learning_rate": 9.784496459457549e-06, "loss": 0.0178, "step": 4920 }, { "epoch": 3.2498352010547134, "grad_norm": 0.2145528793334961, "learning_rate": 9.535131083180027e-06, "loss": 0.0225, "step": 4930 }, { "epoch": 3.2564271588661833, "grad_norm": 0.2850882411003113, "learning_rate": 9.288825339786133e-06, "loss": 0.0212, "step": 4940 }, { "epoch": 3.263019116677653, "grad_norm": 0.30832791328430176, "learning_rate": 9.045587559638358e-06, "loss": 0.0179, "step": 4950 }, { "epoch": 3.269611074489123, "grad_norm": 0.301908403635025, "learning_rate": 8.805425969336945e-06, "loss": 0.02, "step": 4960 }, { "epoch": 3.276203032300593, "grad_norm": 0.24023061990737915, "learning_rate": 8.568348691441596e-06, "loss": 0.0188, "step": 4970 }, { "epoch": 3.2827949901120634, "grad_norm": 0.22343115508556366, "learning_rate": 8.33436374419676e-06, "loss": 0.0215, "step": 4980 }, { "epoch": 3.2893869479235334, "grad_norm": 0.29182159900665283, "learning_rate": 8.10347904126053e-06, "loss": 0.0229, "step": 4990 }, { "epoch": 3.2959789057350033, "grad_norm": 0.2520536780357361, "learning_rate": 7.87570239143689e-06, "loss": 0.0219, "step": 5000 }, { "epoch": 3.3025708635464732, "grad_norm": 0.3384230136871338, "learning_rate": 7.651041498411637e-06, "loss": 0.0199, "step": 5010 }, { "epoch": 3.309162821357943, "grad_norm": 0.30371981859207153, "learning_rate": 7.429503960491901e-06, "loss": 0.0231, "step": 5020 }, { "epoch": 3.3157547791694135, "grad_norm": 0.21716776490211487, "learning_rate": 7.211097270349066e-06, "loss": 0.0158, "step": 5030 }, { "epoch": 3.3223467369808835, "grad_norm": 0.2968665659427643, "learning_rate": 6.995828814765426e-06, "loss": 0.0153, "step": 5040 }, { "epoch": 3.3289386947923534, "grad_norm": 0.2764769196510315, "learning_rate": 6.7837058743843275e-06, "loss": 0.0194, "step": 5050 }, { "epoch": 3.3355306526038233, "grad_norm": 0.3363369405269623, "learning_rate": 6.57473562346389e-06, "loss": 0.0156, "step": 5060 }, { "epoch": 3.3421226104152932, "grad_norm": 0.386843740940094, "learning_rate": 6.3689251296344845e-06, "loss": 0.0208, "step": 5070 }, { "epoch": 3.348714568226763, "grad_norm": 0.142068549990654, "learning_rate": 6.166281353659575e-06, "loss": 0.0205, "step": 5080 }, { "epoch": 3.3553065260382335, "grad_norm": 0.4907970130443573, "learning_rate": 5.966811149200324e-06, "loss": 0.0201, "step": 5090 }, { "epoch": 3.3618984838497035, "grad_norm": 0.3856694996356964, "learning_rate": 5.77052126258385e-06, "loss": 0.0259, "step": 5100 }, { "epoch": 3.3684904416611734, "grad_norm": 0.28562742471694946, "learning_rate": 5.577418332575002e-06, "loss": 0.02, "step": 5110 }, { "epoch": 3.3750823994726433, "grad_norm": 0.24548883736133575, "learning_rate": 5.387508890151827e-06, "loss": 0.0194, "step": 5120 }, { "epoch": 3.3816743572841133, "grad_norm": 0.19759061932563782, "learning_rate": 5.200799358284758e-06, "loss": 0.0148, "step": 5130 }, { "epoch": 3.388266315095583, "grad_norm": 0.41944679617881775, "learning_rate": 5.017296051719256e-06, "loss": 0.0199, "step": 5140 }, { "epoch": 3.3948582729070536, "grad_norm": 0.42225712537765503, "learning_rate": 4.837005176762355e-06, "loss": 0.0225, "step": 5150 }, { "epoch": 3.4014502307185235, "grad_norm": 0.2787589728832245, "learning_rate": 4.659932831072677e-06, "loss": 0.0189, "step": 5160 }, { "epoch": 3.4080421885299934, "grad_norm": 0.3040010333061218, "learning_rate": 4.4860850034542765e-06, "loss": 0.0197, "step": 5170 }, { "epoch": 3.4146341463414633, "grad_norm": 0.23647019267082214, "learning_rate": 4.315467573654031e-06, "loss": 0.0201, "step": 5180 }, { "epoch": 3.4212261041529333, "grad_norm": 0.3153569996356964, "learning_rate": 4.148086312162758e-06, "loss": 0.0201, "step": 5190 }, { "epoch": 3.4278180619644036, "grad_norm": 0.37144842743873596, "learning_rate": 3.983946880020173e-06, "loss": 0.0192, "step": 5200 }, { "epoch": 3.4344100197758736, "grad_norm": 0.30148690938949585, "learning_rate": 3.823054828623285e-06, "loss": 0.0196, "step": 5210 }, { "epoch": 3.4410019775873435, "grad_norm": 0.2592622935771942, "learning_rate": 3.6654155995386905e-06, "loss": 0.0145, "step": 5220 }, { "epoch": 3.4475939353988134, "grad_norm": 0.45873746275901794, "learning_rate": 3.5110345243186015e-06, "loss": 0.0233, "step": 5230 }, { "epoch": 3.4541858932102834, "grad_norm": 0.2648702561855316, "learning_rate": 3.3599168243204193e-06, "loss": 0.0205, "step": 5240 }, { "epoch": 3.4607778510217533, "grad_norm": 0.3078472316265106, "learning_rate": 3.212067610530212e-06, "loss": 0.0151, "step": 5250 }, { "epoch": 3.4673698088332237, "grad_norm": 0.237100750207901, "learning_rate": 3.067491883389839e-06, "loss": 0.0165, "step": 5260 }, { "epoch": 3.4739617666446936, "grad_norm": 0.21496574580669403, "learning_rate": 2.9261945326277794e-06, "loss": 0.0165, "step": 5270 }, { "epoch": 3.4805537244561635, "grad_norm": 0.24198366701602936, "learning_rate": 2.7881803370938597e-06, "loss": 0.0171, "step": 5280 }, { "epoch": 3.4871456822676334, "grad_norm": 0.20283302664756775, "learning_rate": 2.653453964597563e-06, "loss": 0.0148, "step": 5290 }, { "epoch": 3.4937376400791034, "grad_norm": 0.24728865921497345, "learning_rate": 2.522019971750089e-06, "loss": 0.0208, "step": 5300 }, { "epoch": 3.5003295978905733, "grad_norm": 0.18928475677967072, "learning_rate": 2.393882803810388e-06, "loss": 0.0305, "step": 5310 }, { "epoch": 3.5069215557020437, "grad_norm": 0.31071001291275024, "learning_rate": 2.2690467945347682e-06, "loss": 0.019, "step": 5320 }, { "epoch": 3.5135135135135136, "grad_norm": 0.49363845586776733, "learning_rate": 2.1475161660301944e-06, "loss": 0.0217, "step": 5330 }, { "epoch": 3.5201054713249835, "grad_norm": 0.464982807636261, "learning_rate": 2.0292950286116996e-06, "loss": 0.0167, "step": 5340 }, { "epoch": 3.5266974291364535, "grad_norm": 0.5095881819725037, "learning_rate": 1.9143873806632406e-06, "loss": 0.0221, "step": 5350 }, { "epoch": 3.5332893869479234, "grad_norm": 0.23975181579589844, "learning_rate": 1.802797108502452e-06, "loss": 0.0226, "step": 5360 }, { "epoch": 3.5398813447593938, "grad_norm": 0.30347514152526855, "learning_rate": 1.6945279862493059e-06, "loss": 0.0187, "step": 5370 }, { "epoch": 3.5464733025708637, "grad_norm": 0.29192495346069336, "learning_rate": 1.5895836756983695e-06, "loss": 0.0202, "step": 5380 }, { "epoch": 3.5530652603823336, "grad_norm": 0.2690478563308716, "learning_rate": 1.4879677261950164e-06, "loss": 0.0225, "step": 5390 }, { "epoch": 3.5596572181938035, "grad_norm": 0.2959778904914856, "learning_rate": 1.3896835745153547e-06, "loss": 0.0204, "step": 5400 }, { "epoch": 3.5662491760052735, "grad_norm": 0.31142422556877136, "learning_rate": 1.2947345447500092e-06, "loss": 0.0242, "step": 5410 }, { "epoch": 3.572841133816744, "grad_norm": 0.1820339411497116, "learning_rate": 1.2031238481916896e-06, "loss": 0.0213, "step": 5420 }, { "epoch": 3.5794330916282133, "grad_norm": 0.24110187590122223, "learning_rate": 1.1148545832265767e-06, "loss": 0.0172, "step": 5430 }, { "epoch": 3.5860250494396837, "grad_norm": 0.14942501485347748, "learning_rate": 1.029929735229529e-06, "loss": 0.0204, "step": 5440 }, { "epoch": 3.5926170072511536, "grad_norm": 0.4690724313259125, "learning_rate": 9.483521764631298e-07, "loss": 0.0219, "step": 5450 }, { "epoch": 3.5992089650626236, "grad_norm": 0.3038761019706726, "learning_rate": 8.701246659805206e-07, "loss": 0.0134, "step": 5460 }, { "epoch": 3.6058009228740935, "grad_norm": 0.2972651422023773, "learning_rate": 7.952498495321093e-07, "loss": 0.0191, "step": 5470 }, { "epoch": 3.6123928806855634, "grad_norm": 0.4311395287513733, "learning_rate": 7.237302594760741e-07, "loss": 0.0208, "step": 5480 }, { "epoch": 3.618984838497034, "grad_norm": 0.45453134179115295, "learning_rate": 6.555683146927117e-07, "loss": 0.0158, "step": 5490 }, { "epoch": 3.6255767963085037, "grad_norm": 0.31247788667678833, "learning_rate": 5.907663205026559e-07, "loss": 0.0191, "step": 5500 } ], "logging_steps": 10, "max_steps": 5687, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.928222264873016e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }