{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5226048254860625, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023424689622862497, "grad_norm": 4.449338436126709, "learning_rate": 2.0000000000000003e-06, "loss": 1.3236, "step": 10 }, { "epoch": 0.004684937924572499, "grad_norm": 3.442420721054077, "learning_rate": 4.000000000000001e-06, "loss": 1.1552, "step": 20 }, { "epoch": 0.007027406886858749, "grad_norm": 2.458024263381958, "learning_rate": 6e-06, "loss": 0.9371, "step": 30 }, { "epoch": 0.009369875849144999, "grad_norm": 2.4206013679504395, "learning_rate": 8.000000000000001e-06, "loss": 1.1333, "step": 40 }, { "epoch": 0.011712344811431248, "grad_norm": 4.484491348266602, "learning_rate": 1e-05, "loss": 0.7988, "step": 50 }, { "epoch": 0.014054813773717497, "grad_norm": 7.087528228759766, "learning_rate": 1.2e-05, "loss": 0.8714, "step": 60 }, { "epoch": 0.016397282736003747, "grad_norm": 2.9479169845581055, "learning_rate": 1.4000000000000001e-05, "loss": 0.5826, "step": 70 }, { "epoch": 0.018739751698289998, "grad_norm": 2.2344982624053955, "learning_rate": 1.6000000000000003e-05, "loss": 0.457, "step": 80 }, { "epoch": 0.02108222066057625, "grad_norm": 1.1311728954315186, "learning_rate": 1.8e-05, "loss": 0.3638, "step": 90 }, { "epoch": 0.023424689622862496, "grad_norm": 5.992610931396484, "learning_rate": 2e-05, "loss": 0.7519, "step": 100 }, { "epoch": 0.025767158585148747, "grad_norm": 1.328804612159729, "learning_rate": 2.2000000000000003e-05, "loss": 0.369, "step": 110 }, { "epoch": 0.028109627547434995, "grad_norm": 2.6690480709075928, "learning_rate": 2.4e-05, "loss": 0.2072, "step": 120 }, { "epoch": 0.030452096509721246, "grad_norm": 1.2436017990112305, "learning_rate": 2.6000000000000002e-05, "loss": 0.2564, "step": 130 }, { "epoch": 0.03279456547200749, "grad_norm": 2.130502939224243, "learning_rate": 2.8000000000000003e-05, "loss": 0.1741, "step": 140 }, { "epoch": 0.035137034434293744, "grad_norm": 1.1833769083023071, "learning_rate": 3e-05, "loss": 0.1982, "step": 150 }, { "epoch": 0.037479503396579995, "grad_norm": 0.887791633605957, "learning_rate": 3.2000000000000005e-05, "loss": 0.2346, "step": 160 }, { "epoch": 0.039821972358866246, "grad_norm": 2.4128785133361816, "learning_rate": 3.4000000000000007e-05, "loss": 0.1967, "step": 170 }, { "epoch": 0.0421644413211525, "grad_norm": 1.2833918333053589, "learning_rate": 3.6e-05, "loss": 0.1552, "step": 180 }, { "epoch": 0.04450691028343874, "grad_norm": 1.459666132926941, "learning_rate": 3.8e-05, "loss": 0.2237, "step": 190 }, { "epoch": 0.04684937924572499, "grad_norm": 1.7674411535263062, "learning_rate": 4e-05, "loss": 0.1619, "step": 200 }, { "epoch": 0.049191848208011243, "grad_norm": 1.2941542863845825, "learning_rate": 4.2e-05, "loss": 0.184, "step": 210 }, { "epoch": 0.051534317170297494, "grad_norm": 1.7022488117218018, "learning_rate": 4.4000000000000006e-05, "loss": 0.1501, "step": 220 }, { "epoch": 0.053876786132583745, "grad_norm": 0.8502867221832275, "learning_rate": 4.600000000000001e-05, "loss": 0.2449, "step": 230 }, { "epoch": 0.05621925509486999, "grad_norm": 2.1729302406311035, "learning_rate": 4.8e-05, "loss": 0.141, "step": 240 }, { "epoch": 0.05856172405715624, "grad_norm": 1.9990278482437134, "learning_rate": 5e-05, "loss": 0.1569, "step": 250 }, { "epoch": 0.06090419301944249, "grad_norm": 1.0973132848739624, "learning_rate": 5.2000000000000004e-05, "loss": 0.1574, "step": 260 }, { "epoch": 0.06324666198172874, "grad_norm": 1.5121344327926636, "learning_rate": 5.4000000000000005e-05, "loss": 0.1309, "step": 270 }, { "epoch": 0.06558913094401499, "grad_norm": 1.0041357278823853, "learning_rate": 5.6000000000000006e-05, "loss": 0.2048, "step": 280 }, { "epoch": 0.06793159990630124, "grad_norm": 1.9920216798782349, "learning_rate": 5.8e-05, "loss": 0.1425, "step": 290 }, { "epoch": 0.07027406886858749, "grad_norm": 0.6136835217475891, "learning_rate": 6e-05, "loss": 0.1236, "step": 300 }, { "epoch": 0.07261653783087374, "grad_norm": 1.2063113451004028, "learning_rate": 6.2e-05, "loss": 0.1342, "step": 310 }, { "epoch": 0.07495900679315999, "grad_norm": 0.7644496560096741, "learning_rate": 6.400000000000001e-05, "loss": 0.1205, "step": 320 }, { "epoch": 0.07730147575544624, "grad_norm": 0.973790168762207, "learning_rate": 6.6e-05, "loss": 0.1551, "step": 330 }, { "epoch": 0.07964394471773249, "grad_norm": 1.9004161357879639, "learning_rate": 6.800000000000001e-05, "loss": 0.1395, "step": 340 }, { "epoch": 0.08198641368001874, "grad_norm": 0.8575976490974426, "learning_rate": 7e-05, "loss": 0.1081, "step": 350 }, { "epoch": 0.084328882642305, "grad_norm": 1.3740334510803223, "learning_rate": 7.2e-05, "loss": 0.18, "step": 360 }, { "epoch": 0.08667135160459125, "grad_norm": 0.7421107888221741, "learning_rate": 7.4e-05, "loss": 0.1496, "step": 370 }, { "epoch": 0.08901382056687748, "grad_norm": 1.4952155351638794, "learning_rate": 7.6e-05, "loss": 0.1491, "step": 380 }, { "epoch": 0.09135628952916373, "grad_norm": 1.0072972774505615, "learning_rate": 7.800000000000001e-05, "loss": 0.1282, "step": 390 }, { "epoch": 0.09369875849144998, "grad_norm": 1.719224452972412, "learning_rate": 8e-05, "loss": 0.1779, "step": 400 }, { "epoch": 0.09604122745373624, "grad_norm": 1.4302623271942139, "learning_rate": 8.2e-05, "loss": 0.1145, "step": 410 }, { "epoch": 0.09838369641602249, "grad_norm": 0.6622968316078186, "learning_rate": 8.4e-05, "loss": 0.1159, "step": 420 }, { "epoch": 0.10072616537830874, "grad_norm": 1.0967049598693848, "learning_rate": 8.6e-05, "loss": 0.1659, "step": 430 }, { "epoch": 0.10306863434059499, "grad_norm": 1.1332488059997559, "learning_rate": 8.800000000000001e-05, "loss": 0.1292, "step": 440 }, { "epoch": 0.10541110330288124, "grad_norm": 1.308289647102356, "learning_rate": 9e-05, "loss": 0.1202, "step": 450 }, { "epoch": 0.10775357226516749, "grad_norm": 0.5696719884872437, "learning_rate": 9.200000000000001e-05, "loss": 0.1118, "step": 460 }, { "epoch": 0.11009604122745374, "grad_norm": 0.9922944903373718, "learning_rate": 9.4e-05, "loss": 0.1644, "step": 470 }, { "epoch": 0.11243851018973998, "grad_norm": 1.5004724264144897, "learning_rate": 9.6e-05, "loss": 0.2011, "step": 480 }, { "epoch": 0.11478097915202623, "grad_norm": 0.9503705501556396, "learning_rate": 9.8e-05, "loss": 0.1038, "step": 490 }, { "epoch": 0.11712344811431248, "grad_norm": 1.421077013015747, "learning_rate": 0.0001, "loss": 0.0944, "step": 500 }, { "epoch": 0.11946591707659873, "grad_norm": 0.8938995599746704, "learning_rate": 9.999972660400536e-05, "loss": 0.1216, "step": 510 }, { "epoch": 0.12180838603888498, "grad_norm": 0.46683940291404724, "learning_rate": 9.999890641901125e-05, "loss": 0.1278, "step": 520 }, { "epoch": 0.12415085500117123, "grad_norm": 0.8092114925384521, "learning_rate": 9.999753945398704e-05, "loss": 0.0794, "step": 530 }, { "epoch": 0.12649332396345747, "grad_norm": 0.27710163593292236, "learning_rate": 9.99956257238817e-05, "loss": 0.1266, "step": 540 }, { "epoch": 0.12883579292574374, "grad_norm": 0.81737220287323, "learning_rate": 9.999316524962345e-05, "loss": 0.1108, "step": 550 }, { "epoch": 0.13117826188802997, "grad_norm": 0.6735175848007202, "learning_rate": 9.999015805811965e-05, "loss": 0.0854, "step": 560 }, { "epoch": 0.13352073085031624, "grad_norm": 0.2487485110759735, "learning_rate": 9.998660418225645e-05, "loss": 0.1045, "step": 570 }, { "epoch": 0.13586319981260248, "grad_norm": 0.3255215287208557, "learning_rate": 9.998250366089848e-05, "loss": 0.0948, "step": 580 }, { "epoch": 0.13820566877488874, "grad_norm": 0.7749798893928528, "learning_rate": 9.997785653888835e-05, "loss": 0.0775, "step": 590 }, { "epoch": 0.14054813773717498, "grad_norm": 1.220957636833191, "learning_rate": 9.997266286704631e-05, "loss": 0.1201, "step": 600 }, { "epoch": 0.14289060669946124, "grad_norm": 0.8066214919090271, "learning_rate": 9.996692270216947e-05, "loss": 0.0815, "step": 610 }, { "epoch": 0.14523307566174748, "grad_norm": 0.6408377885818481, "learning_rate": 9.996063610703137e-05, "loss": 0.1, "step": 620 }, { "epoch": 0.14757554462403374, "grad_norm": 0.8596289753913879, "learning_rate": 9.995380315038119e-05, "loss": 0.1008, "step": 630 }, { "epoch": 0.14991801358631998, "grad_norm": 0.972243070602417, "learning_rate": 9.994642390694308e-05, "loss": 0.1075, "step": 640 }, { "epoch": 0.15226048254860622, "grad_norm": 0.5220253467559814, "learning_rate": 9.993849845741524e-05, "loss": 0.1021, "step": 650 }, { "epoch": 0.15460295151089248, "grad_norm": 0.5453582406044006, "learning_rate": 9.993002688846913e-05, "loss": 0.0963, "step": 660 }, { "epoch": 0.15694542047317872, "grad_norm": 0.24789837002754211, "learning_rate": 9.992100929274846e-05, "loss": 0.0712, "step": 670 }, { "epoch": 0.15928788943546499, "grad_norm": 0.31857672333717346, "learning_rate": 9.991144576886823e-05, "loss": 0.0859, "step": 680 }, { "epoch": 0.16163035839775122, "grad_norm": 0.7285981178283691, "learning_rate": 9.990133642141359e-05, "loss": 0.1274, "step": 690 }, { "epoch": 0.1639728273600375, "grad_norm": 1.0549755096435547, "learning_rate": 9.989068136093873e-05, "loss": 0.1187, "step": 700 }, { "epoch": 0.16631529632232372, "grad_norm": 0.204506054520607, "learning_rate": 9.987948070396571e-05, "loss": 0.1005, "step": 710 }, { "epoch": 0.16865776528461, "grad_norm": 0.4295964241027832, "learning_rate": 9.986773457298311e-05, "loss": 0.0937, "step": 720 }, { "epoch": 0.17100023424689623, "grad_norm": 1.0681158304214478, "learning_rate": 9.985544309644475e-05, "loss": 0.0855, "step": 730 }, { "epoch": 0.1733427032091825, "grad_norm": 0.667492151260376, "learning_rate": 9.984260640876821e-05, "loss": 0.1096, "step": 740 }, { "epoch": 0.17568517217146873, "grad_norm": 0.6995371580123901, "learning_rate": 9.98292246503335e-05, "loss": 0.108, "step": 750 }, { "epoch": 0.17802764113375497, "grad_norm": 0.9727945923805237, "learning_rate": 9.981529796748134e-05, "loss": 0.1155, "step": 760 }, { "epoch": 0.18037011009604123, "grad_norm": 0.3702404201030731, "learning_rate": 9.980082651251175e-05, "loss": 0.0846, "step": 770 }, { "epoch": 0.18271257905832747, "grad_norm": 0.3169856667518616, "learning_rate": 9.97858104436822e-05, "loss": 0.0917, "step": 780 }, { "epoch": 0.18505504802061373, "grad_norm": 0.6973789930343628, "learning_rate": 9.977024992520602e-05, "loss": 0.0785, "step": 790 }, { "epoch": 0.18739751698289997, "grad_norm": 0.5686987042427063, "learning_rate": 9.975414512725057e-05, "loss": 0.1015, "step": 800 }, { "epoch": 0.18973998594518623, "grad_norm": 0.6190043687820435, "learning_rate": 9.973749622593534e-05, "loss": 0.0753, "step": 810 }, { "epoch": 0.19208245490747247, "grad_norm": 0.3807699382305145, "learning_rate": 9.972030340333001e-05, "loss": 0.0734, "step": 820 }, { "epoch": 0.19442492386975874, "grad_norm": 0.45342546701431274, "learning_rate": 9.970256684745258e-05, "loss": 0.1012, "step": 830 }, { "epoch": 0.19676739283204497, "grad_norm": 0.2780962586402893, "learning_rate": 9.968428675226714e-05, "loss": 0.0757, "step": 840 }, { "epoch": 0.19910986179433124, "grad_norm": 0.20734530687332153, "learning_rate": 9.966546331768191e-05, "loss": 0.0751, "step": 850 }, { "epoch": 0.20145233075661748, "grad_norm": 0.3406268358230591, "learning_rate": 9.964609674954696e-05, "loss": 0.0937, "step": 860 }, { "epoch": 0.2037947997189037, "grad_norm": 0.33824971318244934, "learning_rate": 9.962618725965196e-05, "loss": 0.0913, "step": 870 }, { "epoch": 0.20613726868118998, "grad_norm": 0.5773669481277466, "learning_rate": 9.96057350657239e-05, "loss": 0.0834, "step": 880 }, { "epoch": 0.20847973764347622, "grad_norm": 0.5624499917030334, "learning_rate": 9.95847403914247e-05, "loss": 0.1001, "step": 890 }, { "epoch": 0.21082220660576248, "grad_norm": 0.5361132025718689, "learning_rate": 9.956320346634876e-05, "loss": 0.1233, "step": 900 }, { "epoch": 0.21316467556804872, "grad_norm": 0.4824270009994507, "learning_rate": 9.954112452602045e-05, "loss": 0.0882, "step": 910 }, { "epoch": 0.21550714453033498, "grad_norm": 0.6482338905334473, "learning_rate": 9.95185038118915e-05, "loss": 0.0647, "step": 920 }, { "epoch": 0.21784961349262122, "grad_norm": 0.2783452868461609, "learning_rate": 9.949534157133844e-05, "loss": 0.0917, "step": 930 }, { "epoch": 0.22019208245490748, "grad_norm": 0.4593198597431183, "learning_rate": 9.94716380576598e-05, "loss": 0.068, "step": 940 }, { "epoch": 0.22253455141719372, "grad_norm": 0.7751959562301636, "learning_rate": 9.944739353007344e-05, "loss": 0.1032, "step": 950 }, { "epoch": 0.22487702037947996, "grad_norm": 0.3963168263435364, "learning_rate": 9.942260825371358e-05, "loss": 0.0942, "step": 960 }, { "epoch": 0.22721948934176622, "grad_norm": 0.40413302183151245, "learning_rate": 9.939728249962807e-05, "loss": 0.0736, "step": 970 }, { "epoch": 0.22956195830405246, "grad_norm": 0.3862430155277252, "learning_rate": 9.937141654477528e-05, "loss": 0.0726, "step": 980 }, { "epoch": 0.23190442726633873, "grad_norm": 0.5864925384521484, "learning_rate": 9.934501067202117e-05, "loss": 0.0872, "step": 990 }, { "epoch": 0.23424689622862496, "grad_norm": 0.31625375151634216, "learning_rate": 9.931806517013612e-05, "loss": 0.0708, "step": 1000 }, { "epoch": 0.23658936519091123, "grad_norm": 0.5403046011924744, "learning_rate": 9.929058033379181e-05, "loss": 0.073, "step": 1010 }, { "epoch": 0.23893183415319746, "grad_norm": 0.4366021156311035, "learning_rate": 9.926255646355804e-05, "loss": 0.0643, "step": 1020 }, { "epoch": 0.24127430311548373, "grad_norm": 0.500108540058136, "learning_rate": 9.923399386589933e-05, "loss": 0.0437, "step": 1030 }, { "epoch": 0.24361677207776997, "grad_norm": 0.8096440434455872, "learning_rate": 9.92048928531717e-05, "loss": 0.0555, "step": 1040 }, { "epoch": 0.24595924104005623, "grad_norm": 0.6826971173286438, "learning_rate": 9.917525374361912e-05, "loss": 0.0704, "step": 1050 }, { "epoch": 0.24830171000234247, "grad_norm": 0.27831944823265076, "learning_rate": 9.914507686137019e-05, "loss": 0.0659, "step": 1060 }, { "epoch": 0.2506441789646287, "grad_norm": 0.35980355739593506, "learning_rate": 9.911436253643445e-05, "loss": 0.0652, "step": 1070 }, { "epoch": 0.25298664792691494, "grad_norm": 0.7075427174568176, "learning_rate": 9.90831111046988e-05, "loss": 0.0933, "step": 1080 }, { "epoch": 0.25532911688920124, "grad_norm": 0.33446595072746277, "learning_rate": 9.905132290792394e-05, "loss": 0.0594, "step": 1090 }, { "epoch": 0.2576715858514875, "grad_norm": 0.21890777349472046, "learning_rate": 9.901899829374047e-05, "loss": 0.0636, "step": 1100 }, { "epoch": 0.2600140548137737, "grad_norm": 0.19606763124465942, "learning_rate": 9.89861376156452e-05, "loss": 0.0573, "step": 1110 }, { "epoch": 0.26235652377605995, "grad_norm": 0.40309399366378784, "learning_rate": 9.895274123299723e-05, "loss": 0.0711, "step": 1120 }, { "epoch": 0.26469899273834624, "grad_norm": 0.15657459199428558, "learning_rate": 9.891880951101407e-05, "loss": 0.0596, "step": 1130 }, { "epoch": 0.2670414617006325, "grad_norm": 0.5244103670120239, "learning_rate": 9.888434282076758e-05, "loss": 0.0624, "step": 1140 }, { "epoch": 0.2693839306629187, "grad_norm": 0.6240133047103882, "learning_rate": 9.884934153917997e-05, "loss": 0.1013, "step": 1150 }, { "epoch": 0.27172639962520495, "grad_norm": 0.2892966568470001, "learning_rate": 9.881380604901964e-05, "loss": 0.0886, "step": 1160 }, { "epoch": 0.27406886858749124, "grad_norm": 0.11301174759864807, "learning_rate": 9.877773673889701e-05, "loss": 0.0967, "step": 1170 }, { "epoch": 0.2764113375497775, "grad_norm": 0.6525554060935974, "learning_rate": 9.87411340032603e-05, "loss": 0.0857, "step": 1180 }, { "epoch": 0.2787538065120637, "grad_norm": 0.27176904678344727, "learning_rate": 9.870399824239117e-05, "loss": 0.0556, "step": 1190 }, { "epoch": 0.28109627547434995, "grad_norm": 0.4166867136955261, "learning_rate": 9.86663298624003e-05, "loss": 0.0684, "step": 1200 }, { "epoch": 0.2834387444366362, "grad_norm": 0.19580566883087158, "learning_rate": 9.862812927522309e-05, "loss": 0.0882, "step": 1210 }, { "epoch": 0.2857812133989225, "grad_norm": 0.44604888558387756, "learning_rate": 9.858939689861506e-05, "loss": 0.0883, "step": 1220 }, { "epoch": 0.2881236823612087, "grad_norm": 0.49636200070381165, "learning_rate": 9.855013315614725e-05, "loss": 0.0912, "step": 1230 }, { "epoch": 0.29046615132349496, "grad_norm": 0.1988007128238678, "learning_rate": 9.851033847720166e-05, "loss": 0.0719, "step": 1240 }, { "epoch": 0.2928086202857812, "grad_norm": 0.30095556378364563, "learning_rate": 9.847001329696653e-05, "loss": 0.078, "step": 1250 }, { "epoch": 0.2951510892480675, "grad_norm": 0.34190279245376587, "learning_rate": 9.842915805643155e-05, "loss": 0.0442, "step": 1260 }, { "epoch": 0.2974935582103537, "grad_norm": 0.25464609265327454, "learning_rate": 9.838777320238312e-05, "loss": 0.0583, "step": 1270 }, { "epoch": 0.29983602717263996, "grad_norm": 0.07694657146930695, "learning_rate": 9.834585918739936e-05, "loss": 0.0359, "step": 1280 }, { "epoch": 0.3021784961349262, "grad_norm": 0.19848985970020294, "learning_rate": 9.830341646984521e-05, "loss": 0.0812, "step": 1290 }, { "epoch": 0.30452096509721244, "grad_norm": 0.27825915813446045, "learning_rate": 9.826044551386744e-05, "loss": 0.0496, "step": 1300 }, { "epoch": 0.30686343405949873, "grad_norm": 0.3718523681163788, "learning_rate": 9.821694678938953e-05, "loss": 0.0671, "step": 1310 }, { "epoch": 0.30920590302178497, "grad_norm": 0.5311722159385681, "learning_rate": 9.817292077210659e-05, "loss": 0.0739, "step": 1320 }, { "epoch": 0.3115483719840712, "grad_norm": 0.41185882687568665, "learning_rate": 9.812836794348004e-05, "loss": 0.0665, "step": 1330 }, { "epoch": 0.31389084094635744, "grad_norm": 0.2839798629283905, "learning_rate": 9.808328879073251e-05, "loss": 0.0495, "step": 1340 }, { "epoch": 0.31623330990864373, "grad_norm": 0.5456023812294006, "learning_rate": 9.803768380684242e-05, "loss": 0.0538, "step": 1350 }, { "epoch": 0.31857577887092997, "grad_norm": 1.1303348541259766, "learning_rate": 9.799155349053851e-05, "loss": 0.0948, "step": 1360 }, { "epoch": 0.3209182478332162, "grad_norm": 0.3756462633609772, "learning_rate": 9.794489834629455e-05, "loss": 0.0405, "step": 1370 }, { "epoch": 0.32326071679550245, "grad_norm": 0.45304539799690247, "learning_rate": 9.789771888432375e-05, "loss": 0.0518, "step": 1380 }, { "epoch": 0.3256031857577887, "grad_norm": 0.42578068375587463, "learning_rate": 9.785001562057309e-05, "loss": 0.0694, "step": 1390 }, { "epoch": 0.327945654720075, "grad_norm": 0.5314955711364746, "learning_rate": 9.780178907671789e-05, "loss": 0.0656, "step": 1400 }, { "epoch": 0.3302881236823612, "grad_norm": 0.445273220539093, "learning_rate": 9.775303978015585e-05, "loss": 0.0467, "step": 1410 }, { "epoch": 0.33263059264464745, "grad_norm": 0.45427191257476807, "learning_rate": 9.77037682640015e-05, "loss": 0.071, "step": 1420 }, { "epoch": 0.3349730616069337, "grad_norm": 1.1310575008392334, "learning_rate": 9.765397506708023e-05, "loss": 0.0783, "step": 1430 }, { "epoch": 0.33731553056922, "grad_norm": 0.37553080916404724, "learning_rate": 9.760366073392246e-05, "loss": 0.0595, "step": 1440 }, { "epoch": 0.3396579995315062, "grad_norm": 0.456626296043396, "learning_rate": 9.755282581475769e-05, "loss": 0.0684, "step": 1450 }, { "epoch": 0.34200046849379245, "grad_norm": 0.23000092804431915, "learning_rate": 9.750147086550844e-05, "loss": 0.0663, "step": 1460 }, { "epoch": 0.3443429374560787, "grad_norm": 0.8536004424095154, "learning_rate": 9.744959644778422e-05, "loss": 0.0615, "step": 1470 }, { "epoch": 0.346685406418365, "grad_norm": 0.2810976803302765, "learning_rate": 9.739720312887535e-05, "loss": 0.0499, "step": 1480 }, { "epoch": 0.3490278753806512, "grad_norm": 0.5517282485961914, "learning_rate": 9.734429148174675e-05, "loss": 0.0623, "step": 1490 }, { "epoch": 0.35137034434293746, "grad_norm": 0.5391654372215271, "learning_rate": 9.729086208503174e-05, "loss": 0.0701, "step": 1500 }, { "epoch": 0.3537128133052237, "grad_norm": 0.2104485183954239, "learning_rate": 9.723691552302562e-05, "loss": 0.0624, "step": 1510 }, { "epoch": 0.35605528226750993, "grad_norm": 0.6778100728988647, "learning_rate": 9.718245238567939e-05, "loss": 0.0735, "step": 1520 }, { "epoch": 0.3583977512297962, "grad_norm": 0.5578711628913879, "learning_rate": 9.712747326859315e-05, "loss": 0.0649, "step": 1530 }, { "epoch": 0.36074022019208246, "grad_norm": 0.19399204850196838, "learning_rate": 9.707197877300974e-05, "loss": 0.0696, "step": 1540 }, { "epoch": 0.3630826891543687, "grad_norm": 0.36409327387809753, "learning_rate": 9.701596950580806e-05, "loss": 0.0764, "step": 1550 }, { "epoch": 0.36542515811665494, "grad_norm": 0.3991371691226959, "learning_rate": 9.695944607949649e-05, "loss": 0.053, "step": 1560 }, { "epoch": 0.36776762707894123, "grad_norm": 0.24415276944637299, "learning_rate": 9.690240911220618e-05, "loss": 0.0359, "step": 1570 }, { "epoch": 0.37011009604122747, "grad_norm": 0.2075069695711136, "learning_rate": 9.684485922768422e-05, "loss": 0.0663, "step": 1580 }, { "epoch": 0.3724525650035137, "grad_norm": 0.6543785333633423, "learning_rate": 9.6786797055287e-05, "loss": 0.0494, "step": 1590 }, { "epoch": 0.37479503396579994, "grad_norm": 0.5545148253440857, "learning_rate": 9.672822322997305e-05, "loss": 0.0922, "step": 1600 }, { "epoch": 0.3771375029280862, "grad_norm": 0.3024766743183136, "learning_rate": 9.66691383922964e-05, "loss": 0.0458, "step": 1610 }, { "epoch": 0.37947997189037247, "grad_norm": 0.18543019890785217, "learning_rate": 9.660954318839933e-05, "loss": 0.0814, "step": 1620 }, { "epoch": 0.3818224408526587, "grad_norm": 0.6047130823135376, "learning_rate": 9.654943827000548e-05, "loss": 0.0749, "step": 1630 }, { "epoch": 0.38416490981494494, "grad_norm": 0.5619345307350159, "learning_rate": 9.648882429441257e-05, "loss": 0.0647, "step": 1640 }, { "epoch": 0.3865073787772312, "grad_norm": 0.3835267126560211, "learning_rate": 9.642770192448536e-05, "loss": 0.0526, "step": 1650 }, { "epoch": 0.3888498477395175, "grad_norm": 0.2994864583015442, "learning_rate": 9.636607182864827e-05, "loss": 0.0451, "step": 1660 }, { "epoch": 0.3911923167018037, "grad_norm": 0.5770288705825806, "learning_rate": 9.630393468087818e-05, "loss": 0.0716, "step": 1670 }, { "epoch": 0.39353478566408995, "grad_norm": 0.3165629506111145, "learning_rate": 9.624129116069694e-05, "loss": 0.0468, "step": 1680 }, { "epoch": 0.3958772546263762, "grad_norm": 0.11682554334402084, "learning_rate": 9.617814195316411e-05, "loss": 0.0669, "step": 1690 }, { "epoch": 0.3982197235886625, "grad_norm": 0.4979915916919708, "learning_rate": 9.611448774886924e-05, "loss": 0.0553, "step": 1700 }, { "epoch": 0.4005621925509487, "grad_norm": 0.14603012800216675, "learning_rate": 9.605032924392457e-05, "loss": 0.0597, "step": 1710 }, { "epoch": 0.40290466151323495, "grad_norm": 0.3345795273780823, "learning_rate": 9.598566713995718e-05, "loss": 0.049, "step": 1720 }, { "epoch": 0.4052471304755212, "grad_norm": 0.4213583171367645, "learning_rate": 9.59205021441015e-05, "loss": 0.0659, "step": 1730 }, { "epoch": 0.4075895994378074, "grad_norm": 0.1514274775981903, "learning_rate": 9.58548349689915e-05, "loss": 0.0803, "step": 1740 }, { "epoch": 0.4099320684000937, "grad_norm": 1.1298153400421143, "learning_rate": 9.578866633275288e-05, "loss": 0.0574, "step": 1750 }, { "epoch": 0.41227453736237996, "grad_norm": 0.2879124581813812, "learning_rate": 9.572199695899522e-05, "loss": 0.0618, "step": 1760 }, { "epoch": 0.4146170063246662, "grad_norm": 0.21584849059581757, "learning_rate": 9.565482757680415e-05, "loss": 0.069, "step": 1770 }, { "epoch": 0.41695947528695243, "grad_norm": 0.27666664123535156, "learning_rate": 9.558715892073323e-05, "loss": 0.0619, "step": 1780 }, { "epoch": 0.4193019442492387, "grad_norm": 0.36067232489585876, "learning_rate": 9.551899173079607e-05, "loss": 0.0512, "step": 1790 }, { "epoch": 0.42164441321152496, "grad_norm": 0.21706882119178772, "learning_rate": 9.545032675245813e-05, "loss": 0.0399, "step": 1800 }, { "epoch": 0.4239868821738112, "grad_norm": 0.2502746880054474, "learning_rate": 9.538116473662861e-05, "loss": 0.067, "step": 1810 }, { "epoch": 0.42632935113609743, "grad_norm": 0.19951611757278442, "learning_rate": 9.531150643965223e-05, "loss": 0.0572, "step": 1820 }, { "epoch": 0.42867182009838367, "grad_norm": 0.5946075916290283, "learning_rate": 9.524135262330098e-05, "loss": 0.0556, "step": 1830 }, { "epoch": 0.43101428906066996, "grad_norm": 0.20143412053585052, "learning_rate": 9.517070405476575e-05, "loss": 0.0556, "step": 1840 }, { "epoch": 0.4333567580229562, "grad_norm": 0.30480778217315674, "learning_rate": 9.509956150664796e-05, "loss": 0.0721, "step": 1850 }, { "epoch": 0.43569922698524244, "grad_norm": 0.289962500333786, "learning_rate": 9.502792575695112e-05, "loss": 0.0349, "step": 1860 }, { "epoch": 0.4380416959475287, "grad_norm": 0.23470467329025269, "learning_rate": 9.49557975890723e-05, "loss": 0.0508, "step": 1870 }, { "epoch": 0.44038416490981497, "grad_norm": 0.5040431022644043, "learning_rate": 9.488317779179361e-05, "loss": 0.0576, "step": 1880 }, { "epoch": 0.4427266338721012, "grad_norm": 0.4373694360256195, "learning_rate": 9.481006715927351e-05, "loss": 0.0526, "step": 1890 }, { "epoch": 0.44506910283438744, "grad_norm": 0.41776043176651, "learning_rate": 9.473646649103818e-05, "loss": 0.0417, "step": 1900 }, { "epoch": 0.4474115717966737, "grad_norm": 0.5410218238830566, "learning_rate": 9.46623765919727e-05, "loss": 0.0737, "step": 1910 }, { "epoch": 0.4497540407589599, "grad_norm": 0.4274581968784332, "learning_rate": 9.458779827231237e-05, "loss": 0.0715, "step": 1920 }, { "epoch": 0.4520965097212462, "grad_norm": 0.31722667813301086, "learning_rate": 9.451273234763371e-05, "loss": 0.0672, "step": 1930 }, { "epoch": 0.45443897868353245, "grad_norm": 0.221653014421463, "learning_rate": 9.443717963884569e-05, "loss": 0.0631, "step": 1940 }, { "epoch": 0.4567814476458187, "grad_norm": 0.2043227255344391, "learning_rate": 9.43611409721806e-05, "loss": 0.0436, "step": 1950 }, { "epoch": 0.4591239166081049, "grad_norm": 0.1967364400625229, "learning_rate": 9.428461717918511e-05, "loss": 0.0601, "step": 1960 }, { "epoch": 0.4614663855703912, "grad_norm": 0.23282958567142487, "learning_rate": 9.420760909671118e-05, "loss": 0.0441, "step": 1970 }, { "epoch": 0.46380885453267745, "grad_norm": 0.6064874529838562, "learning_rate": 9.413011756690685e-05, "loss": 0.0691, "step": 1980 }, { "epoch": 0.4661513234949637, "grad_norm": 0.29970476031303406, "learning_rate": 9.405214343720707e-05, "loss": 0.0362, "step": 1990 }, { "epoch": 0.4684937924572499, "grad_norm": 0.3310692310333252, "learning_rate": 9.397368756032445e-05, "loss": 0.045, "step": 2000 }, { "epoch": 0.4708362614195362, "grad_norm": 0.34072744846343994, "learning_rate": 9.389475079423988e-05, "loss": 0.0646, "step": 2010 }, { "epoch": 0.47317873038182245, "grad_norm": 0.09513302892446518, "learning_rate": 9.381533400219318e-05, "loss": 0.0543, "step": 2020 }, { "epoch": 0.4755211993441087, "grad_norm": 0.19264456629753113, "learning_rate": 9.373543805267368e-05, "loss": 0.0682, "step": 2030 }, { "epoch": 0.47786366830639493, "grad_norm": 0.3914099633693695, "learning_rate": 9.365506381941066e-05, "loss": 0.0455, "step": 2040 }, { "epoch": 0.48020613726868117, "grad_norm": 0.4226783514022827, "learning_rate": 9.357421218136386e-05, "loss": 0.0689, "step": 2050 }, { "epoch": 0.48254860623096746, "grad_norm": 0.41455796360969543, "learning_rate": 9.349288402271388e-05, "loss": 0.0596, "step": 2060 }, { "epoch": 0.4848910751932537, "grad_norm": 0.2510756254196167, "learning_rate": 9.341108023285238e-05, "loss": 0.0341, "step": 2070 }, { "epoch": 0.48723354415553993, "grad_norm": 0.40096133947372437, "learning_rate": 9.332880170637252e-05, "loss": 0.0813, "step": 2080 }, { "epoch": 0.48957601311782617, "grad_norm": 0.6878464221954346, "learning_rate": 9.32460493430591e-05, "loss": 0.044, "step": 2090 }, { "epoch": 0.49191848208011246, "grad_norm": 0.3416203558444977, "learning_rate": 9.316282404787871e-05, "loss": 0.0686, "step": 2100 }, { "epoch": 0.4942609510423987, "grad_norm": 0.12535825371742249, "learning_rate": 9.30791267309698e-05, "loss": 0.0354, "step": 2110 }, { "epoch": 0.49660342000468494, "grad_norm": 0.19023941457271576, "learning_rate": 9.299495830763286e-05, "loss": 0.0376, "step": 2120 }, { "epoch": 0.4989458889669712, "grad_norm": 0.3778730034828186, "learning_rate": 9.291031969832026e-05, "loss": 0.0518, "step": 2130 }, { "epoch": 0.5012883579292574, "grad_norm": 0.256195068359375, "learning_rate": 9.282521182862629e-05, "loss": 0.0571, "step": 2140 }, { "epoch": 0.5036308268915437, "grad_norm": 0.19933399558067322, "learning_rate": 9.273963562927695e-05, "loss": 0.0271, "step": 2150 }, { "epoch": 0.5059732958538299, "grad_norm": 0.06613205373287201, "learning_rate": 9.265359203611987e-05, "loss": 0.0334, "step": 2160 }, { "epoch": 0.5083157648161162, "grad_norm": 0.21248801052570343, "learning_rate": 9.256708199011401e-05, "loss": 0.0746, "step": 2170 }, { "epoch": 0.5106582337784025, "grad_norm": 0.3601578176021576, "learning_rate": 9.248010643731935e-05, "loss": 0.076, "step": 2180 }, { "epoch": 0.5130007027406887, "grad_norm": 0.0984947606921196, "learning_rate": 9.239266632888659e-05, "loss": 0.0892, "step": 2190 }, { "epoch": 0.515343171702975, "grad_norm": 0.13032953441143036, "learning_rate": 9.230476262104677e-05, "loss": 0.039, "step": 2200 }, { "epoch": 0.5176856406652612, "grad_norm": 0.48068541288375854, "learning_rate": 9.221639627510076e-05, "loss": 0.0585, "step": 2210 }, { "epoch": 0.5200281096275474, "grad_norm": 0.42812222242355347, "learning_rate": 9.212756825740873e-05, "loss": 0.0929, "step": 2220 }, { "epoch": 0.5223705785898337, "grad_norm": 0.3526000380516052, "learning_rate": 9.20382795393797e-05, "loss": 0.0657, "step": 2230 }, { "epoch": 0.5247130475521199, "grad_norm": 0.14142726361751556, "learning_rate": 9.194853109746074e-05, "loss": 0.0571, "step": 2240 }, { "epoch": 0.5270555165144062, "grad_norm": 0.10022013634443283, "learning_rate": 9.185832391312644e-05, "loss": 0.0362, "step": 2250 }, { "epoch": 0.5293979854766925, "grad_norm": 0.18126869201660156, "learning_rate": 9.176765897286813e-05, "loss": 0.0616, "step": 2260 }, { "epoch": 0.5317404544389787, "grad_norm": 0.22198501229286194, "learning_rate": 9.167653726818305e-05, "loss": 0.0227, "step": 2270 }, { "epoch": 0.534082923401265, "grad_norm": 0.07468587905168533, "learning_rate": 9.158495979556358e-05, "loss": 0.045, "step": 2280 }, { "epoch": 0.5364253923635511, "grad_norm": 0.1882839947938919, "learning_rate": 9.14929275564863e-05, "loss": 0.0569, "step": 2290 }, { "epoch": 0.5387678613258374, "grad_norm": 0.1339283585548401, "learning_rate": 9.140044155740101e-05, "loss": 0.0692, "step": 2300 }, { "epoch": 0.5411103302881237, "grad_norm": 0.19089505076408386, "learning_rate": 9.130750280971978e-05, "loss": 0.0638, "step": 2310 }, { "epoch": 0.5434527992504099, "grad_norm": 0.131087064743042, "learning_rate": 9.121411232980588e-05, "loss": 0.0656, "step": 2320 }, { "epoch": 0.5457952682126962, "grad_norm": 0.24333599209785461, "learning_rate": 9.112027113896262e-05, "loss": 0.0617, "step": 2330 }, { "epoch": 0.5481377371749825, "grad_norm": 0.4338069260120392, "learning_rate": 9.102598026342222e-05, "loss": 0.0384, "step": 2340 }, { "epoch": 0.5504802061372687, "grad_norm": 0.3546713888645172, "learning_rate": 9.093124073433463e-05, "loss": 0.0594, "step": 2350 }, { "epoch": 0.552822675099555, "grad_norm": 0.1043967604637146, "learning_rate": 9.083605358775612e-05, "loss": 0.0482, "step": 2360 }, { "epoch": 0.5551651440618411, "grad_norm": 0.16685545444488525, "learning_rate": 9.074041986463808e-05, "loss": 0.0439, "step": 2370 }, { "epoch": 0.5575076130241274, "grad_norm": 0.15651892125606537, "learning_rate": 9.064434061081562e-05, "loss": 0.0542, "step": 2380 }, { "epoch": 0.5598500819864137, "grad_norm": 0.33224546909332275, "learning_rate": 9.0547816876996e-05, "loss": 0.0772, "step": 2390 }, { "epoch": 0.5621925509486999, "grad_norm": 0.3219659626483917, "learning_rate": 9.045084971874738e-05, "loss": 0.0347, "step": 2400 }, { "epoch": 0.5645350199109862, "grad_norm": 0.3930731415748596, "learning_rate": 9.035344019648702e-05, "loss": 0.0386, "step": 2410 }, { "epoch": 0.5668774888732724, "grad_norm": 0.13527953624725342, "learning_rate": 9.025558937546988e-05, "loss": 0.0479, "step": 2420 }, { "epoch": 0.5692199578355587, "grad_norm": 0.1432938128709793, "learning_rate": 9.015729832577681e-05, "loss": 0.0319, "step": 2430 }, { "epoch": 0.571562426797845, "grad_norm": 0.25687897205352783, "learning_rate": 9.005856812230304e-05, "loss": 0.0387, "step": 2440 }, { "epoch": 0.5739048957601312, "grad_norm": 0.31300991773605347, "learning_rate": 8.995939984474624e-05, "loss": 0.0574, "step": 2450 }, { "epoch": 0.5762473647224174, "grad_norm": 0.25793933868408203, "learning_rate": 8.98597945775948e-05, "loss": 0.0415, "step": 2460 }, { "epoch": 0.5785898336847036, "grad_norm": 0.13978935778141022, "learning_rate": 8.975975341011596e-05, "loss": 0.0366, "step": 2470 }, { "epoch": 0.5809323026469899, "grad_norm": 0.20552988350391388, "learning_rate": 8.965927743634391e-05, "loss": 0.0519, "step": 2480 }, { "epoch": 0.5832747716092762, "grad_norm": 0.0843147486448288, "learning_rate": 8.955836775506776e-05, "loss": 0.0434, "step": 2490 }, { "epoch": 0.5856172405715624, "grad_norm": 0.519131600856781, "learning_rate": 8.945702546981969e-05, "loss": 0.044, "step": 2500 }, { "epoch": 0.5879597095338487, "grad_norm": 0.20150704681873322, "learning_rate": 8.935525168886262e-05, "loss": 0.0486, "step": 2510 }, { "epoch": 0.590302178496135, "grad_norm": 0.6557456851005554, "learning_rate": 8.92530475251784e-05, "loss": 0.0444, "step": 2520 }, { "epoch": 0.5926446474584212, "grad_norm": 0.48158717155456543, "learning_rate": 8.91504140964553e-05, "loss": 0.0512, "step": 2530 }, { "epoch": 0.5949871164207075, "grad_norm": 0.3636298179626465, "learning_rate": 8.90473525250761e-05, "loss": 0.052, "step": 2540 }, { "epoch": 0.5973295853829936, "grad_norm": 0.1767117828130722, "learning_rate": 8.894386393810563e-05, "loss": 0.0534, "step": 2550 }, { "epoch": 0.5996720543452799, "grad_norm": 0.30989664793014526, "learning_rate": 8.883994946727849e-05, "loss": 0.0765, "step": 2560 }, { "epoch": 0.6020145233075662, "grad_norm": 0.28089532256126404, "learning_rate": 8.873561024898668e-05, "loss": 0.0424, "step": 2570 }, { "epoch": 0.6043569922698524, "grad_norm": 0.5266916751861572, "learning_rate": 8.863084742426719e-05, "loss": 0.0364, "step": 2580 }, { "epoch": 0.6066994612321387, "grad_norm": 0.5653497576713562, "learning_rate": 8.852566213878947e-05, "loss": 0.0604, "step": 2590 }, { "epoch": 0.6090419301944249, "grad_norm": 0.34995973110198975, "learning_rate": 8.842005554284296e-05, "loss": 0.0386, "step": 2600 }, { "epoch": 0.6113843991567112, "grad_norm": 0.42935842275619507, "learning_rate": 8.831402879132446e-05, "loss": 0.0595, "step": 2610 }, { "epoch": 0.6137268681189975, "grad_norm": 0.19672085344791412, "learning_rate": 8.820758304372557e-05, "loss": 0.0426, "step": 2620 }, { "epoch": 0.6160693370812836, "grad_norm": 0.17344583570957184, "learning_rate": 8.810071946411989e-05, "loss": 0.0979, "step": 2630 }, { "epoch": 0.6184118060435699, "grad_norm": 0.19755525887012482, "learning_rate": 8.799343922115044e-05, "loss": 0.0322, "step": 2640 }, { "epoch": 0.6207542750058562, "grad_norm": 0.33817166090011597, "learning_rate": 8.788574348801675e-05, "loss": 0.0375, "step": 2650 }, { "epoch": 0.6230967439681424, "grad_norm": 0.44614845514297485, "learning_rate": 8.77776334424621e-05, "loss": 0.054, "step": 2660 }, { "epoch": 0.6254392129304287, "grad_norm": 0.4128440022468567, "learning_rate": 8.766911026676064e-05, "loss": 0.0422, "step": 2670 }, { "epoch": 0.6277816818927149, "grad_norm": 0.22449485957622528, "learning_rate": 8.756017514770443e-05, "loss": 0.037, "step": 2680 }, { "epoch": 0.6301241508550012, "grad_norm": 0.2689172029495239, "learning_rate": 8.745082927659047e-05, "loss": 0.0353, "step": 2690 }, { "epoch": 0.6324666198172875, "grad_norm": 0.05075841769576073, "learning_rate": 8.73410738492077e-05, "loss": 0.0333, "step": 2700 }, { "epoch": 0.6348090887795736, "grad_norm": 0.1499403417110443, "learning_rate": 8.723091006582389e-05, "loss": 0.0559, "step": 2710 }, { "epoch": 0.6371515577418599, "grad_norm": 0.36928892135620117, "learning_rate": 8.71203391311725e-05, "loss": 0.0763, "step": 2720 }, { "epoch": 0.6394940267041461, "grad_norm": 0.5727768540382385, "learning_rate": 8.700936225443959e-05, "loss": 0.0527, "step": 2730 }, { "epoch": 0.6418364956664324, "grad_norm": 0.30735543370246887, "learning_rate": 8.689798064925049e-05, "loss": 0.0585, "step": 2740 }, { "epoch": 0.6441789646287187, "grad_norm": 0.3882769048213959, "learning_rate": 8.678619553365659e-05, "loss": 0.0491, "step": 2750 }, { "epoch": 0.6465214335910049, "grad_norm": 0.365843802690506, "learning_rate": 8.6674008130122e-05, "loss": 0.0397, "step": 2760 }, { "epoch": 0.6488639025532912, "grad_norm": 0.21451324224472046, "learning_rate": 8.656141966551019e-05, "loss": 0.0365, "step": 2770 }, { "epoch": 0.6512063715155774, "grad_norm": 0.1609046310186386, "learning_rate": 8.644843137107059e-05, "loss": 0.039, "step": 2780 }, { "epoch": 0.6535488404778637, "grad_norm": 0.7074998021125793, "learning_rate": 8.633504448242505e-05, "loss": 0.0591, "step": 2790 }, { "epoch": 0.65589130944015, "grad_norm": 0.21024738252162933, "learning_rate": 8.622126023955446e-05, "loss": 0.0488, "step": 2800 }, { "epoch": 0.6582337784024361, "grad_norm": 0.3021513819694519, "learning_rate": 8.610707988678503e-05, "loss": 0.04, "step": 2810 }, { "epoch": 0.6605762473647224, "grad_norm": 0.19868189096450806, "learning_rate": 8.599250467277483e-05, "loss": 0.0319, "step": 2820 }, { "epoch": 0.6629187163270087, "grad_norm": 0.15607990324497223, "learning_rate": 8.587753585050004e-05, "loss": 0.036, "step": 2830 }, { "epoch": 0.6652611852892949, "grad_norm": 0.3136105239391327, "learning_rate": 8.576217467724128e-05, "loss": 0.0752, "step": 2840 }, { "epoch": 0.6676036542515812, "grad_norm": 0.21903324127197266, "learning_rate": 8.564642241456986e-05, "loss": 0.0416, "step": 2850 }, { "epoch": 0.6699461232138674, "grad_norm": 0.5193045735359192, "learning_rate": 8.553028032833397e-05, "loss": 0.0386, "step": 2860 }, { "epoch": 0.6722885921761537, "grad_norm": 0.5539060235023499, "learning_rate": 8.541374968864487e-05, "loss": 0.0439, "step": 2870 }, { "epoch": 0.67463106113844, "grad_norm": 0.2819710969924927, "learning_rate": 8.529683176986295e-05, "loss": 0.0541, "step": 2880 }, { "epoch": 0.6769735301007261, "grad_norm": 0.1039167121052742, "learning_rate": 8.517952785058385e-05, "loss": 0.039, "step": 2890 }, { "epoch": 0.6793159990630124, "grad_norm": 0.062352605164051056, "learning_rate": 8.506183921362443e-05, "loss": 0.0401, "step": 2900 }, { "epoch": 0.6816584680252986, "grad_norm": 0.5535932183265686, "learning_rate": 8.494376714600878e-05, "loss": 0.0505, "step": 2910 }, { "epoch": 0.6840009369875849, "grad_norm": 0.37601238489151, "learning_rate": 8.482531293895412e-05, "loss": 0.0391, "step": 2920 }, { "epoch": 0.6863434059498712, "grad_norm": 0.06856988370418549, "learning_rate": 8.470647788785665e-05, "loss": 0.0389, "step": 2930 }, { "epoch": 0.6886858749121574, "grad_norm": 0.5693712830543518, "learning_rate": 8.458726329227747e-05, "loss": 0.0495, "step": 2940 }, { "epoch": 0.6910283438744437, "grad_norm": 0.14418154954910278, "learning_rate": 8.44676704559283e-05, "loss": 0.0405, "step": 2950 }, { "epoch": 0.69337081283673, "grad_norm": 0.11880888044834137, "learning_rate": 8.434770068665723e-05, "loss": 0.0362, "step": 2960 }, { "epoch": 0.6957132817990161, "grad_norm": 0.6350199580192566, "learning_rate": 8.422735529643444e-05, "loss": 0.0607, "step": 2970 }, { "epoch": 0.6980557507613024, "grad_norm": 0.19949962198734283, "learning_rate": 8.410663560133784e-05, "loss": 0.0346, "step": 2980 }, { "epoch": 0.7003982197235886, "grad_norm": 0.19905024766921997, "learning_rate": 8.398554292153866e-05, "loss": 0.0455, "step": 2990 }, { "epoch": 0.7027406886858749, "grad_norm": 0.12724433839321136, "learning_rate": 8.386407858128706e-05, "loss": 0.0312, "step": 3000 }, { "epoch": 0.7050831576481612, "grad_norm": 0.6818522214889526, "learning_rate": 8.37422439088976e-05, "loss": 0.0477, "step": 3010 }, { "epoch": 0.7074256266104474, "grad_norm": 0.14397919178009033, "learning_rate": 8.362004023673474e-05, "loss": 0.054, "step": 3020 }, { "epoch": 0.7097680955727337, "grad_norm": 0.1597958207130432, "learning_rate": 8.349746890119826e-05, "loss": 0.0475, "step": 3030 }, { "epoch": 0.7121105645350199, "grad_norm": 0.2985258102416992, "learning_rate": 8.337453124270863e-05, "loss": 0.0276, "step": 3040 }, { "epoch": 0.7144530334973062, "grad_norm": 0.17043350636959076, "learning_rate": 8.32512286056924e-05, "loss": 0.0337, "step": 3050 }, { "epoch": 0.7167955024595924, "grad_norm": 0.390009343624115, "learning_rate": 8.31275623385675e-05, "loss": 0.0277, "step": 3060 }, { "epoch": 0.7191379714218786, "grad_norm": 0.20475880801677704, "learning_rate": 8.300353379372834e-05, "loss": 0.0691, "step": 3070 }, { "epoch": 0.7214804403841649, "grad_norm": 0.11685507744550705, "learning_rate": 8.287914432753123e-05, "loss": 0.0411, "step": 3080 }, { "epoch": 0.7238229093464511, "grad_norm": 0.531944990158081, "learning_rate": 8.275439530027948e-05, "loss": 0.0511, "step": 3090 }, { "epoch": 0.7261653783087374, "grad_norm": 0.05079588294029236, "learning_rate": 8.262928807620843e-05, "loss": 0.0664, "step": 3100 }, { "epoch": 0.7285078472710237, "grad_norm": 0.3010249435901642, "learning_rate": 8.250382402347065e-05, "loss": 0.0565, "step": 3110 }, { "epoch": 0.7308503162333099, "grad_norm": 0.2115558385848999, "learning_rate": 8.237800451412095e-05, "loss": 0.0615, "step": 3120 }, { "epoch": 0.7331927851955962, "grad_norm": 0.3865530490875244, "learning_rate": 8.225183092410128e-05, "loss": 0.0349, "step": 3130 }, { "epoch": 0.7355352541578825, "grad_norm": 0.07815901935100555, "learning_rate": 8.212530463322583e-05, "loss": 0.036, "step": 3140 }, { "epoch": 0.7378777231201686, "grad_norm": 0.11009709537029266, "learning_rate": 8.199842702516583e-05, "loss": 0.0386, "step": 3150 }, { "epoch": 0.7402201920824549, "grad_norm": 0.12392786890268326, "learning_rate": 8.18711994874345e-05, "loss": 0.0396, "step": 3160 }, { "epoch": 0.7425626610447411, "grad_norm": 0.16354168951511383, "learning_rate": 8.174362341137177e-05, "loss": 0.0446, "step": 3170 }, { "epoch": 0.7449051300070274, "grad_norm": 0.2223191112279892, "learning_rate": 8.161570019212921e-05, "loss": 0.0326, "step": 3180 }, { "epoch": 0.7472475989693137, "grad_norm": 0.176427960395813, "learning_rate": 8.148743122865463e-05, "loss": 0.0235, "step": 3190 }, { "epoch": 0.7495900679315999, "grad_norm": 0.19706971943378448, "learning_rate": 8.135881792367686e-05, "loss": 0.0417, "step": 3200 }, { "epoch": 0.7519325368938862, "grad_norm": 0.08818463236093521, "learning_rate": 8.12298616836904e-05, "loss": 0.0463, "step": 3210 }, { "epoch": 0.7542750058561724, "grad_norm": 0.08389343321323395, "learning_rate": 8.110056391894005e-05, "loss": 0.0259, "step": 3220 }, { "epoch": 0.7566174748184586, "grad_norm": 0.13730217516422272, "learning_rate": 8.097092604340542e-05, "loss": 0.0394, "step": 3230 }, { "epoch": 0.7589599437807449, "grad_norm": 0.48324722051620483, "learning_rate": 8.084094947478556e-05, "loss": 0.0488, "step": 3240 }, { "epoch": 0.7613024127430311, "grad_norm": 0.15898984670639038, "learning_rate": 8.07106356344834e-05, "loss": 0.0402, "step": 3250 }, { "epoch": 0.7636448817053174, "grad_norm": 0.19997884333133698, "learning_rate": 8.057998594759022e-05, "loss": 0.0406, "step": 3260 }, { "epoch": 0.7659873506676037, "grad_norm": 0.06215028837323189, "learning_rate": 8.044900184287007e-05, "loss": 0.0577, "step": 3270 }, { "epoch": 0.7683298196298899, "grad_norm": 0.28326717019081116, "learning_rate": 8.031768475274413e-05, "loss": 0.057, "step": 3280 }, { "epoch": 0.7706722885921762, "grad_norm": 0.29579654335975647, "learning_rate": 8.018603611327504e-05, "loss": 0.0563, "step": 3290 }, { "epoch": 0.7730147575544624, "grad_norm": 0.5313428044319153, "learning_rate": 8.005405736415126e-05, "loss": 0.0748, "step": 3300 }, { "epoch": 0.7753572265167487, "grad_norm": 0.45142146944999695, "learning_rate": 7.992174994867123e-05, "loss": 0.0344, "step": 3310 }, { "epoch": 0.777699695479035, "grad_norm": 0.22848837077617645, "learning_rate": 7.978911531372765e-05, "loss": 0.0367, "step": 3320 }, { "epoch": 0.7800421644413211, "grad_norm": 0.07316577434539795, "learning_rate": 7.965615490979163e-05, "loss": 0.0332, "step": 3330 }, { "epoch": 0.7823846334036074, "grad_norm": 0.08522647619247437, "learning_rate": 7.952287019089685e-05, "loss": 0.0313, "step": 3340 }, { "epoch": 0.7847271023658936, "grad_norm": 0.2560670077800751, "learning_rate": 7.938926261462366e-05, "loss": 0.0753, "step": 3350 }, { "epoch": 0.7870695713281799, "grad_norm": 0.2529207468032837, "learning_rate": 7.925533364208309e-05, "loss": 0.0584, "step": 3360 }, { "epoch": 0.7894120402904662, "grad_norm": 0.20108440518379211, "learning_rate": 7.912108473790092e-05, "loss": 0.0443, "step": 3370 }, { "epoch": 0.7917545092527524, "grad_norm": 0.09312764555215836, "learning_rate": 7.898651737020166e-05, "loss": 0.0529, "step": 3380 }, { "epoch": 0.7940969782150387, "grad_norm": 0.08973310142755508, "learning_rate": 7.88516330105925e-05, "loss": 0.0313, "step": 3390 }, { "epoch": 0.796439447177325, "grad_norm": 0.2917576730251312, "learning_rate": 7.871643313414718e-05, "loss": 0.0699, "step": 3400 }, { "epoch": 0.7987819161396111, "grad_norm": 0.3426614999771118, "learning_rate": 7.858091921938988e-05, "loss": 0.0554, "step": 3410 }, { "epoch": 0.8011243851018974, "grad_norm": 0.10231604427099228, "learning_rate": 7.844509274827907e-05, "loss": 0.0469, "step": 3420 }, { "epoch": 0.8034668540641836, "grad_norm": 0.36295169591903687, "learning_rate": 7.830895520619128e-05, "loss": 0.0489, "step": 3430 }, { "epoch": 0.8058093230264699, "grad_norm": 0.23017369210720062, "learning_rate": 7.817250808190483e-05, "loss": 0.0407, "step": 3440 }, { "epoch": 0.8081517919887562, "grad_norm": 0.2438231259584427, "learning_rate": 7.803575286758364e-05, "loss": 0.0542, "step": 3450 }, { "epoch": 0.8104942609510424, "grad_norm": 0.28502318263053894, "learning_rate": 7.789869105876083e-05, "loss": 0.0433, "step": 3460 }, { "epoch": 0.8128367299133287, "grad_norm": 0.7063993215560913, "learning_rate": 7.776132415432234e-05, "loss": 0.0687, "step": 3470 }, { "epoch": 0.8151791988756149, "grad_norm": 0.3574845492839813, "learning_rate": 7.762365365649067e-05, "loss": 0.0283, "step": 3480 }, { "epoch": 0.8175216678379011, "grad_norm": 0.1527651846408844, "learning_rate": 7.748568107080832e-05, "loss": 0.0502, "step": 3490 }, { "epoch": 0.8198641368001874, "grad_norm": 0.20111270248889923, "learning_rate": 7.734740790612136e-05, "loss": 0.0526, "step": 3500 }, { "epoch": 0.8222066057624736, "grad_norm": 0.5221764445304871, "learning_rate": 7.720883567456298e-05, "loss": 0.0385, "step": 3510 }, { "epoch": 0.8245490747247599, "grad_norm": 0.11450177431106567, "learning_rate": 7.70699658915369e-05, "loss": 0.0495, "step": 3520 }, { "epoch": 0.8268915436870461, "grad_norm": 0.2669161558151245, "learning_rate": 7.693080007570084e-05, "loss": 0.0419, "step": 3530 }, { "epoch": 0.8292340126493324, "grad_norm": 0.4859974682331085, "learning_rate": 7.679133974894983e-05, "loss": 0.0454, "step": 3540 }, { "epoch": 0.8315764816116187, "grad_norm": 0.13351887464523315, "learning_rate": 7.66515864363997e-05, "loss": 0.0401, "step": 3550 }, { "epoch": 0.8339189505739049, "grad_norm": 0.3376217484474182, "learning_rate": 7.651154166637025e-05, "loss": 0.0372, "step": 3560 }, { "epoch": 0.8362614195361912, "grad_norm": 0.4906126856803894, "learning_rate": 7.637120697036866e-05, "loss": 0.0444, "step": 3570 }, { "epoch": 0.8386038884984774, "grad_norm": 0.1525869518518448, "learning_rate": 7.623058388307269e-05, "loss": 0.0411, "step": 3580 }, { "epoch": 0.8409463574607636, "grad_norm": 0.10655678063631058, "learning_rate": 7.608967394231387e-05, "loss": 0.0322, "step": 3590 }, { "epoch": 0.8432888264230499, "grad_norm": 0.6658011674880981, "learning_rate": 7.594847868906076e-05, "loss": 0.0736, "step": 3600 }, { "epoch": 0.8456312953853361, "grad_norm": 0.2985578775405884, "learning_rate": 7.580699966740201e-05, "loss": 0.0296, "step": 3610 }, { "epoch": 0.8479737643476224, "grad_norm": 0.08989045768976212, "learning_rate": 7.566523842452958e-05, "loss": 0.0412, "step": 3620 }, { "epoch": 0.8503162333099087, "grad_norm": 0.37455546855926514, "learning_rate": 7.552319651072164e-05, "loss": 0.0473, "step": 3630 }, { "epoch": 0.8526587022721949, "grad_norm": 0.19339019060134888, "learning_rate": 7.538087547932585e-05, "loss": 0.0475, "step": 3640 }, { "epoch": 0.8550011712344812, "grad_norm": 0.22095589339733124, "learning_rate": 7.52382768867422e-05, "loss": 0.0287, "step": 3650 }, { "epoch": 0.8573436401967673, "grad_norm": 0.39905375242233276, "learning_rate": 7.509540229240601e-05, "loss": 0.0418, "step": 3660 }, { "epoch": 0.8596861091590536, "grad_norm": 0.1556907296180725, "learning_rate": 7.495225325877103e-05, "loss": 0.0462, "step": 3670 }, { "epoch": 0.8620285781213399, "grad_norm": 0.43170592188835144, "learning_rate": 7.480883135129211e-05, "loss": 0.0453, "step": 3680 }, { "epoch": 0.8643710470836261, "grad_norm": 0.09220433235168457, "learning_rate": 7.466513813840825e-05, "loss": 0.0414, "step": 3690 }, { "epoch": 0.8667135160459124, "grad_norm": 0.09303878992795944, "learning_rate": 7.452117519152542e-05, "loss": 0.0412, "step": 3700 }, { "epoch": 0.8690559850081987, "grad_norm": 0.456315279006958, "learning_rate": 7.437694408499933e-05, "loss": 0.0429, "step": 3710 }, { "epoch": 0.8713984539704849, "grad_norm": 0.0672278180718422, "learning_rate": 7.423244639611826e-05, "loss": 0.0492, "step": 3720 }, { "epoch": 0.8737409229327712, "grad_norm": 0.11052095890045166, "learning_rate": 7.408768370508576e-05, "loss": 0.0404, "step": 3730 }, { "epoch": 0.8760833918950574, "grad_norm": 0.20042133331298828, "learning_rate": 7.394265759500348e-05, "loss": 0.0597, "step": 3740 }, { "epoch": 0.8784258608573436, "grad_norm": 0.3536411225795746, "learning_rate": 7.379736965185368e-05, "loss": 0.0212, "step": 3750 }, { "epoch": 0.8807683298196299, "grad_norm": 0.28125354647636414, "learning_rate": 7.365182146448205e-05, "loss": 0.052, "step": 3760 }, { "epoch": 0.8831107987819161, "grad_norm": 0.12258744984865189, "learning_rate": 7.350601462458024e-05, "loss": 0.02, "step": 3770 }, { "epoch": 0.8854532677442024, "grad_norm": 0.5056569576263428, "learning_rate": 7.335995072666848e-05, "loss": 0.035, "step": 3780 }, { "epoch": 0.8877957367064886, "grad_norm": 0.2552855610847473, "learning_rate": 7.32136313680782e-05, "loss": 0.0421, "step": 3790 }, { "epoch": 0.8901382056687749, "grad_norm": 0.05761013180017471, "learning_rate": 7.30670581489344e-05, "loss": 0.0414, "step": 3800 }, { "epoch": 0.8924806746310612, "grad_norm": 0.9745859503746033, "learning_rate": 7.292023267213835e-05, "loss": 0.0725, "step": 3810 }, { "epoch": 0.8948231435933474, "grad_norm": 0.2608197033405304, "learning_rate": 7.277315654334997e-05, "loss": 0.0405, "step": 3820 }, { "epoch": 0.8971656125556337, "grad_norm": 0.3153429329395294, "learning_rate": 7.262583137097018e-05, "loss": 0.0407, "step": 3830 }, { "epoch": 0.8995080815179198, "grad_norm": 0.5415343642234802, "learning_rate": 7.247825876612353e-05, "loss": 0.0389, "step": 3840 }, { "epoch": 0.9018505504802061, "grad_norm": 0.4772924482822418, "learning_rate": 7.233044034264034e-05, "loss": 0.055, "step": 3850 }, { "epoch": 0.9041930194424924, "grad_norm": 0.41308316588401794, "learning_rate": 7.218237771703921e-05, "loss": 0.0578, "step": 3860 }, { "epoch": 0.9065354884047786, "grad_norm": 0.0859963595867157, "learning_rate": 7.203407250850928e-05, "loss": 0.0328, "step": 3870 }, { "epoch": 0.9088779573670649, "grad_norm": 0.4168371856212616, "learning_rate": 7.188552633889259e-05, "loss": 0.0493, "step": 3880 }, { "epoch": 0.9112204263293512, "grad_norm": 0.42193326354026794, "learning_rate": 7.173674083266624e-05, "loss": 0.052, "step": 3890 }, { "epoch": 0.9135628952916374, "grad_norm": 0.11540161073207855, "learning_rate": 7.158771761692464e-05, "loss": 0.0616, "step": 3900 }, { "epoch": 0.9159053642539237, "grad_norm": 0.1789163500070572, "learning_rate": 7.143845832136188e-05, "loss": 0.0315, "step": 3910 }, { "epoch": 0.9182478332162098, "grad_norm": 0.2873396873474121, "learning_rate": 7.128896457825364e-05, "loss": 0.0577, "step": 3920 }, { "epoch": 0.9205903021784961, "grad_norm": 0.035885997116565704, "learning_rate": 7.113923802243957e-05, "loss": 0.0462, "step": 3930 }, { "epoch": 0.9229327711407824, "grad_norm": 0.380929172039032, "learning_rate": 7.09892802913053e-05, "loss": 0.0285, "step": 3940 }, { "epoch": 0.9252752401030686, "grad_norm": 0.21406327188014984, "learning_rate": 7.083909302476453e-05, "loss": 0.0255, "step": 3950 }, { "epoch": 0.9276177090653549, "grad_norm": 0.04998482018709183, "learning_rate": 7.068867786524116e-05, "loss": 0.0285, "step": 3960 }, { "epoch": 0.9299601780276411, "grad_norm": 0.19604696333408356, "learning_rate": 7.053803645765128e-05, "loss": 0.0345, "step": 3970 }, { "epoch": 0.9323026469899274, "grad_norm": 0.6424615979194641, "learning_rate": 7.038717044938519e-05, "loss": 0.0411, "step": 3980 }, { "epoch": 0.9346451159522137, "grad_norm": 0.0754154697060585, "learning_rate": 7.023608149028937e-05, "loss": 0.0243, "step": 3990 }, { "epoch": 0.9369875849144998, "grad_norm": 0.26757097244262695, "learning_rate": 7.008477123264848e-05, "loss": 0.0414, "step": 4000 }, { "epoch": 0.9393300538767861, "grad_norm": 0.14239585399627686, "learning_rate": 6.993324133116726e-05, "loss": 0.0259, "step": 4010 }, { "epoch": 0.9416725228390724, "grad_norm": 0.12988215684890747, "learning_rate": 6.978149344295242e-05, "loss": 0.0279, "step": 4020 }, { "epoch": 0.9440149918013586, "grad_norm": 0.3678188920021057, "learning_rate": 6.962952922749457e-05, "loss": 0.0353, "step": 4030 }, { "epoch": 0.9463574607636449, "grad_norm": 0.6559092402458191, "learning_rate": 6.947735034665002e-05, "loss": 0.0558, "step": 4040 }, { "epoch": 0.9486999297259311, "grad_norm": 0.607363760471344, "learning_rate": 6.932495846462261e-05, "loss": 0.0459, "step": 4050 }, { "epoch": 0.9510423986882174, "grad_norm": 0.22406215965747833, "learning_rate": 6.917235524794558e-05, "loss": 0.0412, "step": 4060 }, { "epoch": 0.9533848676505037, "grad_norm": 0.2519318461418152, "learning_rate": 6.901954236546323e-05, "loss": 0.0355, "step": 4070 }, { "epoch": 0.9557273366127899, "grad_norm": 0.40484338998794556, "learning_rate": 6.886652148831279e-05, "loss": 0.0446, "step": 4080 }, { "epoch": 0.9580698055750761, "grad_norm": 0.36861318349838257, "learning_rate": 6.871329428990602e-05, "loss": 0.0324, "step": 4090 }, { "epoch": 0.9604122745373623, "grad_norm": 0.15483994781970978, "learning_rate": 6.855986244591104e-05, "loss": 0.0265, "step": 4100 }, { "epoch": 0.9627547434996486, "grad_norm": 0.12822240591049194, "learning_rate": 6.840622763423391e-05, "loss": 0.0251, "step": 4110 }, { "epoch": 0.9650972124619349, "grad_norm": 0.2436823546886444, "learning_rate": 6.825239153500029e-05, "loss": 0.0354, "step": 4120 }, { "epoch": 0.9674396814242211, "grad_norm": 0.11992768943309784, "learning_rate": 6.809835583053715e-05, "loss": 0.0355, "step": 4130 }, { "epoch": 0.9697821503865074, "grad_norm": 0.05282627418637276, "learning_rate": 6.794412220535426e-05, "loss": 0.0325, "step": 4140 }, { "epoch": 0.9721246193487937, "grad_norm": 0.1702210009098053, "learning_rate": 6.778969234612584e-05, "loss": 0.0421, "step": 4150 }, { "epoch": 0.9744670883110799, "grad_norm": 0.30918455123901367, "learning_rate": 6.763506794167208e-05, "loss": 0.0306, "step": 4160 }, { "epoch": 0.9768095572733662, "grad_norm": 0.18471957743167877, "learning_rate": 6.748025068294067e-05, "loss": 0.026, "step": 4170 }, { "epoch": 0.9791520262356523, "grad_norm": 0.2867111265659332, "learning_rate": 6.732524226298841e-05, "loss": 0.0368, "step": 4180 }, { "epoch": 0.9814944951979386, "grad_norm": 0.5615723729133606, "learning_rate": 6.71700443769625e-05, "loss": 0.0374, "step": 4190 }, { "epoch": 0.9838369641602249, "grad_norm": 0.06628378480672836, "learning_rate": 6.701465872208216e-05, "loss": 0.0432, "step": 4200 }, { "epoch": 0.9861794331225111, "grad_norm": 0.24212607741355896, "learning_rate": 6.685908699762002e-05, "loss": 0.0446, "step": 4210 }, { "epoch": 0.9885219020847974, "grad_norm": 0.1411833018064499, "learning_rate": 6.670333090488356e-05, "loss": 0.0281, "step": 4220 }, { "epoch": 0.9908643710470836, "grad_norm": 0.4957182705402374, "learning_rate": 6.654739214719641e-05, "loss": 0.0385, "step": 4230 }, { "epoch": 0.9932068400093699, "grad_norm": 0.2773032486438751, "learning_rate": 6.639127242987988e-05, "loss": 0.0351, "step": 4240 }, { "epoch": 0.9955493089716562, "grad_norm": 0.6347845196723938, "learning_rate": 6.623497346023418e-05, "loss": 0.0519, "step": 4250 }, { "epoch": 0.9978917779339423, "grad_norm": 0.39392927289009094, "learning_rate": 6.607849694751977e-05, "loss": 0.0415, "step": 4260 }, { "epoch": 1.0002342468962286, "grad_norm": 0.12185105681419373, "learning_rate": 6.592184460293877e-05, "loss": 0.0413, "step": 4270 }, { "epoch": 1.0025767158585148, "grad_norm": 0.4016129970550537, "learning_rate": 6.576501813961609e-05, "loss": 0.0473, "step": 4280 }, { "epoch": 1.0049191848208012, "grad_norm": 0.10202305018901825, "learning_rate": 6.56080192725808e-05, "loss": 0.0476, "step": 4290 }, { "epoch": 1.0072616537830874, "grad_norm": 0.08643211424350739, "learning_rate": 6.545084971874738e-05, "loss": 0.0363, "step": 4300 }, { "epoch": 1.0096041227453736, "grad_norm": 0.4279628396034241, "learning_rate": 6.529351119689688e-05, "loss": 0.0343, "step": 4310 }, { "epoch": 1.0119465917076598, "grad_norm": 0.0435931533575058, "learning_rate": 6.513600542765817e-05, "loss": 0.0363, "step": 4320 }, { "epoch": 1.0142890606699462, "grad_norm": 0.11314094811677933, "learning_rate": 6.497833413348909e-05, "loss": 0.0409, "step": 4330 }, { "epoch": 1.0166315296322324, "grad_norm": 0.049418941140174866, "learning_rate": 6.48204990386577e-05, "loss": 0.027, "step": 4340 }, { "epoch": 1.0189739985945185, "grad_norm": 0.0937579795718193, "learning_rate": 6.466250186922325e-05, "loss": 0.0386, "step": 4350 }, { "epoch": 1.021316467556805, "grad_norm": 0.17256158590316772, "learning_rate": 6.450434435301751e-05, "loss": 0.0283, "step": 4360 }, { "epoch": 1.0236589365190911, "grad_norm": 0.41623151302337646, "learning_rate": 6.43460282196257e-05, "loss": 0.0309, "step": 4370 }, { "epoch": 1.0260014054813773, "grad_norm": 0.25574249029159546, "learning_rate": 6.418755520036775e-05, "loss": 0.017, "step": 4380 }, { "epoch": 1.0283438744436637, "grad_norm": 0.12465788424015045, "learning_rate": 6.402892702827916e-05, "loss": 0.028, "step": 4390 }, { "epoch": 1.03068634340595, "grad_norm": 0.2367735058069229, "learning_rate": 6.387014543809223e-05, "loss": 0.0288, "step": 4400 }, { "epoch": 1.033028812368236, "grad_norm": 0.15218676626682281, "learning_rate": 6.371121216621698e-05, "loss": 0.0414, "step": 4410 }, { "epoch": 1.0353712813305225, "grad_norm": 0.09345823526382446, "learning_rate": 6.355212895072223e-05, "loss": 0.0348, "step": 4420 }, { "epoch": 1.0377137502928087, "grad_norm": 0.25038620829582214, "learning_rate": 6.339289753131649e-05, "loss": 0.0472, "step": 4430 }, { "epoch": 1.0400562192550948, "grad_norm": 0.5955792665481567, "learning_rate": 6.323351964932908e-05, "loss": 0.0612, "step": 4440 }, { "epoch": 1.042398688217381, "grad_norm": 0.10471931844949722, "learning_rate": 6.307399704769099e-05, "loss": 0.0319, "step": 4450 }, { "epoch": 1.0447411571796674, "grad_norm": 0.3728072941303253, "learning_rate": 6.291433147091583e-05, "loss": 0.0346, "step": 4460 }, { "epoch": 1.0470836261419536, "grad_norm": 0.13940206170082092, "learning_rate": 6.275452466508077e-05, "loss": 0.0315, "step": 4470 }, { "epoch": 1.0494260951042398, "grad_norm": 0.24892286956310272, "learning_rate": 6.259457837780742e-05, "loss": 0.0271, "step": 4480 }, { "epoch": 1.0517685640665262, "grad_norm": 0.09227164089679718, "learning_rate": 6.243449435824276e-05, "loss": 0.035, "step": 4490 }, { "epoch": 1.0541110330288124, "grad_norm": 0.4062785804271698, "learning_rate": 6.227427435703997e-05, "loss": 0.0381, "step": 4500 }, { "epoch": 1.0564535019910986, "grad_norm": 0.10490421950817108, "learning_rate": 6.211392012633932e-05, "loss": 0.0424, "step": 4510 }, { "epoch": 1.058795970953385, "grad_norm": 0.08822830021381378, "learning_rate": 6.195343341974899e-05, "loss": 0.0484, "step": 4520 }, { "epoch": 1.0611384399156711, "grad_norm": 0.22914232313632965, "learning_rate": 6.179281599232591e-05, "loss": 0.0388, "step": 4530 }, { "epoch": 1.0634809088779573, "grad_norm": 0.6712221503257751, "learning_rate": 6.163206960055651e-05, "loss": 0.0853, "step": 4540 }, { "epoch": 1.0658233778402435, "grad_norm": 0.2438327521085739, "learning_rate": 6.147119600233758e-05, "loss": 0.0177, "step": 4550 }, { "epoch": 1.06816584680253, "grad_norm": 0.45352616906166077, "learning_rate": 6.131019695695702e-05, "loss": 0.0798, "step": 4560 }, { "epoch": 1.070508315764816, "grad_norm": 0.17237244546413422, "learning_rate": 6.11490742250746e-05, "loss": 0.037, "step": 4570 }, { "epoch": 1.0728507847271023, "grad_norm": 0.7011030316352844, "learning_rate": 6.0987829568702656e-05, "loss": 0.0549, "step": 4580 }, { "epoch": 1.0751932536893887, "grad_norm": 0.14807315170764923, "learning_rate": 6.0826464751186994e-05, "loss": 0.0483, "step": 4590 }, { "epoch": 1.0775357226516749, "grad_norm": 0.42932969331741333, "learning_rate": 6.066498153718735e-05, "loss": 0.0388, "step": 4600 }, { "epoch": 1.079878191613961, "grad_norm": 0.13377119600772858, "learning_rate": 6.05033816926583e-05, "loss": 0.0464, "step": 4610 }, { "epoch": 1.0822206605762474, "grad_norm": 0.13043726980686188, "learning_rate": 6.034166698482984e-05, "loss": 0.0234, "step": 4620 }, { "epoch": 1.0845631295385336, "grad_norm": 0.23946554958820343, "learning_rate": 6.017983918218812e-05, "loss": 0.0415, "step": 4630 }, { "epoch": 1.0869055985008198, "grad_norm": 0.11139467358589172, "learning_rate": 6.001790005445607e-05, "loss": 0.0397, "step": 4640 }, { "epoch": 1.0892480674631062, "grad_norm": 0.1447746455669403, "learning_rate": 5.985585137257401e-05, "loss": 0.0293, "step": 4650 }, { "epoch": 1.0915905364253924, "grad_norm": 0.45925086736679077, "learning_rate": 5.969369490868042e-05, "loss": 0.03, "step": 4660 }, { "epoch": 1.0939330053876786, "grad_norm": 0.2177567183971405, "learning_rate": 5.953143243609235e-05, "loss": 0.042, "step": 4670 }, { "epoch": 1.096275474349965, "grad_norm": 0.20075875520706177, "learning_rate": 5.9369065729286245e-05, "loss": 0.0384, "step": 4680 }, { "epoch": 1.0986179433122512, "grad_norm": 0.16894571483135223, "learning_rate": 5.9206596563878357e-05, "loss": 0.0308, "step": 4690 }, { "epoch": 1.1009604122745373, "grad_norm": 0.09761305898427963, "learning_rate": 5.90440267166055e-05, "loss": 0.0244, "step": 4700 }, { "epoch": 1.1033028812368235, "grad_norm": 0.04163440316915512, "learning_rate": 5.888135796530544e-05, "loss": 0.0191, "step": 4710 }, { "epoch": 1.10564535019911, "grad_norm": 0.27570199966430664, "learning_rate": 5.871859208889759e-05, "loss": 0.0222, "step": 4720 }, { "epoch": 1.107987819161396, "grad_norm": 0.2948501706123352, "learning_rate": 5.85557308673635e-05, "loss": 0.0442, "step": 4730 }, { "epoch": 1.1103302881236823, "grad_norm": 0.26524093747138977, "learning_rate": 5.8392776081727385e-05, "loss": 0.0347, "step": 4740 }, { "epoch": 1.1126727570859687, "grad_norm": 0.26801493763923645, "learning_rate": 5.8229729514036705e-05, "loss": 0.0299, "step": 4750 }, { "epoch": 1.1150152260482549, "grad_norm": 0.0498003289103508, "learning_rate": 5.8066592947342555e-05, "loss": 0.0289, "step": 4760 }, { "epoch": 1.117357695010541, "grad_norm": 0.2827109694480896, "learning_rate": 5.7903368165680327e-05, "loss": 0.0328, "step": 4770 }, { "epoch": 1.1197001639728275, "grad_norm": 0.18607333302497864, "learning_rate": 5.7740056954050084e-05, "loss": 0.0277, "step": 4780 }, { "epoch": 1.1220426329351136, "grad_norm": 0.10899386554956436, "learning_rate": 5.757666109839702e-05, "loss": 0.0397, "step": 4790 }, { "epoch": 1.1243851018973998, "grad_norm": 0.9352733492851257, "learning_rate": 5.74131823855921e-05, "loss": 0.0801, "step": 4800 }, { "epoch": 1.126727570859686, "grad_norm": 0.15164723992347717, "learning_rate": 5.72496226034123e-05, "loss": 0.0572, "step": 4810 }, { "epoch": 1.1290700398219724, "grad_norm": 0.06457802653312683, "learning_rate": 5.7085983540521216e-05, "loss": 0.041, "step": 4820 }, { "epoch": 1.1314125087842586, "grad_norm": 0.13067546486854553, "learning_rate": 5.692226698644938e-05, "loss": 0.0345, "step": 4830 }, { "epoch": 1.1337549777465448, "grad_norm": 0.4330101013183594, "learning_rate": 5.675847473157485e-05, "loss": 0.0436, "step": 4840 }, { "epoch": 1.1360974467088312, "grad_norm": 0.41848742961883545, "learning_rate": 5.6594608567103456e-05, "loss": 0.0216, "step": 4850 }, { "epoch": 1.1384399156711174, "grad_norm": 0.13505397737026215, "learning_rate": 5.6430670285049314e-05, "loss": 0.0305, "step": 4860 }, { "epoch": 1.1407823846334035, "grad_norm": 0.4569176435470581, "learning_rate": 5.6266661678215216e-05, "loss": 0.0324, "step": 4870 }, { "epoch": 1.14312485359569, "grad_norm": 0.4705914556980133, "learning_rate": 5.6102584540173006e-05, "loss": 0.0478, "step": 4880 }, { "epoch": 1.1454673225579761, "grad_norm": 0.276143342256546, "learning_rate": 5.5938440665244006e-05, "loss": 0.0578, "step": 4890 }, { "epoch": 1.1478097915202623, "grad_norm": 0.3393331468105316, "learning_rate": 5.577423184847932e-05, "loss": 0.0507, "step": 4900 }, { "epoch": 1.1501522604825487, "grad_norm": 0.18119889497756958, "learning_rate": 5.560995988564023e-05, "loss": 0.0197, "step": 4910 }, { "epoch": 1.1524947294448349, "grad_norm": 0.0739196389913559, "learning_rate": 5.544562657317863e-05, "loss": 0.0297, "step": 4920 }, { "epoch": 1.154837198407121, "grad_norm": 0.22677703201770782, "learning_rate": 5.52812337082173e-05, "loss": 0.0407, "step": 4930 }, { "epoch": 1.1571796673694075, "grad_norm": 0.054532766342163086, "learning_rate": 5.511678308853026e-05, "loss": 0.0448, "step": 4940 }, { "epoch": 1.1595221363316937, "grad_norm": 0.45871463418006897, "learning_rate": 5.495227651252315e-05, "loss": 0.0316, "step": 4950 }, { "epoch": 1.1618646052939798, "grad_norm": 0.09669110924005508, "learning_rate": 5.478771577921351e-05, "loss": 0.0404, "step": 4960 }, { "epoch": 1.164207074256266, "grad_norm": 0.1810620278120041, "learning_rate": 5.462310268821118e-05, "loss": 0.0233, "step": 4970 }, { "epoch": 1.1665495432185524, "grad_norm": 0.10690245032310486, "learning_rate": 5.445843903969854e-05, "loss": 0.033, "step": 4980 }, { "epoch": 1.1688920121808386, "grad_norm": 0.3685993552207947, "learning_rate": 5.4293726634410855e-05, "loss": 0.0204, "step": 4990 }, { "epoch": 1.1712344811431248, "grad_norm": 0.17481215298175812, "learning_rate": 5.4128967273616625e-05, "loss": 0.0269, "step": 5000 }, { "epoch": 1.1735769501054112, "grad_norm": 0.6450178027153015, "learning_rate": 5.396416275909779e-05, "loss": 0.052, "step": 5010 }, { "epoch": 1.1759194190676974, "grad_norm": 0.0964297205209732, "learning_rate": 5.379931489313016e-05, "loss": 0.0299, "step": 5020 }, { "epoch": 1.1782618880299836, "grad_norm": 0.06013895943760872, "learning_rate": 5.363442547846356e-05, "loss": 0.0334, "step": 5030 }, { "epoch": 1.1806043569922697, "grad_norm": 0.032787106931209564, "learning_rate": 5.3469496318302204e-05, "loss": 0.0506, "step": 5040 }, { "epoch": 1.1829468259545561, "grad_norm": 0.3833360970020294, "learning_rate": 5.330452921628497e-05, "loss": 0.0331, "step": 5050 }, { "epoch": 1.1852892949168423, "grad_norm": 0.08078952878713608, "learning_rate": 5.313952597646568e-05, "loss": 0.0171, "step": 5060 }, { "epoch": 1.1876317638791285, "grad_norm": 0.09187212586402893, "learning_rate": 5.297448840329329e-05, "loss": 0.0195, "step": 5070 }, { "epoch": 1.189974232841415, "grad_norm": 0.2530211806297302, "learning_rate": 5.280941830159227e-05, "loss": 0.0219, "step": 5080 }, { "epoch": 1.192316701803701, "grad_norm": 0.059026945382356644, "learning_rate": 5.264431747654284e-05, "loss": 0.0362, "step": 5090 }, { "epoch": 1.1946591707659873, "grad_norm": 0.04210277274250984, "learning_rate": 5.247918773366112e-05, "loss": 0.0314, "step": 5100 }, { "epoch": 1.1970016397282737, "grad_norm": 0.4919138550758362, "learning_rate": 5.231403087877955e-05, "loss": 0.0335, "step": 5110 }, { "epoch": 1.1993441086905599, "grad_norm": 0.06546583771705627, "learning_rate": 5.214884871802703e-05, "loss": 0.0223, "step": 5120 }, { "epoch": 1.201686577652846, "grad_norm": 0.08152215927839279, "learning_rate": 5.198364305780922e-05, "loss": 0.0316, "step": 5130 }, { "epoch": 1.2040290466151324, "grad_norm": 0.2411283552646637, "learning_rate": 5.1818415704788725e-05, "loss": 0.0669, "step": 5140 }, { "epoch": 1.2063715155774186, "grad_norm": 0.49666517972946167, "learning_rate": 5.165316846586541e-05, "loss": 0.041, "step": 5150 }, { "epoch": 1.2087139845397048, "grad_norm": 0.08363020420074463, "learning_rate": 5.148790314815663e-05, "loss": 0.0209, "step": 5160 }, { "epoch": 1.2110564535019912, "grad_norm": 0.04317115619778633, "learning_rate": 5.132262155897739e-05, "loss": 0.0367, "step": 5170 }, { "epoch": 1.2133989224642774, "grad_norm": 0.1066800057888031, "learning_rate": 5.1157325505820694e-05, "loss": 0.0399, "step": 5180 }, { "epoch": 1.2157413914265636, "grad_norm": 0.17649437487125397, "learning_rate": 5.0992016796337686e-05, "loss": 0.0236, "step": 5190 }, { "epoch": 1.21808386038885, "grad_norm": 0.14966139197349548, "learning_rate": 5.0826697238317935e-05, "loss": 0.0195, "step": 5200 }, { "epoch": 1.2204263293511362, "grad_norm": 0.03593892604112625, "learning_rate": 5.066136863966963e-05, "loss": 0.0202, "step": 5210 }, { "epoch": 1.2227687983134223, "grad_norm": 0.46276217699050903, "learning_rate": 5.0496032808399815e-05, "loss": 0.0464, "step": 5220 }, { "epoch": 1.2251112672757085, "grad_norm": 0.21946477890014648, "learning_rate": 5.033069155259471e-05, "loss": 0.0301, "step": 5230 }, { "epoch": 1.227453736237995, "grad_norm": 0.08784784376621246, "learning_rate": 5.016534668039976e-05, "loss": 0.0316, "step": 5240 }, { "epoch": 1.229796205200281, "grad_norm": 0.1410629153251648, "learning_rate": 5e-05, "loss": 0.0263, "step": 5250 }, { "epoch": 1.2321386741625673, "grad_norm": 0.07868409156799316, "learning_rate": 4.9834653319600246e-05, "loss": 0.0213, "step": 5260 }, { "epoch": 1.2344811431248537, "grad_norm": 0.215213343501091, "learning_rate": 4.96693084474053e-05, "loss": 0.0457, "step": 5270 }, { "epoch": 1.2368236120871399, "grad_norm": 0.16864515841007233, "learning_rate": 4.950396719160018e-05, "loss": 0.0336, "step": 5280 }, { "epoch": 1.239166081049426, "grad_norm": 0.0474487841129303, "learning_rate": 4.93386313603304e-05, "loss": 0.0227, "step": 5290 }, { "epoch": 1.2415085500117122, "grad_norm": 0.5898747444152832, "learning_rate": 4.917330276168208e-05, "loss": 0.0165, "step": 5300 }, { "epoch": 1.2438510189739986, "grad_norm": 0.4065062403678894, "learning_rate": 4.9007983203662326e-05, "loss": 0.0271, "step": 5310 }, { "epoch": 1.2461934879362848, "grad_norm": 0.19243858754634857, "learning_rate": 4.884267449417931e-05, "loss": 0.0222, "step": 5320 }, { "epoch": 1.248535956898571, "grad_norm": 0.14905819296836853, "learning_rate": 4.867737844102261e-05, "loss": 0.0183, "step": 5330 }, { "epoch": 1.2508784258608574, "grad_norm": 0.28917795419692993, "learning_rate": 4.851209685184338e-05, "loss": 0.0194, "step": 5340 }, { "epoch": 1.2532208948231436, "grad_norm": 0.3423207104206085, "learning_rate": 4.834683153413459e-05, "loss": 0.0281, "step": 5350 }, { "epoch": 1.2555633637854298, "grad_norm": 0.04684186726808548, "learning_rate": 4.818158429521129e-05, "loss": 0.0266, "step": 5360 }, { "epoch": 1.2579058327477162, "grad_norm": 0.27714163064956665, "learning_rate": 4.801635694219079e-05, "loss": 0.0468, "step": 5370 }, { "epoch": 1.2602483017100023, "grad_norm": 0.1844978630542755, "learning_rate": 4.785115128197298e-05, "loss": 0.0392, "step": 5380 }, { "epoch": 1.2625907706722885, "grad_norm": 0.36138930916786194, "learning_rate": 4.7685969121220456e-05, "loss": 0.029, "step": 5390 }, { "epoch": 1.264933239634575, "grad_norm": 0.3211914896965027, "learning_rate": 4.7520812266338885e-05, "loss": 0.0611, "step": 5400 }, { "epoch": 1.2672757085968611, "grad_norm": 0.5163668990135193, "learning_rate": 4.735568252345718e-05, "loss": 0.0481, "step": 5410 }, { "epoch": 1.2696181775591473, "grad_norm": 0.5117266178131104, "learning_rate": 4.7190581698407725e-05, "loss": 0.0326, "step": 5420 }, { "epoch": 1.2719606465214337, "grad_norm": 0.24475805461406708, "learning_rate": 4.702551159670672e-05, "loss": 0.0229, "step": 5430 }, { "epoch": 1.2743031154837199, "grad_norm": 0.07154544442892075, "learning_rate": 4.6860474023534335e-05, "loss": 0.042, "step": 5440 }, { "epoch": 1.276645584446006, "grad_norm": 0.28115877509117126, "learning_rate": 4.669547078371504e-05, "loss": 0.0249, "step": 5450 }, { "epoch": 1.2789880534082925, "grad_norm": 0.22904540598392487, "learning_rate": 4.65305036816978e-05, "loss": 0.0339, "step": 5460 }, { "epoch": 1.2813305223705787, "grad_norm": 0.11327308416366577, "learning_rate": 4.6365574521536445e-05, "loss": 0.0175, "step": 5470 }, { "epoch": 1.2836729913328648, "grad_norm": 0.1697210669517517, "learning_rate": 4.620068510686985e-05, "loss": 0.0362, "step": 5480 }, { "epoch": 1.286015460295151, "grad_norm": 0.08553613722324371, "learning_rate": 4.60358372409022e-05, "loss": 0.0159, "step": 5490 }, { "epoch": 1.2883579292574374, "grad_norm": 0.07890176773071289, "learning_rate": 4.5871032726383386e-05, "loss": 0.0288, "step": 5500 }, { "epoch": 1.2907003982197236, "grad_norm": 0.33075398206710815, "learning_rate": 4.570627336558915e-05, "loss": 0.0262, "step": 5510 }, { "epoch": 1.2930428671820098, "grad_norm": 0.09929897636175156, "learning_rate": 4.554156096030149e-05, "loss": 0.0231, "step": 5520 }, { "epoch": 1.295385336144296, "grad_norm": 0.1128670945763588, "learning_rate": 4.537689731178883e-05, "loss": 0.0201, "step": 5530 }, { "epoch": 1.2977278051065824, "grad_norm": 0.05418454855680466, "learning_rate": 4.5212284220786494e-05, "loss": 0.0404, "step": 5540 }, { "epoch": 1.3000702740688685, "grad_norm": 0.1747845560312271, "learning_rate": 4.504772348747687e-05, "loss": 0.0324, "step": 5550 }, { "epoch": 1.3024127430311547, "grad_norm": 0.6264855265617371, "learning_rate": 4.488321691146975e-05, "loss": 0.0607, "step": 5560 }, { "epoch": 1.3047552119934411, "grad_norm": 0.20012634992599487, "learning_rate": 4.471876629178273e-05, "loss": 0.0336, "step": 5570 }, { "epoch": 1.3070976809557273, "grad_norm": 0.07151951640844345, "learning_rate": 4.4554373426821374e-05, "loss": 0.0199, "step": 5580 }, { "epoch": 1.3094401499180135, "grad_norm": 0.09090318530797958, "learning_rate": 4.439004011435979e-05, "loss": 0.0263, "step": 5590 }, { "epoch": 1.3117826188803, "grad_norm": 0.09504502266645432, "learning_rate": 4.4225768151520694e-05, "loss": 0.038, "step": 5600 }, { "epoch": 1.314125087842586, "grad_norm": 0.19809271395206451, "learning_rate": 4.406155933475599e-05, "loss": 0.0376, "step": 5610 }, { "epoch": 1.3164675568048723, "grad_norm": 0.2558313012123108, "learning_rate": 4.3897415459827e-05, "loss": 0.043, "step": 5620 }, { "epoch": 1.3188100257671587, "grad_norm": 0.08637325465679169, "learning_rate": 4.373333832178478e-05, "loss": 0.0341, "step": 5630 }, { "epoch": 1.3211524947294448, "grad_norm": 0.06880134344100952, "learning_rate": 4.3569329714950704e-05, "loss": 0.0229, "step": 5640 }, { "epoch": 1.323494963691731, "grad_norm": 0.16358880698680878, "learning_rate": 4.3405391432896555e-05, "loss": 0.0387, "step": 5650 }, { "epoch": 1.3258374326540174, "grad_norm": 0.05642487108707428, "learning_rate": 4.324152526842517e-05, "loss": 0.0291, "step": 5660 }, { "epoch": 1.3281799016163036, "grad_norm": 0.13398276269435883, "learning_rate": 4.307773301355062e-05, "loss": 0.0449, "step": 5670 }, { "epoch": 1.3305223705785898, "grad_norm": 0.41730400919914246, "learning_rate": 4.291401645947879e-05, "loss": 0.0336, "step": 5680 }, { "epoch": 1.3328648395408762, "grad_norm": 0.1082252785563469, "learning_rate": 4.275037739658771e-05, "loss": 0.0159, "step": 5690 }, { "epoch": 1.3352073085031624, "grad_norm": 0.5044443607330322, "learning_rate": 4.2586817614407895e-05, "loss": 0.026, "step": 5700 }, { "epoch": 1.3375497774654486, "grad_norm": 0.19207948446273804, "learning_rate": 4.2423338901602985e-05, "loss": 0.0205, "step": 5710 }, { "epoch": 1.339892246427735, "grad_norm": 0.14319565892219543, "learning_rate": 4.2259943045949934e-05, "loss": 0.0174, "step": 5720 }, { "epoch": 1.3422347153900211, "grad_norm": 0.0638875886797905, "learning_rate": 4.209663183431969e-05, "loss": 0.0272, "step": 5730 }, { "epoch": 1.3445771843523073, "grad_norm": 0.5683619379997253, "learning_rate": 4.1933407052657456e-05, "loss": 0.0265, "step": 5740 }, { "epoch": 1.3469196533145935, "grad_norm": 0.1282253861427307, "learning_rate": 4.17702704859633e-05, "loss": 0.0285, "step": 5750 }, { "epoch": 1.3492621222768797, "grad_norm": 0.2435198575258255, "learning_rate": 4.160722391827262e-05, "loss": 0.0348, "step": 5760 }, { "epoch": 1.351604591239166, "grad_norm": 0.10652618855237961, "learning_rate": 4.14442691326365e-05, "loss": 0.0409, "step": 5770 }, { "epoch": 1.3539470602014523, "grad_norm": 0.06271979957818985, "learning_rate": 4.1281407911102425e-05, "loss": 0.0559, "step": 5780 }, { "epoch": 1.3562895291637385, "grad_norm": 0.05037263408303261, "learning_rate": 4.111864203469457e-05, "loss": 0.0263, "step": 5790 }, { "epoch": 1.3586319981260249, "grad_norm": 0.2569263279438019, "learning_rate": 4.095597328339452e-05, "loss": 0.0259, "step": 5800 }, { "epoch": 1.360974467088311, "grad_norm": 0.39117732644081116, "learning_rate": 4.079340343612165e-05, "loss": 0.0413, "step": 5810 }, { "epoch": 1.3633169360505972, "grad_norm": 0.0529431588947773, "learning_rate": 4.063093427071376e-05, "loss": 0.0615, "step": 5820 }, { "epoch": 1.3656594050128836, "grad_norm": 0.18688374757766724, "learning_rate": 4.046856756390767e-05, "loss": 0.0184, "step": 5830 }, { "epoch": 1.3680018739751698, "grad_norm": 0.08132046461105347, "learning_rate": 4.0306305091319595e-05, "loss": 0.0203, "step": 5840 }, { "epoch": 1.370344342937456, "grad_norm": 0.2862519323825836, "learning_rate": 4.0144148627425993e-05, "loss": 0.0497, "step": 5850 }, { "epoch": 1.3726868118997424, "grad_norm": 0.12356792390346527, "learning_rate": 3.9982099945543945e-05, "loss": 0.0202, "step": 5860 }, { "epoch": 1.3750292808620286, "grad_norm": 0.43368279933929443, "learning_rate": 3.982016081781189e-05, "loss": 0.0305, "step": 5870 }, { "epoch": 1.3773717498243148, "grad_norm": 0.03974668309092522, "learning_rate": 3.965833301517017e-05, "loss": 0.0262, "step": 5880 }, { "epoch": 1.3797142187866012, "grad_norm": 0.16461171209812164, "learning_rate": 3.949661830734172e-05, "loss": 0.0375, "step": 5890 }, { "epoch": 1.3820566877488873, "grad_norm": 0.06088129058480263, "learning_rate": 3.933501846281267e-05, "loss": 0.0192, "step": 5900 }, { "epoch": 1.3843991567111735, "grad_norm": 0.2690442204475403, "learning_rate": 3.917353524881302e-05, "loss": 0.0159, "step": 5910 }, { "epoch": 1.38674162567346, "grad_norm": 0.09126674383878708, "learning_rate": 3.901217043129735e-05, "loss": 0.0334, "step": 5920 }, { "epoch": 1.3890840946357461, "grad_norm": 0.11212047934532166, "learning_rate": 3.8850925774925425e-05, "loss": 0.0208, "step": 5930 }, { "epoch": 1.3914265635980323, "grad_norm": 0.03019798919558525, "learning_rate": 3.8689803043043e-05, "loss": 0.0218, "step": 5940 }, { "epoch": 1.3937690325603187, "grad_norm": 0.11158014088869095, "learning_rate": 3.852880399766243e-05, "loss": 0.0327, "step": 5950 }, { "epoch": 1.3961115015226049, "grad_norm": 0.1466631442308426, "learning_rate": 3.836793039944349e-05, "loss": 0.0316, "step": 5960 }, { "epoch": 1.398453970484891, "grad_norm": 0.04972492530941963, "learning_rate": 3.820718400767409e-05, "loss": 0.0204, "step": 5970 }, { "epoch": 1.4007964394471772, "grad_norm": 0.18622121214866638, "learning_rate": 3.8046566580251e-05, "loss": 0.0446, "step": 5980 }, { "epoch": 1.4031389084094636, "grad_norm": 0.4047488868236542, "learning_rate": 3.788607987366069e-05, "loss": 0.0422, "step": 5990 }, { "epoch": 1.4054813773717498, "grad_norm": 0.043907005339860916, "learning_rate": 3.772572564296005e-05, "loss": 0.0279, "step": 6000 }, { "epoch": 1.407823846334036, "grad_norm": 0.2679661214351654, "learning_rate": 3.756550564175727e-05, "loss": 0.0209, "step": 6010 }, { "epoch": 1.4101663152963222, "grad_norm": 0.0252488162368536, "learning_rate": 3.74054216221926e-05, "loss": 0.0187, "step": 6020 }, { "epoch": 1.4125087842586086, "grad_norm": 0.03220526501536369, "learning_rate": 3.7245475334919246e-05, "loss": 0.0235, "step": 6030 }, { "epoch": 1.4148512532208948, "grad_norm": 0.132725328207016, "learning_rate": 3.7085668529084184e-05, "loss": 0.0231, "step": 6040 }, { "epoch": 1.417193722183181, "grad_norm": 0.17545637488365173, "learning_rate": 3.6926002952309016e-05, "loss": 0.0259, "step": 6050 }, { "epoch": 1.4195361911454674, "grad_norm": 0.197429358959198, "learning_rate": 3.676648035067093e-05, "loss": 0.0274, "step": 6060 }, { "epoch": 1.4218786601077535, "grad_norm": 0.06819231063127518, "learning_rate": 3.6607102468683526e-05, "loss": 0.0355, "step": 6070 }, { "epoch": 1.4242211290700397, "grad_norm": 0.16003242135047913, "learning_rate": 3.6447871049277796e-05, "loss": 0.0376, "step": 6080 }, { "epoch": 1.4265635980323261, "grad_norm": 0.13673585653305054, "learning_rate": 3.628878783378302e-05, "loss": 0.0213, "step": 6090 }, { "epoch": 1.4289060669946123, "grad_norm": 0.15434902906417847, "learning_rate": 3.612985456190778e-05, "loss": 0.0126, "step": 6100 }, { "epoch": 1.4312485359568985, "grad_norm": 0.17395956814289093, "learning_rate": 3.597107297172084e-05, "loss": 0.084, "step": 6110 }, { "epoch": 1.433591004919185, "grad_norm": 0.04844974726438522, "learning_rate": 3.581244479963225e-05, "loss": 0.0219, "step": 6120 }, { "epoch": 1.435933473881471, "grad_norm": 0.04163607209920883, "learning_rate": 3.5653971780374295e-05, "loss": 0.0279, "step": 6130 }, { "epoch": 1.4382759428437573, "grad_norm": 0.11247994005680084, "learning_rate": 3.5495655646982505e-05, "loss": 0.0388, "step": 6140 }, { "epoch": 1.4406184118060437, "grad_norm": 0.10106071829795837, "learning_rate": 3.533749813077677e-05, "loss": 0.0197, "step": 6150 }, { "epoch": 1.4429608807683298, "grad_norm": 0.3352503776550293, "learning_rate": 3.517950096134232e-05, "loss": 0.0306, "step": 6160 }, { "epoch": 1.445303349730616, "grad_norm": 0.23961161077022552, "learning_rate": 3.5021665866510925e-05, "loss": 0.0361, "step": 6170 }, { "epoch": 1.4476458186929024, "grad_norm": 0.27124881744384766, "learning_rate": 3.4863994572341843e-05, "loss": 0.0215, "step": 6180 }, { "epoch": 1.4499882876551886, "grad_norm": 0.19891873002052307, "learning_rate": 3.470648880310313e-05, "loss": 0.0356, "step": 6190 }, { "epoch": 1.4523307566174748, "grad_norm": 0.2036479115486145, "learning_rate": 3.4549150281252636e-05, "loss": 0.024, "step": 6200 }, { "epoch": 1.4546732255797612, "grad_norm": 0.20012417435646057, "learning_rate": 3.439198072741921e-05, "loss": 0.0601, "step": 6210 }, { "epoch": 1.4570156945420474, "grad_norm": 0.09231871366500854, "learning_rate": 3.423498186038393e-05, "loss": 0.0264, "step": 6220 }, { "epoch": 1.4593581635043336, "grad_norm": 0.08506989479064941, "learning_rate": 3.407815539706124e-05, "loss": 0.0326, "step": 6230 }, { "epoch": 1.4617006324666197, "grad_norm": 0.07338278740644455, "learning_rate": 3.392150305248024e-05, "loss": 0.0261, "step": 6240 }, { "epoch": 1.4640431014289061, "grad_norm": 0.04994959384202957, "learning_rate": 3.3765026539765834e-05, "loss": 0.0155, "step": 6250 }, { "epoch": 1.4663855703911923, "grad_norm": 0.14995336532592773, "learning_rate": 3.360872757012011e-05, "loss": 0.0215, "step": 6260 }, { "epoch": 1.4687280393534785, "grad_norm": 0.08906183391809464, "learning_rate": 3.3452607852803584e-05, "loss": 0.0165, "step": 6270 }, { "epoch": 1.4710705083157647, "grad_norm": 0.07266787439584732, "learning_rate": 3.329666909511645e-05, "loss": 0.0368, "step": 6280 }, { "epoch": 1.473412977278051, "grad_norm": 0.5040260553359985, "learning_rate": 3.3140913002379995e-05, "loss": 0.0261, "step": 6290 }, { "epoch": 1.4757554462403373, "grad_norm": 0.1433238685131073, "learning_rate": 3.298534127791785e-05, "loss": 0.0175, "step": 6300 }, { "epoch": 1.4780979152026235, "grad_norm": 0.3917306363582611, "learning_rate": 3.282995562303754e-05, "loss": 0.0235, "step": 6310 }, { "epoch": 1.4804403841649099, "grad_norm": 0.07920818775892258, "learning_rate": 3.267475773701161e-05, "loss": 0.0428, "step": 6320 }, { "epoch": 1.482782853127196, "grad_norm": 0.07408647239208221, "learning_rate": 3.251974931705933e-05, "loss": 0.0255, "step": 6330 }, { "epoch": 1.4851253220894822, "grad_norm": 0.0957607626914978, "learning_rate": 3.236493205832795e-05, "loss": 0.0279, "step": 6340 }, { "epoch": 1.4874677910517686, "grad_norm": 0.372249037027359, "learning_rate": 3.221030765387417e-05, "loss": 0.0302, "step": 6350 }, { "epoch": 1.4898102600140548, "grad_norm": 0.20557020604610443, "learning_rate": 3.205587779464576e-05, "loss": 0.0374, "step": 6360 }, { "epoch": 1.492152728976341, "grad_norm": 0.2854403257369995, "learning_rate": 3.190164416946285e-05, "loss": 0.0234, "step": 6370 }, { "epoch": 1.4944951979386274, "grad_norm": 0.023650668561458588, "learning_rate": 3.1747608464999725e-05, "loss": 0.028, "step": 6380 }, { "epoch": 1.4968376669009136, "grad_norm": 0.3256511390209198, "learning_rate": 3.1593772365766105e-05, "loss": 0.0349, "step": 6390 }, { "epoch": 1.4991801358631998, "grad_norm": 0.10362248122692108, "learning_rate": 3.144013755408895e-05, "loss": 0.0181, "step": 6400 }, { "epoch": 1.5015226048254862, "grad_norm": 0.22891394793987274, "learning_rate": 3.128670571009399e-05, "loss": 0.0139, "step": 6410 }, { "epoch": 1.5038650737877723, "grad_norm": 0.3262953460216522, "learning_rate": 3.113347851168721e-05, "loss": 0.0276, "step": 6420 }, { "epoch": 1.5062075427500585, "grad_norm": 0.04172496870160103, "learning_rate": 3.098045763453678e-05, "loss": 0.0151, "step": 6430 }, { "epoch": 1.508550011712345, "grad_norm": 0.10430093109607697, "learning_rate": 3.082764475205442e-05, "loss": 0.0151, "step": 6440 }, { "epoch": 1.510892480674631, "grad_norm": 0.061427149921655655, "learning_rate": 3.0675041535377405e-05, "loss": 0.0257, "step": 6450 }, { "epoch": 1.5132349496369173, "grad_norm": 0.3583897054195404, "learning_rate": 3.052264965335e-05, "loss": 0.0228, "step": 6460 }, { "epoch": 1.5155774185992037, "grad_norm": 0.3000676929950714, "learning_rate": 3.0370470772505433e-05, "loss": 0.0319, "step": 6470 }, { "epoch": 1.5179198875614897, "grad_norm": 0.054317738860845566, "learning_rate": 3.0218506557047598e-05, "loss": 0.0258, "step": 6480 }, { "epoch": 1.520262356523776, "grad_norm": 0.554436981678009, "learning_rate": 3.006675866883275e-05, "loss": 0.0423, "step": 6490 }, { "epoch": 1.5226048254860625, "grad_norm": 0.047464508563280106, "learning_rate": 2.991522876735154e-05, "loss": 0.0181, "step": 6500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.709630644176e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }