diff --git "a/checkpoint-7000/trainer_state.json" "b/checkpoint-7000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-7000/trainer_state.json" @@ -0,0 +1,4933 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6397282736003747, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023424689622862497, + "grad_norm": 4.449338436126709, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3236, + "step": 10 + }, + { + "epoch": 0.004684937924572499, + "grad_norm": 3.442420721054077, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1552, + "step": 20 + }, + { + "epoch": 0.007027406886858749, + "grad_norm": 2.458024263381958, + "learning_rate": 6e-06, + "loss": 0.9371, + "step": 30 + }, + { + "epoch": 0.009369875849144999, + "grad_norm": 2.4206013679504395, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1333, + "step": 40 + }, + { + "epoch": 0.011712344811431248, + "grad_norm": 4.484491348266602, + "learning_rate": 1e-05, + "loss": 0.7988, + "step": 50 + }, + { + "epoch": 0.014054813773717497, + "grad_norm": 7.087528228759766, + "learning_rate": 1.2e-05, + "loss": 0.8714, + "step": 60 + }, + { + "epoch": 0.016397282736003747, + "grad_norm": 2.9479169845581055, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.5826, + "step": 70 + }, + { + "epoch": 0.018739751698289998, + "grad_norm": 2.2344982624053955, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.457, + "step": 80 + }, + { + "epoch": 0.02108222066057625, + "grad_norm": 1.1311728954315186, + "learning_rate": 1.8e-05, + "loss": 0.3638, + "step": 90 + }, + { + "epoch": 0.023424689622862496, + "grad_norm": 5.992610931396484, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 100 + }, + { + "epoch": 0.025767158585148747, + "grad_norm": 1.328804612159729, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.369, + "step": 110 + }, + { + "epoch": 0.028109627547434995, + "grad_norm": 2.6690480709075928, + "learning_rate": 2.4e-05, + "loss": 0.2072, + "step": 120 + }, + { + "epoch": 0.030452096509721246, + "grad_norm": 1.2436017990112305, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2564, + "step": 130 + }, + { + "epoch": 0.03279456547200749, + "grad_norm": 2.130502939224243, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.1741, + "step": 140 + }, + { + "epoch": 0.035137034434293744, + "grad_norm": 1.1833769083023071, + "learning_rate": 3e-05, + "loss": 0.1982, + "step": 150 + }, + { + "epoch": 0.037479503396579995, + "grad_norm": 0.887791633605957, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2346, + "step": 160 + }, + { + "epoch": 0.039821972358866246, + "grad_norm": 2.4128785133361816, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.1967, + "step": 170 + }, + { + "epoch": 0.0421644413211525, + "grad_norm": 1.2833918333053589, + "learning_rate": 3.6e-05, + "loss": 0.1552, + "step": 180 + }, + { + "epoch": 0.04450691028343874, + "grad_norm": 1.459666132926941, + "learning_rate": 3.8e-05, + "loss": 0.2237, + "step": 190 + }, + { + "epoch": 0.04684937924572499, + "grad_norm": 1.7674411535263062, + "learning_rate": 4e-05, + "loss": 0.1619, + "step": 200 + }, + { + "epoch": 0.049191848208011243, + "grad_norm": 1.2941542863845825, + "learning_rate": 4.2e-05, + "loss": 0.184, + "step": 210 + }, + { + "epoch": 0.051534317170297494, + "grad_norm": 1.7022488117218018, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.1501, + "step": 220 + }, + { + "epoch": 0.053876786132583745, + "grad_norm": 0.8502867221832275, + "learning_rate": 4.600000000000001e-05, + "loss": 0.2449, + "step": 230 + }, + { + "epoch": 0.05621925509486999, + "grad_norm": 2.1729302406311035, + "learning_rate": 4.8e-05, + "loss": 0.141, + "step": 240 + }, + { + "epoch": 0.05856172405715624, + "grad_norm": 1.9990278482437134, + "learning_rate": 5e-05, + "loss": 0.1569, + "step": 250 + }, + { + "epoch": 0.06090419301944249, + "grad_norm": 1.0973132848739624, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1574, + "step": 260 + }, + { + "epoch": 0.06324666198172874, + "grad_norm": 1.5121344327926636, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.1309, + "step": 270 + }, + { + "epoch": 0.06558913094401499, + "grad_norm": 1.0041357278823853, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.2048, + "step": 280 + }, + { + "epoch": 0.06793159990630124, + "grad_norm": 1.9920216798782349, + "learning_rate": 5.8e-05, + "loss": 0.1425, + "step": 290 + }, + { + "epoch": 0.07027406886858749, + "grad_norm": 0.6136835217475891, + "learning_rate": 6e-05, + "loss": 0.1236, + "step": 300 + }, + { + "epoch": 0.07261653783087374, + "grad_norm": 1.2063113451004028, + "learning_rate": 6.2e-05, + "loss": 0.1342, + "step": 310 + }, + { + "epoch": 0.07495900679315999, + "grad_norm": 0.7644496560096741, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1205, + "step": 320 + }, + { + "epoch": 0.07730147575544624, + "grad_norm": 0.973790168762207, + "learning_rate": 6.6e-05, + "loss": 0.1551, + "step": 330 + }, + { + "epoch": 0.07964394471773249, + "grad_norm": 1.9004161357879639, + "learning_rate": 6.800000000000001e-05, + "loss": 0.1395, + "step": 340 + }, + { + "epoch": 0.08198641368001874, + "grad_norm": 0.8575976490974426, + "learning_rate": 7e-05, + "loss": 0.1081, + "step": 350 + }, + { + "epoch": 0.084328882642305, + "grad_norm": 1.3740334510803223, + "learning_rate": 7.2e-05, + "loss": 0.18, + "step": 360 + }, + { + "epoch": 0.08667135160459125, + "grad_norm": 0.7421107888221741, + "learning_rate": 7.4e-05, + "loss": 0.1496, + "step": 370 + }, + { + "epoch": 0.08901382056687748, + "grad_norm": 1.4952155351638794, + "learning_rate": 7.6e-05, + "loss": 0.1491, + "step": 380 + }, + { + "epoch": 0.09135628952916373, + "grad_norm": 1.0072972774505615, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1282, + "step": 390 + }, + { + "epoch": 0.09369875849144998, + "grad_norm": 1.719224452972412, + "learning_rate": 8e-05, + "loss": 0.1779, + "step": 400 + }, + { + "epoch": 0.09604122745373624, + "grad_norm": 1.4302623271942139, + "learning_rate": 8.2e-05, + "loss": 0.1145, + "step": 410 + }, + { + "epoch": 0.09838369641602249, + "grad_norm": 0.6622968316078186, + "learning_rate": 8.4e-05, + "loss": 0.1159, + "step": 420 + }, + { + "epoch": 0.10072616537830874, + "grad_norm": 1.0967049598693848, + "learning_rate": 8.6e-05, + "loss": 0.1659, + "step": 430 + }, + { + "epoch": 0.10306863434059499, + "grad_norm": 1.1332488059997559, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1292, + "step": 440 + }, + { + "epoch": 0.10541110330288124, + "grad_norm": 1.308289647102356, + "learning_rate": 9e-05, + "loss": 0.1202, + "step": 450 + }, + { + "epoch": 0.10775357226516749, + "grad_norm": 0.5696719884872437, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1118, + "step": 460 + }, + { + "epoch": 0.11009604122745374, + "grad_norm": 0.9922944903373718, + "learning_rate": 9.4e-05, + "loss": 0.1644, + "step": 470 + }, + { + "epoch": 0.11243851018973998, + "grad_norm": 1.5004724264144897, + "learning_rate": 9.6e-05, + "loss": 0.2011, + "step": 480 + }, + { + "epoch": 0.11478097915202623, + "grad_norm": 0.9503705501556396, + "learning_rate": 9.8e-05, + "loss": 0.1038, + "step": 490 + }, + { + "epoch": 0.11712344811431248, + "grad_norm": 1.421077013015747, + "learning_rate": 0.0001, + "loss": 0.0944, + "step": 500 + }, + { + "epoch": 0.11946591707659873, + "grad_norm": 0.8938995599746704, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1216, + "step": 510 + }, + { + "epoch": 0.12180838603888498, + "grad_norm": 0.46683940291404724, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1278, + "step": 520 + }, + { + "epoch": 0.12415085500117123, + "grad_norm": 0.8092114925384521, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0794, + "step": 530 + }, + { + "epoch": 0.12649332396345747, + "grad_norm": 0.27710163593292236, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1266, + "step": 540 + }, + { + "epoch": 0.12883579292574374, + "grad_norm": 0.81737220287323, + "learning_rate": 9.999316524962345e-05, + "loss": 0.1108, + "step": 550 + }, + { + "epoch": 0.13117826188802997, + "grad_norm": 0.6735175848007202, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0854, + "step": 560 + }, + { + "epoch": 0.13352073085031624, + "grad_norm": 0.2487485110759735, + "learning_rate": 9.998660418225645e-05, + "loss": 0.1045, + "step": 570 + }, + { + "epoch": 0.13586319981260248, + "grad_norm": 0.3255215287208557, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0948, + "step": 580 + }, + { + "epoch": 0.13820566877488874, + "grad_norm": 0.7749798893928528, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0775, + "step": 590 + }, + { + "epoch": 0.14054813773717498, + "grad_norm": 1.220957636833191, + "learning_rate": 9.997266286704631e-05, + "loss": 0.1201, + "step": 600 + }, + { + "epoch": 0.14289060669946124, + "grad_norm": 0.8066214919090271, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0815, + "step": 610 + }, + { + "epoch": 0.14523307566174748, + "grad_norm": 0.6408377885818481, + "learning_rate": 9.996063610703137e-05, + "loss": 0.1, + "step": 620 + }, + { + "epoch": 0.14757554462403374, + "grad_norm": 0.8596289753913879, + "learning_rate": 9.995380315038119e-05, + "loss": 0.1008, + "step": 630 + }, + { + "epoch": 0.14991801358631998, + "grad_norm": 0.972243070602417, + "learning_rate": 9.994642390694308e-05, + "loss": 0.1075, + "step": 640 + }, + { + "epoch": 0.15226048254860622, + "grad_norm": 0.5220253467559814, + "learning_rate": 9.993849845741524e-05, + "loss": 0.1021, + "step": 650 + }, + { + "epoch": 0.15460295151089248, + "grad_norm": 0.5453582406044006, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0963, + "step": 660 + }, + { + "epoch": 0.15694542047317872, + "grad_norm": 0.24789837002754211, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0712, + "step": 670 + }, + { + "epoch": 0.15928788943546499, + "grad_norm": 0.31857672333717346, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0859, + "step": 680 + }, + { + "epoch": 0.16163035839775122, + "grad_norm": 0.7285981178283691, + "learning_rate": 9.990133642141359e-05, + "loss": 0.1274, + "step": 690 + }, + { + "epoch": 0.1639728273600375, + "grad_norm": 1.0549755096435547, + "learning_rate": 9.989068136093873e-05, + "loss": 0.1187, + "step": 700 + }, + { + "epoch": 0.16631529632232372, + "grad_norm": 0.204506054520607, + "learning_rate": 9.987948070396571e-05, + "loss": 0.1005, + "step": 710 + }, + { + "epoch": 0.16865776528461, + "grad_norm": 0.4295964241027832, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0937, + "step": 720 + }, + { + "epoch": 0.17100023424689623, + "grad_norm": 1.0681158304214478, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0855, + "step": 730 + }, + { + "epoch": 0.1733427032091825, + "grad_norm": 0.667492151260376, + "learning_rate": 9.984260640876821e-05, + "loss": 0.1096, + "step": 740 + }, + { + "epoch": 0.17568517217146873, + "grad_norm": 0.6995371580123901, + "learning_rate": 9.98292246503335e-05, + "loss": 0.108, + "step": 750 + }, + { + "epoch": 0.17802764113375497, + "grad_norm": 0.9727945923805237, + "learning_rate": 9.981529796748134e-05, + "loss": 0.1155, + "step": 760 + }, + { + "epoch": 0.18037011009604123, + "grad_norm": 0.3702404201030731, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0846, + "step": 770 + }, + { + "epoch": 0.18271257905832747, + "grad_norm": 0.3169856667518616, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0917, + "step": 780 + }, + { + "epoch": 0.18505504802061373, + "grad_norm": 0.6973789930343628, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0785, + "step": 790 + }, + { + "epoch": 0.18739751698289997, + "grad_norm": 0.5686987042427063, + "learning_rate": 9.975414512725057e-05, + "loss": 0.1015, + "step": 800 + }, + { + "epoch": 0.18973998594518623, + "grad_norm": 0.6190043687820435, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0753, + "step": 810 + }, + { + "epoch": 0.19208245490747247, + "grad_norm": 0.3807699382305145, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0734, + "step": 820 + }, + { + "epoch": 0.19442492386975874, + "grad_norm": 0.45342546701431274, + "learning_rate": 9.970256684745258e-05, + "loss": 0.1012, + "step": 830 + }, + { + "epoch": 0.19676739283204497, + "grad_norm": 0.2780962586402893, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0757, + "step": 840 + }, + { + "epoch": 0.19910986179433124, + "grad_norm": 0.20734530687332153, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0751, + "step": 850 + }, + { + "epoch": 0.20145233075661748, + "grad_norm": 0.3406268358230591, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0937, + "step": 860 + }, + { + "epoch": 0.2037947997189037, + "grad_norm": 0.33824971318244934, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0913, + "step": 870 + }, + { + "epoch": 0.20613726868118998, + "grad_norm": 0.5773669481277466, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0834, + "step": 880 + }, + { + "epoch": 0.20847973764347622, + "grad_norm": 0.5624499917030334, + "learning_rate": 9.95847403914247e-05, + "loss": 0.1001, + "step": 890 + }, + { + "epoch": 0.21082220660576248, + "grad_norm": 0.5361132025718689, + "learning_rate": 9.956320346634876e-05, + "loss": 0.1233, + "step": 900 + }, + { + "epoch": 0.21316467556804872, + "grad_norm": 0.4824270009994507, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0882, + "step": 910 + }, + { + "epoch": 0.21550714453033498, + "grad_norm": 0.6482338905334473, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0647, + "step": 920 + }, + { + "epoch": 0.21784961349262122, + "grad_norm": 0.2783452868461609, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0917, + "step": 930 + }, + { + "epoch": 0.22019208245490748, + "grad_norm": 0.4593198597431183, + "learning_rate": 9.94716380576598e-05, + "loss": 0.068, + "step": 940 + }, + { + "epoch": 0.22253455141719372, + "grad_norm": 0.7751959562301636, + "learning_rate": 9.944739353007344e-05, + "loss": 0.1032, + "step": 950 + }, + { + "epoch": 0.22487702037947996, + "grad_norm": 0.3963168263435364, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0942, + "step": 960 + }, + { + "epoch": 0.22721948934176622, + "grad_norm": 0.40413302183151245, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0736, + "step": 970 + }, + { + "epoch": 0.22956195830405246, + "grad_norm": 0.3862430155277252, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0726, + "step": 980 + }, + { + "epoch": 0.23190442726633873, + "grad_norm": 0.5864925384521484, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0872, + "step": 990 + }, + { + "epoch": 0.23424689622862496, + "grad_norm": 0.31625375151634216, + "learning_rate": 9.931806517013612e-05, + "loss": 0.0708, + "step": 1000 + }, + { + "epoch": 0.23658936519091123, + "grad_norm": 0.5403046011924744, + "learning_rate": 9.929058033379181e-05, + "loss": 0.073, + "step": 1010 + }, + { + "epoch": 0.23893183415319746, + "grad_norm": 0.4366021156311035, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0643, + "step": 1020 + }, + { + "epoch": 0.24127430311548373, + "grad_norm": 0.500108540058136, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0437, + "step": 1030 + }, + { + "epoch": 0.24361677207776997, + "grad_norm": 0.8096440434455872, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0555, + "step": 1040 + }, + { + "epoch": 0.24595924104005623, + "grad_norm": 0.6826971173286438, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0704, + "step": 1050 + }, + { + "epoch": 0.24830171000234247, + "grad_norm": 0.27831944823265076, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0659, + "step": 1060 + }, + { + "epoch": 0.2506441789646287, + "grad_norm": 0.35980355739593506, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0652, + "step": 1070 + }, + { + "epoch": 0.25298664792691494, + "grad_norm": 0.7075427174568176, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0933, + "step": 1080 + }, + { + "epoch": 0.25532911688920124, + "grad_norm": 0.33446595072746277, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0594, + "step": 1090 + }, + { + "epoch": 0.2576715858514875, + "grad_norm": 0.21890777349472046, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0636, + "step": 1100 + }, + { + "epoch": 0.2600140548137737, + "grad_norm": 0.19606763124465942, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0573, + "step": 1110 + }, + { + "epoch": 0.26235652377605995, + "grad_norm": 0.40309399366378784, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0711, + "step": 1120 + }, + { + "epoch": 0.26469899273834624, + "grad_norm": 0.15657459199428558, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0596, + "step": 1130 + }, + { + "epoch": 0.2670414617006325, + "grad_norm": 0.5244103670120239, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0624, + "step": 1140 + }, + { + "epoch": 0.2693839306629187, + "grad_norm": 0.6240133047103882, + "learning_rate": 9.884934153917997e-05, + "loss": 0.1013, + "step": 1150 + }, + { + "epoch": 0.27172639962520495, + "grad_norm": 0.2892966568470001, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0886, + "step": 1160 + }, + { + "epoch": 0.27406886858749124, + "grad_norm": 0.11301174759864807, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0967, + "step": 1170 + }, + { + "epoch": 0.2764113375497775, + "grad_norm": 0.6525554060935974, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0857, + "step": 1180 + }, + { + "epoch": 0.2787538065120637, + "grad_norm": 0.27176904678344727, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0556, + "step": 1190 + }, + { + "epoch": 0.28109627547434995, + "grad_norm": 0.4166867136955261, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0684, + "step": 1200 + }, + { + "epoch": 0.2834387444366362, + "grad_norm": 0.19580566883087158, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0882, + "step": 1210 + }, + { + "epoch": 0.2857812133989225, + "grad_norm": 0.44604888558387756, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0883, + "step": 1220 + }, + { + "epoch": 0.2881236823612087, + "grad_norm": 0.49636200070381165, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0912, + "step": 1230 + }, + { + "epoch": 0.29046615132349496, + "grad_norm": 0.1988007128238678, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0719, + "step": 1240 + }, + { + "epoch": 0.2928086202857812, + "grad_norm": 0.30095556378364563, + "learning_rate": 9.847001329696653e-05, + "loss": 0.078, + "step": 1250 + }, + { + "epoch": 0.2951510892480675, + "grad_norm": 0.34190279245376587, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0442, + "step": 1260 + }, + { + "epoch": 0.2974935582103537, + "grad_norm": 0.25464609265327454, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0583, + "step": 1270 + }, + { + "epoch": 0.29983602717263996, + "grad_norm": 0.07694657146930695, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0359, + "step": 1280 + }, + { + "epoch": 0.3021784961349262, + "grad_norm": 0.19848985970020294, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0812, + "step": 1290 + }, + { + "epoch": 0.30452096509721244, + "grad_norm": 0.27825915813446045, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0496, + "step": 1300 + }, + { + "epoch": 0.30686343405949873, + "grad_norm": 0.3718523681163788, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0671, + "step": 1310 + }, + { + "epoch": 0.30920590302178497, + "grad_norm": 0.5311722159385681, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0739, + "step": 1320 + }, + { + "epoch": 0.3115483719840712, + "grad_norm": 0.41185882687568665, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0665, + "step": 1330 + }, + { + "epoch": 0.31389084094635744, + "grad_norm": 0.2839798629283905, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0495, + "step": 1340 + }, + { + "epoch": 0.31623330990864373, + "grad_norm": 0.5456023812294006, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0538, + "step": 1350 + }, + { + "epoch": 0.31857577887092997, + "grad_norm": 1.1303348541259766, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0948, + "step": 1360 + }, + { + "epoch": 0.3209182478332162, + "grad_norm": 0.3756462633609772, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0405, + "step": 1370 + }, + { + "epoch": 0.32326071679550245, + "grad_norm": 0.45304539799690247, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0518, + "step": 1380 + }, + { + "epoch": 0.3256031857577887, + "grad_norm": 0.42578068375587463, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0694, + "step": 1390 + }, + { + "epoch": 0.327945654720075, + "grad_norm": 0.5314955711364746, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0656, + "step": 1400 + }, + { + "epoch": 0.3302881236823612, + "grad_norm": 0.445273220539093, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0467, + "step": 1410 + }, + { + "epoch": 0.33263059264464745, + "grad_norm": 0.45427191257476807, + "learning_rate": 9.77037682640015e-05, + "loss": 0.071, + "step": 1420 + }, + { + "epoch": 0.3349730616069337, + "grad_norm": 1.1310575008392334, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0783, + "step": 1430 + }, + { + "epoch": 0.33731553056922, + "grad_norm": 0.37553080916404724, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0595, + "step": 1440 + }, + { + "epoch": 0.3396579995315062, + "grad_norm": 0.456626296043396, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0684, + "step": 1450 + }, + { + "epoch": 0.34200046849379245, + "grad_norm": 0.23000092804431915, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0663, + "step": 1460 + }, + { + "epoch": 0.3443429374560787, + "grad_norm": 0.8536004424095154, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0615, + "step": 1470 + }, + { + "epoch": 0.346685406418365, + "grad_norm": 0.2810976803302765, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0499, + "step": 1480 + }, + { + "epoch": 0.3490278753806512, + "grad_norm": 0.5517282485961914, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0623, + "step": 1490 + }, + { + "epoch": 0.35137034434293746, + "grad_norm": 0.5391654372215271, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0701, + "step": 1500 + }, + { + "epoch": 0.3537128133052237, + "grad_norm": 0.2104485183954239, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0624, + "step": 1510 + }, + { + "epoch": 0.35605528226750993, + "grad_norm": 0.6778100728988647, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0735, + "step": 1520 + }, + { + "epoch": 0.3583977512297962, + "grad_norm": 0.5578711628913879, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0649, + "step": 1530 + }, + { + "epoch": 0.36074022019208246, + "grad_norm": 0.19399204850196838, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0696, + "step": 1540 + }, + { + "epoch": 0.3630826891543687, + "grad_norm": 0.36409327387809753, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0764, + "step": 1550 + }, + { + "epoch": 0.36542515811665494, + "grad_norm": 0.3991371691226959, + "learning_rate": 9.695944607949649e-05, + "loss": 0.053, + "step": 1560 + }, + { + "epoch": 0.36776762707894123, + "grad_norm": 0.24415276944637299, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0359, + "step": 1570 + }, + { + "epoch": 0.37011009604122747, + "grad_norm": 0.2075069695711136, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0663, + "step": 1580 + }, + { + "epoch": 0.3724525650035137, + "grad_norm": 0.6543785333633423, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0494, + "step": 1590 + }, + { + "epoch": 0.37479503396579994, + "grad_norm": 0.5545148253440857, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0922, + "step": 1600 + }, + { + "epoch": 0.3771375029280862, + "grad_norm": 0.3024766743183136, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0458, + "step": 1610 + }, + { + "epoch": 0.37947997189037247, + "grad_norm": 0.18543019890785217, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0814, + "step": 1620 + }, + { + "epoch": 0.3818224408526587, + "grad_norm": 0.6047130823135376, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0749, + "step": 1630 + }, + { + "epoch": 0.38416490981494494, + "grad_norm": 0.5619345307350159, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0647, + "step": 1640 + }, + { + "epoch": 0.3865073787772312, + "grad_norm": 0.3835267126560211, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0526, + "step": 1650 + }, + { + "epoch": 0.3888498477395175, + "grad_norm": 0.2994864583015442, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0451, + "step": 1660 + }, + { + "epoch": 0.3911923167018037, + "grad_norm": 0.5770288705825806, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0716, + "step": 1670 + }, + { + "epoch": 0.39353478566408995, + "grad_norm": 0.3165629506111145, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0468, + "step": 1680 + }, + { + "epoch": 0.3958772546263762, + "grad_norm": 0.11682554334402084, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0669, + "step": 1690 + }, + { + "epoch": 0.3982197235886625, + "grad_norm": 0.4979915916919708, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0553, + "step": 1700 + }, + { + "epoch": 0.4005621925509487, + "grad_norm": 0.14603012800216675, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0597, + "step": 1710 + }, + { + "epoch": 0.40290466151323495, + "grad_norm": 0.3345795273780823, + "learning_rate": 9.598566713995718e-05, + "loss": 0.049, + "step": 1720 + }, + { + "epoch": 0.4052471304755212, + "grad_norm": 0.4213583171367645, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0659, + "step": 1730 + }, + { + "epoch": 0.4075895994378074, + "grad_norm": 0.1514274775981903, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0803, + "step": 1740 + }, + { + "epoch": 0.4099320684000937, + "grad_norm": 1.1298153400421143, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0574, + "step": 1750 + }, + { + "epoch": 0.41227453736237996, + "grad_norm": 0.2879124581813812, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0618, + "step": 1760 + }, + { + "epoch": 0.4146170063246662, + "grad_norm": 0.21584849059581757, + "learning_rate": 9.565482757680415e-05, + "loss": 0.069, + "step": 1770 + }, + { + "epoch": 0.41695947528695243, + "grad_norm": 0.27666664123535156, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0619, + "step": 1780 + }, + { + "epoch": 0.4193019442492387, + "grad_norm": 0.36067232489585876, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0512, + "step": 1790 + }, + { + "epoch": 0.42164441321152496, + "grad_norm": 0.21706882119178772, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0399, + "step": 1800 + }, + { + "epoch": 0.4239868821738112, + "grad_norm": 0.2502746880054474, + "learning_rate": 9.538116473662861e-05, + "loss": 0.067, + "step": 1810 + }, + { + "epoch": 0.42632935113609743, + "grad_norm": 0.19951611757278442, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0572, + "step": 1820 + }, + { + "epoch": 0.42867182009838367, + "grad_norm": 0.5946075916290283, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0556, + "step": 1830 + }, + { + "epoch": 0.43101428906066996, + "grad_norm": 0.20143412053585052, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0556, + "step": 1840 + }, + { + "epoch": 0.4333567580229562, + "grad_norm": 0.30480778217315674, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0721, + "step": 1850 + }, + { + "epoch": 0.43569922698524244, + "grad_norm": 0.289962500333786, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0349, + "step": 1860 + }, + { + "epoch": 0.4380416959475287, + "grad_norm": 0.23470467329025269, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0508, + "step": 1870 + }, + { + "epoch": 0.44038416490981497, + "grad_norm": 0.5040431022644043, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0576, + "step": 1880 + }, + { + "epoch": 0.4427266338721012, + "grad_norm": 0.4373694360256195, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0526, + "step": 1890 + }, + { + "epoch": 0.44506910283438744, + "grad_norm": 0.41776043176651, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0417, + "step": 1900 + }, + { + "epoch": 0.4474115717966737, + "grad_norm": 0.5410218238830566, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0737, + "step": 1910 + }, + { + "epoch": 0.4497540407589599, + "grad_norm": 0.4274581968784332, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0715, + "step": 1920 + }, + { + "epoch": 0.4520965097212462, + "grad_norm": 0.31722667813301086, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0672, + "step": 1930 + }, + { + "epoch": 0.45443897868353245, + "grad_norm": 0.221653014421463, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0631, + "step": 1940 + }, + { + "epoch": 0.4567814476458187, + "grad_norm": 0.2043227255344391, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0436, + "step": 1950 + }, + { + "epoch": 0.4591239166081049, + "grad_norm": 0.1967364400625229, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0601, + "step": 1960 + }, + { + "epoch": 0.4614663855703912, + "grad_norm": 0.23282958567142487, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0441, + "step": 1970 + }, + { + "epoch": 0.46380885453267745, + "grad_norm": 0.6064874529838562, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0691, + "step": 1980 + }, + { + "epoch": 0.4661513234949637, + "grad_norm": 0.29970476031303406, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0362, + "step": 1990 + }, + { + "epoch": 0.4684937924572499, + "grad_norm": 0.3310692310333252, + "learning_rate": 9.397368756032445e-05, + "loss": 0.045, + "step": 2000 + }, + { + "epoch": 0.4708362614195362, + "grad_norm": 0.34072744846343994, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0646, + "step": 2010 + }, + { + "epoch": 0.47317873038182245, + "grad_norm": 0.09513302892446518, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0543, + "step": 2020 + }, + { + "epoch": 0.4755211993441087, + "grad_norm": 0.19264456629753113, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0682, + "step": 2030 + }, + { + "epoch": 0.47786366830639493, + "grad_norm": 0.3914099633693695, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0455, + "step": 2040 + }, + { + "epoch": 0.48020613726868117, + "grad_norm": 0.4226783514022827, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0689, + "step": 2050 + }, + { + "epoch": 0.48254860623096746, + "grad_norm": 0.41455796360969543, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0596, + "step": 2060 + }, + { + "epoch": 0.4848910751932537, + "grad_norm": 0.2510756254196167, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0341, + "step": 2070 + }, + { + "epoch": 0.48723354415553993, + "grad_norm": 0.40096133947372437, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0813, + "step": 2080 + }, + { + "epoch": 0.48957601311782617, + "grad_norm": 0.6878464221954346, + "learning_rate": 9.32460493430591e-05, + "loss": 0.044, + "step": 2090 + }, + { + "epoch": 0.49191848208011246, + "grad_norm": 0.3416203558444977, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0686, + "step": 2100 + }, + { + "epoch": 0.4942609510423987, + "grad_norm": 0.12535825371742249, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0354, + "step": 2110 + }, + { + "epoch": 0.49660342000468494, + "grad_norm": 0.19023941457271576, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0376, + "step": 2120 + }, + { + "epoch": 0.4989458889669712, + "grad_norm": 0.3778730034828186, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0518, + "step": 2130 + }, + { + "epoch": 0.5012883579292574, + "grad_norm": 0.256195068359375, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0571, + "step": 2140 + }, + { + "epoch": 0.5036308268915437, + "grad_norm": 0.19933399558067322, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0271, + "step": 2150 + }, + { + "epoch": 0.5059732958538299, + "grad_norm": 0.06613205373287201, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0334, + "step": 2160 + }, + { + "epoch": 0.5083157648161162, + "grad_norm": 0.21248801052570343, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0746, + "step": 2170 + }, + { + "epoch": 0.5106582337784025, + "grad_norm": 0.3601578176021576, + "learning_rate": 9.248010643731935e-05, + "loss": 0.076, + "step": 2180 + }, + { + "epoch": 0.5130007027406887, + "grad_norm": 0.0984947606921196, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0892, + "step": 2190 + }, + { + "epoch": 0.515343171702975, + "grad_norm": 0.13032953441143036, + "learning_rate": 9.230476262104677e-05, + "loss": 0.039, + "step": 2200 + }, + { + "epoch": 0.5176856406652612, + "grad_norm": 0.48068541288375854, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0585, + "step": 2210 + }, + { + "epoch": 0.5200281096275474, + "grad_norm": 0.42812222242355347, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0929, + "step": 2220 + }, + { + "epoch": 0.5223705785898337, + "grad_norm": 0.3526000380516052, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0657, + "step": 2230 + }, + { + "epoch": 0.5247130475521199, + "grad_norm": 0.14142726361751556, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0571, + "step": 2240 + }, + { + "epoch": 0.5270555165144062, + "grad_norm": 0.10022013634443283, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0362, + "step": 2250 + }, + { + "epoch": 0.5293979854766925, + "grad_norm": 0.18126869201660156, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0616, + "step": 2260 + }, + { + "epoch": 0.5317404544389787, + "grad_norm": 0.22198501229286194, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0227, + "step": 2270 + }, + { + "epoch": 0.534082923401265, + "grad_norm": 0.07468587905168533, + "learning_rate": 9.158495979556358e-05, + "loss": 0.045, + "step": 2280 + }, + { + "epoch": 0.5364253923635511, + "grad_norm": 0.1882839947938919, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0569, + "step": 2290 + }, + { + "epoch": 0.5387678613258374, + "grad_norm": 0.1339283585548401, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0692, + "step": 2300 + }, + { + "epoch": 0.5411103302881237, + "grad_norm": 0.19089505076408386, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0638, + "step": 2310 + }, + { + "epoch": 0.5434527992504099, + "grad_norm": 0.131087064743042, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0656, + "step": 2320 + }, + { + "epoch": 0.5457952682126962, + "grad_norm": 0.24333599209785461, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0617, + "step": 2330 + }, + { + "epoch": 0.5481377371749825, + "grad_norm": 0.4338069260120392, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0384, + "step": 2340 + }, + { + "epoch": 0.5504802061372687, + "grad_norm": 0.3546713888645172, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0594, + "step": 2350 + }, + { + "epoch": 0.552822675099555, + "grad_norm": 0.1043967604637146, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0482, + "step": 2360 + }, + { + "epoch": 0.5551651440618411, + "grad_norm": 0.16685545444488525, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0439, + "step": 2370 + }, + { + "epoch": 0.5575076130241274, + "grad_norm": 0.15651892125606537, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0542, + "step": 2380 + }, + { + "epoch": 0.5598500819864137, + "grad_norm": 0.33224546909332275, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0772, + "step": 2390 + }, + { + "epoch": 0.5621925509486999, + "grad_norm": 0.3219659626483917, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0347, + "step": 2400 + }, + { + "epoch": 0.5645350199109862, + "grad_norm": 0.3930731415748596, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0386, + "step": 2410 + }, + { + "epoch": 0.5668774888732724, + "grad_norm": 0.13527953624725342, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0479, + "step": 2420 + }, + { + "epoch": 0.5692199578355587, + "grad_norm": 0.1432938128709793, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0319, + "step": 2430 + }, + { + "epoch": 0.571562426797845, + "grad_norm": 0.25687897205352783, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0387, + "step": 2440 + }, + { + "epoch": 0.5739048957601312, + "grad_norm": 0.31300991773605347, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0574, + "step": 2450 + }, + { + "epoch": 0.5762473647224174, + "grad_norm": 0.25793933868408203, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0415, + "step": 2460 + }, + { + "epoch": 0.5785898336847036, + "grad_norm": 0.13978935778141022, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0366, + "step": 2470 + }, + { + "epoch": 0.5809323026469899, + "grad_norm": 0.20552988350391388, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0519, + "step": 2480 + }, + { + "epoch": 0.5832747716092762, + "grad_norm": 0.0843147486448288, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0434, + "step": 2490 + }, + { + "epoch": 0.5856172405715624, + "grad_norm": 0.519131600856781, + "learning_rate": 8.945702546981969e-05, + "loss": 0.044, + "step": 2500 + }, + { + "epoch": 0.5879597095338487, + "grad_norm": 0.20150704681873322, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0486, + "step": 2510 + }, + { + "epoch": 0.590302178496135, + "grad_norm": 0.6557456851005554, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0444, + "step": 2520 + }, + { + "epoch": 0.5926446474584212, + "grad_norm": 0.48158717155456543, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0512, + "step": 2530 + }, + { + "epoch": 0.5949871164207075, + "grad_norm": 0.3636298179626465, + "learning_rate": 8.90473525250761e-05, + "loss": 0.052, + "step": 2540 + }, + { + "epoch": 0.5973295853829936, + "grad_norm": 0.1767117828130722, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0534, + "step": 2550 + }, + { + "epoch": 0.5996720543452799, + "grad_norm": 0.30989664793014526, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0765, + "step": 2560 + }, + { + "epoch": 0.6020145233075662, + "grad_norm": 0.28089532256126404, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0424, + "step": 2570 + }, + { + "epoch": 0.6043569922698524, + "grad_norm": 0.5266916751861572, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0364, + "step": 2580 + }, + { + "epoch": 0.6066994612321387, + "grad_norm": 0.5653497576713562, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0604, + "step": 2590 + }, + { + "epoch": 0.6090419301944249, + "grad_norm": 0.34995973110198975, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0386, + "step": 2600 + }, + { + "epoch": 0.6113843991567112, + "grad_norm": 0.42935842275619507, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0595, + "step": 2610 + }, + { + "epoch": 0.6137268681189975, + "grad_norm": 0.19672085344791412, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0426, + "step": 2620 + }, + { + "epoch": 0.6160693370812836, + "grad_norm": 0.17344583570957184, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0979, + "step": 2630 + }, + { + "epoch": 0.6184118060435699, + "grad_norm": 0.19755525887012482, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0322, + "step": 2640 + }, + { + "epoch": 0.6207542750058562, + "grad_norm": 0.33817166090011597, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0375, + "step": 2650 + }, + { + "epoch": 0.6230967439681424, + "grad_norm": 0.44614845514297485, + "learning_rate": 8.77776334424621e-05, + "loss": 0.054, + "step": 2660 + }, + { + "epoch": 0.6254392129304287, + "grad_norm": 0.4128440022468567, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0422, + "step": 2670 + }, + { + "epoch": 0.6277816818927149, + "grad_norm": 0.22449485957622528, + "learning_rate": 8.756017514770443e-05, + "loss": 0.037, + "step": 2680 + }, + { + "epoch": 0.6301241508550012, + "grad_norm": 0.2689172029495239, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0353, + "step": 2690 + }, + { + "epoch": 0.6324666198172875, + "grad_norm": 0.05075841769576073, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0333, + "step": 2700 + }, + { + "epoch": 0.6348090887795736, + "grad_norm": 0.1499403417110443, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0559, + "step": 2710 + }, + { + "epoch": 0.6371515577418599, + "grad_norm": 0.36928892135620117, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0763, + "step": 2720 + }, + { + "epoch": 0.6394940267041461, + "grad_norm": 0.5727768540382385, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0527, + "step": 2730 + }, + { + "epoch": 0.6418364956664324, + "grad_norm": 0.30735543370246887, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0585, + "step": 2740 + }, + { + "epoch": 0.6441789646287187, + "grad_norm": 0.3882769048213959, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0491, + "step": 2750 + }, + { + "epoch": 0.6465214335910049, + "grad_norm": 0.365843802690506, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0397, + "step": 2760 + }, + { + "epoch": 0.6488639025532912, + "grad_norm": 0.21451324224472046, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0365, + "step": 2770 + }, + { + "epoch": 0.6512063715155774, + "grad_norm": 0.1609046310186386, + "learning_rate": 8.644843137107059e-05, + "loss": 0.039, + "step": 2780 + }, + { + "epoch": 0.6535488404778637, + "grad_norm": 0.7074998021125793, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0591, + "step": 2790 + }, + { + "epoch": 0.65589130944015, + "grad_norm": 0.21024738252162933, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0488, + "step": 2800 + }, + { + "epoch": 0.6582337784024361, + "grad_norm": 0.3021513819694519, + "learning_rate": 8.610707988678503e-05, + "loss": 0.04, + "step": 2810 + }, + { + "epoch": 0.6605762473647224, + "grad_norm": 0.19868189096450806, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0319, + "step": 2820 + }, + { + "epoch": 0.6629187163270087, + "grad_norm": 0.15607990324497223, + "learning_rate": 8.587753585050004e-05, + "loss": 0.036, + "step": 2830 + }, + { + "epoch": 0.6652611852892949, + "grad_norm": 0.3136105239391327, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0752, + "step": 2840 + }, + { + "epoch": 0.6676036542515812, + "grad_norm": 0.21903324127197266, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0416, + "step": 2850 + }, + { + "epoch": 0.6699461232138674, + "grad_norm": 0.5193045735359192, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0386, + "step": 2860 + }, + { + "epoch": 0.6722885921761537, + "grad_norm": 0.5539060235023499, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0439, + "step": 2870 + }, + { + "epoch": 0.67463106113844, + "grad_norm": 0.2819710969924927, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0541, + "step": 2880 + }, + { + "epoch": 0.6769735301007261, + "grad_norm": 0.1039167121052742, + "learning_rate": 8.517952785058385e-05, + "loss": 0.039, + "step": 2890 + }, + { + "epoch": 0.6793159990630124, + "grad_norm": 0.062352605164051056, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0401, + "step": 2900 + }, + { + "epoch": 0.6816584680252986, + "grad_norm": 0.5535932183265686, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0505, + "step": 2910 + }, + { + "epoch": 0.6840009369875849, + "grad_norm": 0.37601238489151, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0391, + "step": 2920 + }, + { + "epoch": 0.6863434059498712, + "grad_norm": 0.06856988370418549, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0389, + "step": 2930 + }, + { + "epoch": 0.6886858749121574, + "grad_norm": 0.5693712830543518, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0495, + "step": 2940 + }, + { + "epoch": 0.6910283438744437, + "grad_norm": 0.14418154954910278, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0405, + "step": 2950 + }, + { + "epoch": 0.69337081283673, + "grad_norm": 0.11880888044834137, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0362, + "step": 2960 + }, + { + "epoch": 0.6957132817990161, + "grad_norm": 0.6350199580192566, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0607, + "step": 2970 + }, + { + "epoch": 0.6980557507613024, + "grad_norm": 0.19949962198734283, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0346, + "step": 2980 + }, + { + "epoch": 0.7003982197235886, + "grad_norm": 0.19905024766921997, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0455, + "step": 2990 + }, + { + "epoch": 0.7027406886858749, + "grad_norm": 0.12724433839321136, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0312, + "step": 3000 + }, + { + "epoch": 0.7050831576481612, + "grad_norm": 0.6818522214889526, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0477, + "step": 3010 + }, + { + "epoch": 0.7074256266104474, + "grad_norm": 0.14397919178009033, + "learning_rate": 8.362004023673474e-05, + "loss": 0.054, + "step": 3020 + }, + { + "epoch": 0.7097680955727337, + "grad_norm": 0.1597958207130432, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0475, + "step": 3030 + }, + { + "epoch": 0.7121105645350199, + "grad_norm": 0.2985258102416992, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0276, + "step": 3040 + }, + { + "epoch": 0.7144530334973062, + "grad_norm": 0.17043350636959076, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0337, + "step": 3050 + }, + { + "epoch": 0.7167955024595924, + "grad_norm": 0.390009343624115, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0277, + "step": 3060 + }, + { + "epoch": 0.7191379714218786, + "grad_norm": 0.20475880801677704, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0691, + "step": 3070 + }, + { + "epoch": 0.7214804403841649, + "grad_norm": 0.11685507744550705, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0411, + "step": 3080 + }, + { + "epoch": 0.7238229093464511, + "grad_norm": 0.531944990158081, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0511, + "step": 3090 + }, + { + "epoch": 0.7261653783087374, + "grad_norm": 0.05079588294029236, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0664, + "step": 3100 + }, + { + "epoch": 0.7285078472710237, + "grad_norm": 0.3010249435901642, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0565, + "step": 3110 + }, + { + "epoch": 0.7308503162333099, + "grad_norm": 0.2115558385848999, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0615, + "step": 3120 + }, + { + "epoch": 0.7331927851955962, + "grad_norm": 0.3865530490875244, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0349, + "step": 3130 + }, + { + "epoch": 0.7355352541578825, + "grad_norm": 0.07815901935100555, + "learning_rate": 8.212530463322583e-05, + "loss": 0.036, + "step": 3140 + }, + { + "epoch": 0.7378777231201686, + "grad_norm": 0.11009709537029266, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0386, + "step": 3150 + }, + { + "epoch": 0.7402201920824549, + "grad_norm": 0.12392786890268326, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0396, + "step": 3160 + }, + { + "epoch": 0.7425626610447411, + "grad_norm": 0.16354168951511383, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0446, + "step": 3170 + }, + { + "epoch": 0.7449051300070274, + "grad_norm": 0.2223191112279892, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0326, + "step": 3180 + }, + { + "epoch": 0.7472475989693137, + "grad_norm": 0.176427960395813, + "learning_rate": 8.148743122865463e-05, + "loss": 0.0235, + "step": 3190 + }, + { + "epoch": 0.7495900679315999, + "grad_norm": 0.19706971943378448, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0417, + "step": 3200 + }, + { + "epoch": 0.7519325368938862, + "grad_norm": 0.08818463236093521, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0463, + "step": 3210 + }, + { + "epoch": 0.7542750058561724, + "grad_norm": 0.08389343321323395, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0259, + "step": 3220 + }, + { + "epoch": 0.7566174748184586, + "grad_norm": 0.13730217516422272, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0394, + "step": 3230 + }, + { + "epoch": 0.7589599437807449, + "grad_norm": 0.48324722051620483, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0488, + "step": 3240 + }, + { + "epoch": 0.7613024127430311, + "grad_norm": 0.15898984670639038, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0402, + "step": 3250 + }, + { + "epoch": 0.7636448817053174, + "grad_norm": 0.19997884333133698, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0406, + "step": 3260 + }, + { + "epoch": 0.7659873506676037, + "grad_norm": 0.06215028837323189, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0577, + "step": 3270 + }, + { + "epoch": 0.7683298196298899, + "grad_norm": 0.28326717019081116, + "learning_rate": 8.031768475274413e-05, + "loss": 0.057, + "step": 3280 + }, + { + "epoch": 0.7706722885921762, + "grad_norm": 0.29579654335975647, + "learning_rate": 8.018603611327504e-05, + "loss": 0.0563, + "step": 3290 + }, + { + "epoch": 0.7730147575544624, + "grad_norm": 0.5313428044319153, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0748, + "step": 3300 + }, + { + "epoch": 0.7753572265167487, + "grad_norm": 0.45142146944999695, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0344, + "step": 3310 + }, + { + "epoch": 0.777699695479035, + "grad_norm": 0.22848837077617645, + "learning_rate": 7.978911531372765e-05, + "loss": 0.0367, + "step": 3320 + }, + { + "epoch": 0.7800421644413211, + "grad_norm": 0.07316577434539795, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0332, + "step": 3330 + }, + { + "epoch": 0.7823846334036074, + "grad_norm": 0.08522647619247437, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0313, + "step": 3340 + }, + { + "epoch": 0.7847271023658936, + "grad_norm": 0.2560670077800751, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0753, + "step": 3350 + }, + { + "epoch": 0.7870695713281799, + "grad_norm": 0.2529207468032837, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0584, + "step": 3360 + }, + { + "epoch": 0.7894120402904662, + "grad_norm": 0.20108440518379211, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0443, + "step": 3370 + }, + { + "epoch": 0.7917545092527524, + "grad_norm": 0.09312764555215836, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0529, + "step": 3380 + }, + { + "epoch": 0.7940969782150387, + "grad_norm": 0.08973310142755508, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0313, + "step": 3390 + }, + { + "epoch": 0.796439447177325, + "grad_norm": 0.2917576730251312, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0699, + "step": 3400 + }, + { + "epoch": 0.7987819161396111, + "grad_norm": 0.3426614999771118, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0554, + "step": 3410 + }, + { + "epoch": 0.8011243851018974, + "grad_norm": 0.10231604427099228, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0469, + "step": 3420 + }, + { + "epoch": 0.8034668540641836, + "grad_norm": 0.36295169591903687, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0489, + "step": 3430 + }, + { + "epoch": 0.8058093230264699, + "grad_norm": 0.23017369210720062, + "learning_rate": 7.817250808190483e-05, + "loss": 0.0407, + "step": 3440 + }, + { + "epoch": 0.8081517919887562, + "grad_norm": 0.2438231259584427, + "learning_rate": 7.803575286758364e-05, + "loss": 0.0542, + "step": 3450 + }, + { + "epoch": 0.8104942609510424, + "grad_norm": 0.28502318263053894, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0433, + "step": 3460 + }, + { + "epoch": 0.8128367299133287, + "grad_norm": 0.7063993215560913, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0687, + "step": 3470 + }, + { + "epoch": 0.8151791988756149, + "grad_norm": 0.3574845492839813, + "learning_rate": 7.762365365649067e-05, + "loss": 0.0283, + "step": 3480 + }, + { + "epoch": 0.8175216678379011, + "grad_norm": 0.1527651846408844, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0502, + "step": 3490 + }, + { + "epoch": 0.8198641368001874, + "grad_norm": 0.20111270248889923, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0526, + "step": 3500 + }, + { + "epoch": 0.8222066057624736, + "grad_norm": 0.5221764445304871, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0385, + "step": 3510 + }, + { + "epoch": 0.8245490747247599, + "grad_norm": 0.11450177431106567, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0495, + "step": 3520 + }, + { + "epoch": 0.8268915436870461, + "grad_norm": 0.2669161558151245, + "learning_rate": 7.693080007570084e-05, + "loss": 0.0419, + "step": 3530 + }, + { + "epoch": 0.8292340126493324, + "grad_norm": 0.4859974682331085, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0454, + "step": 3540 + }, + { + "epoch": 0.8315764816116187, + "grad_norm": 0.13351887464523315, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0401, + "step": 3550 + }, + { + "epoch": 0.8339189505739049, + "grad_norm": 0.3376217484474182, + "learning_rate": 7.651154166637025e-05, + "loss": 0.0372, + "step": 3560 + }, + { + "epoch": 0.8362614195361912, + "grad_norm": 0.4906126856803894, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0444, + "step": 3570 + }, + { + "epoch": 0.8386038884984774, + "grad_norm": 0.1525869518518448, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0411, + "step": 3580 + }, + { + "epoch": 0.8409463574607636, + "grad_norm": 0.10655678063631058, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0322, + "step": 3590 + }, + { + "epoch": 0.8432888264230499, + "grad_norm": 0.6658011674880981, + "learning_rate": 7.594847868906076e-05, + "loss": 0.0736, + "step": 3600 + }, + { + "epoch": 0.8456312953853361, + "grad_norm": 0.2985578775405884, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0296, + "step": 3610 + }, + { + "epoch": 0.8479737643476224, + "grad_norm": 0.08989045768976212, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0412, + "step": 3620 + }, + { + "epoch": 0.8503162333099087, + "grad_norm": 0.37455546855926514, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0473, + "step": 3630 + }, + { + "epoch": 0.8526587022721949, + "grad_norm": 0.19339019060134888, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0475, + "step": 3640 + }, + { + "epoch": 0.8550011712344812, + "grad_norm": 0.22095589339733124, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0287, + "step": 3650 + }, + { + "epoch": 0.8573436401967673, + "grad_norm": 0.39905375242233276, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0418, + "step": 3660 + }, + { + "epoch": 0.8596861091590536, + "grad_norm": 0.1556907296180725, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0462, + "step": 3670 + }, + { + "epoch": 0.8620285781213399, + "grad_norm": 0.43170592188835144, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0453, + "step": 3680 + }, + { + "epoch": 0.8643710470836261, + "grad_norm": 0.09220433235168457, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0414, + "step": 3690 + }, + { + "epoch": 0.8667135160459124, + "grad_norm": 0.09303878992795944, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.8690559850081987, + "grad_norm": 0.456315279006958, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0429, + "step": 3710 + }, + { + "epoch": 0.8713984539704849, + "grad_norm": 0.0672278180718422, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0492, + "step": 3720 + }, + { + "epoch": 0.8737409229327712, + "grad_norm": 0.11052095890045166, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0404, + "step": 3730 + }, + { + "epoch": 0.8760833918950574, + "grad_norm": 0.20042133331298828, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0597, + "step": 3740 + }, + { + "epoch": 0.8784258608573436, + "grad_norm": 0.3536411225795746, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0212, + "step": 3750 + }, + { + "epoch": 0.8807683298196299, + "grad_norm": 0.28125354647636414, + "learning_rate": 7.365182146448205e-05, + "loss": 0.052, + "step": 3760 + }, + { + "epoch": 0.8831107987819161, + "grad_norm": 0.12258744984865189, + "learning_rate": 7.350601462458024e-05, + "loss": 0.02, + "step": 3770 + }, + { + "epoch": 0.8854532677442024, + "grad_norm": 0.5056569576263428, + "learning_rate": 7.335995072666848e-05, + "loss": 0.035, + "step": 3780 + }, + { + "epoch": 0.8877957367064886, + "grad_norm": 0.2552855610847473, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0421, + "step": 3790 + }, + { + "epoch": 0.8901382056687749, + "grad_norm": 0.05761013180017471, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0414, + "step": 3800 + }, + { + "epoch": 0.8924806746310612, + "grad_norm": 0.9745859503746033, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0725, + "step": 3810 + }, + { + "epoch": 0.8948231435933474, + "grad_norm": 0.2608197033405304, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0405, + "step": 3820 + }, + { + "epoch": 0.8971656125556337, + "grad_norm": 0.3153429329395294, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0407, + "step": 3830 + }, + { + "epoch": 0.8995080815179198, + "grad_norm": 0.5415343642234802, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0389, + "step": 3840 + }, + { + "epoch": 0.9018505504802061, + "grad_norm": 0.4772924482822418, + "learning_rate": 7.233044034264034e-05, + "loss": 0.055, + "step": 3850 + }, + { + "epoch": 0.9041930194424924, + "grad_norm": 0.41308316588401794, + "learning_rate": 7.218237771703921e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.9065354884047786, + "grad_norm": 0.0859963595867157, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0328, + "step": 3870 + }, + { + "epoch": 0.9088779573670649, + "grad_norm": 0.4168371856212616, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0493, + "step": 3880 + }, + { + "epoch": 0.9112204263293512, + "grad_norm": 0.42193326354026794, + "learning_rate": 7.173674083266624e-05, + "loss": 0.052, + "step": 3890 + }, + { + "epoch": 0.9135628952916374, + "grad_norm": 0.11540161073207855, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0616, + "step": 3900 + }, + { + "epoch": 0.9159053642539237, + "grad_norm": 0.1789163500070572, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0315, + "step": 3910 + }, + { + "epoch": 0.9182478332162098, + "grad_norm": 0.2873396873474121, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0577, + "step": 3920 + }, + { + "epoch": 0.9205903021784961, + "grad_norm": 0.035885997116565704, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0462, + "step": 3930 + }, + { + "epoch": 0.9229327711407824, + "grad_norm": 0.380929172039032, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0285, + "step": 3940 + }, + { + "epoch": 0.9252752401030686, + "grad_norm": 0.21406327188014984, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0255, + "step": 3950 + }, + { + "epoch": 0.9276177090653549, + "grad_norm": 0.04998482018709183, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0285, + "step": 3960 + }, + { + "epoch": 0.9299601780276411, + "grad_norm": 0.19604696333408356, + "learning_rate": 7.053803645765128e-05, + "loss": 0.0345, + "step": 3970 + }, + { + "epoch": 0.9323026469899274, + "grad_norm": 0.6424615979194641, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0411, + "step": 3980 + }, + { + "epoch": 0.9346451159522137, + "grad_norm": 0.0754154697060585, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0243, + "step": 3990 + }, + { + "epoch": 0.9369875849144998, + "grad_norm": 0.26757097244262695, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0414, + "step": 4000 + }, + { + "epoch": 0.9393300538767861, + "grad_norm": 0.14239585399627686, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0259, + "step": 4010 + }, + { + "epoch": 0.9416725228390724, + "grad_norm": 0.12988215684890747, + "learning_rate": 6.978149344295242e-05, + "loss": 0.0279, + "step": 4020 + }, + { + "epoch": 0.9440149918013586, + "grad_norm": 0.3678188920021057, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0353, + "step": 4030 + }, + { + "epoch": 0.9463574607636449, + "grad_norm": 0.6559092402458191, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0558, + "step": 4040 + }, + { + "epoch": 0.9486999297259311, + "grad_norm": 0.607363760471344, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0459, + "step": 4050 + }, + { + "epoch": 0.9510423986882174, + "grad_norm": 0.22406215965747833, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0412, + "step": 4060 + }, + { + "epoch": 0.9533848676505037, + "grad_norm": 0.2519318461418152, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0355, + "step": 4070 + }, + { + "epoch": 0.9557273366127899, + "grad_norm": 0.40484338998794556, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0446, + "step": 4080 + }, + { + "epoch": 0.9580698055750761, + "grad_norm": 0.36861318349838257, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0324, + "step": 4090 + }, + { + "epoch": 0.9604122745373623, + "grad_norm": 0.15483994781970978, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0265, + "step": 4100 + }, + { + "epoch": 0.9627547434996486, + "grad_norm": 0.12822240591049194, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0251, + "step": 4110 + }, + { + "epoch": 0.9650972124619349, + "grad_norm": 0.2436823546886444, + "learning_rate": 6.825239153500029e-05, + "loss": 0.0354, + "step": 4120 + }, + { + "epoch": 0.9674396814242211, + "grad_norm": 0.11992768943309784, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0355, + "step": 4130 + }, + { + "epoch": 0.9697821503865074, + "grad_norm": 0.05282627418637276, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0325, + "step": 4140 + }, + { + "epoch": 0.9721246193487937, + "grad_norm": 0.1702210009098053, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0421, + "step": 4150 + }, + { + "epoch": 0.9744670883110799, + "grad_norm": 0.30918455123901367, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0306, + "step": 4160 + }, + { + "epoch": 0.9768095572733662, + "grad_norm": 0.18471957743167877, + "learning_rate": 6.748025068294067e-05, + "loss": 0.026, + "step": 4170 + }, + { + "epoch": 0.9791520262356523, + "grad_norm": 0.2867111265659332, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0368, + "step": 4180 + }, + { + "epoch": 0.9814944951979386, + "grad_norm": 0.5615723729133606, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0374, + "step": 4190 + }, + { + "epoch": 0.9838369641602249, + "grad_norm": 0.06628378480672836, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0432, + "step": 4200 + }, + { + "epoch": 0.9861794331225111, + "grad_norm": 0.24212607741355896, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0446, + "step": 4210 + }, + { + "epoch": 0.9885219020847974, + "grad_norm": 0.1411833018064499, + "learning_rate": 6.670333090488356e-05, + "loss": 0.0281, + "step": 4220 + }, + { + "epoch": 0.9908643710470836, + "grad_norm": 0.4957182705402374, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0385, + "step": 4230 + }, + { + "epoch": 0.9932068400093699, + "grad_norm": 0.2773032486438751, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0351, + "step": 4240 + }, + { + "epoch": 0.9955493089716562, + "grad_norm": 0.6347845196723938, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0519, + "step": 4250 + }, + { + "epoch": 0.9978917779339423, + "grad_norm": 0.39392927289009094, + "learning_rate": 6.607849694751977e-05, + "loss": 0.0415, + "step": 4260 + }, + { + "epoch": 1.0002342468962286, + "grad_norm": 0.12185105681419373, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0413, + "step": 4270 + }, + { + "epoch": 1.0025767158585148, + "grad_norm": 0.4016129970550537, + "learning_rate": 6.576501813961609e-05, + "loss": 0.0473, + "step": 4280 + }, + { + "epoch": 1.0049191848208012, + "grad_norm": 0.10202305018901825, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0476, + "step": 4290 + }, + { + "epoch": 1.0072616537830874, + "grad_norm": 0.08643211424350739, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0363, + "step": 4300 + }, + { + "epoch": 1.0096041227453736, + "grad_norm": 0.4279628396034241, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0343, + "step": 4310 + }, + { + "epoch": 1.0119465917076598, + "grad_norm": 0.0435931533575058, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0363, + "step": 4320 + }, + { + "epoch": 1.0142890606699462, + "grad_norm": 0.11314094811677933, + "learning_rate": 6.497833413348909e-05, + "loss": 0.0409, + "step": 4330 + }, + { + "epoch": 1.0166315296322324, + "grad_norm": 0.049418941140174866, + "learning_rate": 6.48204990386577e-05, + "loss": 0.027, + "step": 4340 + }, + { + "epoch": 1.0189739985945185, + "grad_norm": 0.0937579795718193, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0386, + "step": 4350 + }, + { + "epoch": 1.021316467556805, + "grad_norm": 0.17256158590316772, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0283, + "step": 4360 + }, + { + "epoch": 1.0236589365190911, + "grad_norm": 0.41623151302337646, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0309, + "step": 4370 + }, + { + "epoch": 1.0260014054813773, + "grad_norm": 0.25574249029159546, + "learning_rate": 6.418755520036775e-05, + "loss": 0.017, + "step": 4380 + }, + { + "epoch": 1.0283438744436637, + "grad_norm": 0.12465788424015045, + "learning_rate": 6.402892702827916e-05, + "loss": 0.028, + "step": 4390 + }, + { + "epoch": 1.03068634340595, + "grad_norm": 0.2367735058069229, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0288, + "step": 4400 + }, + { + "epoch": 1.033028812368236, + "grad_norm": 0.15218676626682281, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0414, + "step": 4410 + }, + { + "epoch": 1.0353712813305225, + "grad_norm": 0.09345823526382446, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0348, + "step": 4420 + }, + { + "epoch": 1.0377137502928087, + "grad_norm": 0.25038620829582214, + "learning_rate": 6.339289753131649e-05, + "loss": 0.0472, + "step": 4430 + }, + { + "epoch": 1.0400562192550948, + "grad_norm": 0.5955792665481567, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0612, + "step": 4440 + }, + { + "epoch": 1.042398688217381, + "grad_norm": 0.10471931844949722, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0319, + "step": 4450 + }, + { + "epoch": 1.0447411571796674, + "grad_norm": 0.3728072941303253, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0346, + "step": 4460 + }, + { + "epoch": 1.0470836261419536, + "grad_norm": 0.13940206170082092, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0315, + "step": 4470 + }, + { + "epoch": 1.0494260951042398, + "grad_norm": 0.24892286956310272, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0271, + "step": 4480 + }, + { + "epoch": 1.0517685640665262, + "grad_norm": 0.09227164089679718, + "learning_rate": 6.243449435824276e-05, + "loss": 0.035, + "step": 4490 + }, + { + "epoch": 1.0541110330288124, + "grad_norm": 0.4062785804271698, + "learning_rate": 6.227427435703997e-05, + "loss": 0.0381, + "step": 4500 + }, + { + "epoch": 1.0564535019910986, + "grad_norm": 0.10490421950817108, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0424, + "step": 4510 + }, + { + "epoch": 1.058795970953385, + "grad_norm": 0.08822830021381378, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0484, + "step": 4520 + }, + { + "epoch": 1.0611384399156711, + "grad_norm": 0.22914232313632965, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0388, + "step": 4530 + }, + { + "epoch": 1.0634809088779573, + "grad_norm": 0.6712221503257751, + "learning_rate": 6.163206960055651e-05, + "loss": 0.0853, + "step": 4540 + }, + { + "epoch": 1.0658233778402435, + "grad_norm": 0.2438327521085739, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0177, + "step": 4550 + }, + { + "epoch": 1.06816584680253, + "grad_norm": 0.45352616906166077, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0798, + "step": 4560 + }, + { + "epoch": 1.070508315764816, + "grad_norm": 0.17237244546413422, + "learning_rate": 6.11490742250746e-05, + "loss": 0.037, + "step": 4570 + }, + { + "epoch": 1.0728507847271023, + "grad_norm": 0.7011030316352844, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0549, + "step": 4580 + }, + { + "epoch": 1.0751932536893887, + "grad_norm": 0.14807315170764923, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0483, + "step": 4590 + }, + { + "epoch": 1.0775357226516749, + "grad_norm": 0.42932969331741333, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0388, + "step": 4600 + }, + { + "epoch": 1.079878191613961, + "grad_norm": 0.13377119600772858, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0464, + "step": 4610 + }, + { + "epoch": 1.0822206605762474, + "grad_norm": 0.13043726980686188, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0234, + "step": 4620 + }, + { + "epoch": 1.0845631295385336, + "grad_norm": 0.23946554958820343, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0415, + "step": 4630 + }, + { + "epoch": 1.0869055985008198, + "grad_norm": 0.11139467358589172, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0397, + "step": 4640 + }, + { + "epoch": 1.0892480674631062, + "grad_norm": 0.1447746455669403, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0293, + "step": 4650 + }, + { + "epoch": 1.0915905364253924, + "grad_norm": 0.45925086736679077, + "learning_rate": 5.969369490868042e-05, + "loss": 0.03, + "step": 4660 + }, + { + "epoch": 1.0939330053876786, + "grad_norm": 0.2177567183971405, + "learning_rate": 5.953143243609235e-05, + "loss": 0.042, + "step": 4670 + }, + { + "epoch": 1.096275474349965, + "grad_norm": 0.20075875520706177, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0384, + "step": 4680 + }, + { + "epoch": 1.0986179433122512, + "grad_norm": 0.16894571483135223, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.0308, + "step": 4690 + }, + { + "epoch": 1.1009604122745373, + "grad_norm": 0.09761305898427963, + "learning_rate": 5.90440267166055e-05, + "loss": 0.0244, + "step": 4700 + }, + { + "epoch": 1.1033028812368235, + "grad_norm": 0.04163440316915512, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0191, + "step": 4710 + }, + { + "epoch": 1.10564535019911, + "grad_norm": 0.27570199966430664, + "learning_rate": 5.871859208889759e-05, + "loss": 0.0222, + "step": 4720 + }, + { + "epoch": 1.107987819161396, + "grad_norm": 0.2948501706123352, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0442, + "step": 4730 + }, + { + "epoch": 1.1103302881236823, + "grad_norm": 0.26524093747138977, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0347, + "step": 4740 + }, + { + "epoch": 1.1126727570859687, + "grad_norm": 0.26801493763923645, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0299, + "step": 4750 + }, + { + "epoch": 1.1150152260482549, + "grad_norm": 0.0498003289103508, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0289, + "step": 4760 + }, + { + "epoch": 1.117357695010541, + "grad_norm": 0.2827109694480896, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0328, + "step": 4770 + }, + { + "epoch": 1.1197001639728275, + "grad_norm": 0.18607333302497864, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0277, + "step": 4780 + }, + { + "epoch": 1.1220426329351136, + "grad_norm": 0.10899386554956436, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0397, + "step": 4790 + }, + { + "epoch": 1.1243851018973998, + "grad_norm": 0.9352733492851257, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0801, + "step": 4800 + }, + { + "epoch": 1.126727570859686, + "grad_norm": 0.15164723992347717, + "learning_rate": 5.72496226034123e-05, + "loss": 0.0572, + "step": 4810 + }, + { + "epoch": 1.1290700398219724, + "grad_norm": 0.06457802653312683, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.041, + "step": 4820 + }, + { + "epoch": 1.1314125087842586, + "grad_norm": 0.13067546486854553, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0345, + "step": 4830 + }, + { + "epoch": 1.1337549777465448, + "grad_norm": 0.4330101013183594, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0436, + "step": 4840 + }, + { + "epoch": 1.1360974467088312, + "grad_norm": 0.41848742961883545, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0216, + "step": 4850 + }, + { + "epoch": 1.1384399156711174, + "grad_norm": 0.13505397737026215, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0305, + "step": 4860 + }, + { + "epoch": 1.1407823846334035, + "grad_norm": 0.4569176435470581, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0324, + "step": 4870 + }, + { + "epoch": 1.14312485359569, + "grad_norm": 0.4705914556980133, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0478, + "step": 4880 + }, + { + "epoch": 1.1454673225579761, + "grad_norm": 0.276143342256546, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0578, + "step": 4890 + }, + { + "epoch": 1.1478097915202623, + "grad_norm": 0.3393331468105316, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0507, + "step": 4900 + }, + { + "epoch": 1.1501522604825487, + "grad_norm": 0.18119889497756958, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0197, + "step": 4910 + }, + { + "epoch": 1.1524947294448349, + "grad_norm": 0.0739196389913559, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0297, + "step": 4920 + }, + { + "epoch": 1.154837198407121, + "grad_norm": 0.22677703201770782, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0407, + "step": 4930 + }, + { + "epoch": 1.1571796673694075, + "grad_norm": 0.054532766342163086, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0448, + "step": 4940 + }, + { + "epoch": 1.1595221363316937, + "grad_norm": 0.45871463418006897, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0316, + "step": 4950 + }, + { + "epoch": 1.1618646052939798, + "grad_norm": 0.09669110924005508, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0404, + "step": 4960 + }, + { + "epoch": 1.164207074256266, + "grad_norm": 0.1810620278120041, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0233, + "step": 4970 + }, + { + "epoch": 1.1665495432185524, + "grad_norm": 0.10690245032310486, + "learning_rate": 5.445843903969854e-05, + "loss": 0.033, + "step": 4980 + }, + { + "epoch": 1.1688920121808386, + "grad_norm": 0.3685993552207947, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0204, + "step": 4990 + }, + { + "epoch": 1.1712344811431248, + "grad_norm": 0.17481215298175812, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0269, + "step": 5000 + }, + { + "epoch": 1.1735769501054112, + "grad_norm": 0.6450178027153015, + "learning_rate": 5.396416275909779e-05, + "loss": 0.052, + "step": 5010 + }, + { + "epoch": 1.1759194190676974, + "grad_norm": 0.0964297205209732, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0299, + "step": 5020 + }, + { + "epoch": 1.1782618880299836, + "grad_norm": 0.06013895943760872, + "learning_rate": 5.363442547846356e-05, + "loss": 0.0334, + "step": 5030 + }, + { + "epoch": 1.1806043569922697, + "grad_norm": 0.032787106931209564, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0506, + "step": 5040 + }, + { + "epoch": 1.1829468259545561, + "grad_norm": 0.3833360970020294, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0331, + "step": 5050 + }, + { + "epoch": 1.1852892949168423, + "grad_norm": 0.08078952878713608, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0171, + "step": 5060 + }, + { + "epoch": 1.1876317638791285, + "grad_norm": 0.09187212586402893, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0195, + "step": 5070 + }, + { + "epoch": 1.189974232841415, + "grad_norm": 0.2530211806297302, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0219, + "step": 5080 + }, + { + "epoch": 1.192316701803701, + "grad_norm": 0.059026945382356644, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0362, + "step": 5090 + }, + { + "epoch": 1.1946591707659873, + "grad_norm": 0.04210277274250984, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0314, + "step": 5100 + }, + { + "epoch": 1.1970016397282737, + "grad_norm": 0.4919138550758362, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0335, + "step": 5110 + }, + { + "epoch": 1.1993441086905599, + "grad_norm": 0.06546583771705627, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0223, + "step": 5120 + }, + { + "epoch": 1.201686577652846, + "grad_norm": 0.08152215927839279, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.2040290466151324, + "grad_norm": 0.2411283552646637, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0669, + "step": 5140 + }, + { + "epoch": 1.2063715155774186, + "grad_norm": 0.49666517972946167, + "learning_rate": 5.165316846586541e-05, + "loss": 0.041, + "step": 5150 + }, + { + "epoch": 1.2087139845397048, + "grad_norm": 0.08363020420074463, + "learning_rate": 5.148790314815663e-05, + "loss": 0.0209, + "step": 5160 + }, + { + "epoch": 1.2110564535019912, + "grad_norm": 0.04317115619778633, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0367, + "step": 5170 + }, + { + "epoch": 1.2133989224642774, + "grad_norm": 0.1066800057888031, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0399, + "step": 5180 + }, + { + "epoch": 1.2157413914265636, + "grad_norm": 0.17649437487125397, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0236, + "step": 5190 + }, + { + "epoch": 1.21808386038885, + "grad_norm": 0.14966139197349548, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0195, + "step": 5200 + }, + { + "epoch": 1.2204263293511362, + "grad_norm": 0.03593892604112625, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0202, + "step": 5210 + }, + { + "epoch": 1.2227687983134223, + "grad_norm": 0.46276217699050903, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0464, + "step": 5220 + }, + { + "epoch": 1.2251112672757085, + "grad_norm": 0.21946477890014648, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0301, + "step": 5230 + }, + { + "epoch": 1.227453736237995, + "grad_norm": 0.08784784376621246, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0316, + "step": 5240 + }, + { + "epoch": 1.229796205200281, + "grad_norm": 0.1410629153251648, + "learning_rate": 5e-05, + "loss": 0.0263, + "step": 5250 + }, + { + "epoch": 1.2321386741625673, + "grad_norm": 0.07868409156799316, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0213, + "step": 5260 + }, + { + "epoch": 1.2344811431248537, + "grad_norm": 0.215213343501091, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0457, + "step": 5270 + }, + { + "epoch": 1.2368236120871399, + "grad_norm": 0.16864515841007233, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0336, + "step": 5280 + }, + { + "epoch": 1.239166081049426, + "grad_norm": 0.0474487841129303, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0227, + "step": 5290 + }, + { + "epoch": 1.2415085500117122, + "grad_norm": 0.5898747444152832, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0165, + "step": 5300 + }, + { + "epoch": 1.2438510189739986, + "grad_norm": 0.4065062403678894, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.0271, + "step": 5310 + }, + { + "epoch": 1.2461934879362848, + "grad_norm": 0.19243858754634857, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0222, + "step": 5320 + }, + { + "epoch": 1.248535956898571, + "grad_norm": 0.14905819296836853, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0183, + "step": 5330 + }, + { + "epoch": 1.2508784258608574, + "grad_norm": 0.28917795419692993, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0194, + "step": 5340 + }, + { + "epoch": 1.2532208948231436, + "grad_norm": 0.3423207104206085, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0281, + "step": 5350 + }, + { + "epoch": 1.2555633637854298, + "grad_norm": 0.04684186726808548, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0266, + "step": 5360 + }, + { + "epoch": 1.2579058327477162, + "grad_norm": 0.27714163064956665, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0468, + "step": 5370 + }, + { + "epoch": 1.2602483017100023, + "grad_norm": 0.1844978630542755, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0392, + "step": 5380 + }, + { + "epoch": 1.2625907706722885, + "grad_norm": 0.36138930916786194, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.029, + "step": 5390 + }, + { + "epoch": 1.264933239634575, + "grad_norm": 0.3211914896965027, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0611, + "step": 5400 + }, + { + "epoch": 1.2672757085968611, + "grad_norm": 0.5163668990135193, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0481, + "step": 5410 + }, + { + "epoch": 1.2696181775591473, + "grad_norm": 0.5117266178131104, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0326, + "step": 5420 + }, + { + "epoch": 1.2719606465214337, + "grad_norm": 0.24475805461406708, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0229, + "step": 5430 + }, + { + "epoch": 1.2743031154837199, + "grad_norm": 0.07154544442892075, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.042, + "step": 5440 + }, + { + "epoch": 1.276645584446006, + "grad_norm": 0.28115877509117126, + "learning_rate": 4.669547078371504e-05, + "loss": 0.0249, + "step": 5450 + }, + { + "epoch": 1.2789880534082925, + "grad_norm": 0.22904540598392487, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0339, + "step": 5460 + }, + { + "epoch": 1.2813305223705787, + "grad_norm": 0.11327308416366577, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0175, + "step": 5470 + }, + { + "epoch": 1.2836729913328648, + "grad_norm": 0.1697210669517517, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0362, + "step": 5480 + }, + { + "epoch": 1.286015460295151, + "grad_norm": 0.08553613722324371, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0159, + "step": 5490 + }, + { + "epoch": 1.2883579292574374, + "grad_norm": 0.07890176773071289, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0288, + "step": 5500 + }, + { + "epoch": 1.2907003982197236, + "grad_norm": 0.33075398206710815, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0262, + "step": 5510 + }, + { + "epoch": 1.2930428671820098, + "grad_norm": 0.09929897636175156, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0231, + "step": 5520 + }, + { + "epoch": 1.295385336144296, + "grad_norm": 0.1128670945763588, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0201, + "step": 5530 + }, + { + "epoch": 1.2977278051065824, + "grad_norm": 0.05418454855680466, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0404, + "step": 5540 + }, + { + "epoch": 1.3000702740688685, + "grad_norm": 0.1747845560312271, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0324, + "step": 5550 + }, + { + "epoch": 1.3024127430311547, + "grad_norm": 0.6264855265617371, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0607, + "step": 5560 + }, + { + "epoch": 1.3047552119934411, + "grad_norm": 0.20012634992599487, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0336, + "step": 5570 + }, + { + "epoch": 1.3070976809557273, + "grad_norm": 0.07151951640844345, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0199, + "step": 5580 + }, + { + "epoch": 1.3094401499180135, + "grad_norm": 0.09090318530797958, + "learning_rate": 4.439004011435979e-05, + "loss": 0.0263, + "step": 5590 + }, + { + "epoch": 1.3117826188803, + "grad_norm": 0.09504502266645432, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.038, + "step": 5600 + }, + { + "epoch": 1.314125087842586, + "grad_norm": 0.19809271395206451, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0376, + "step": 5610 + }, + { + "epoch": 1.3164675568048723, + "grad_norm": 0.2558313012123108, + "learning_rate": 4.3897415459827e-05, + "loss": 0.043, + "step": 5620 + }, + { + "epoch": 1.3188100257671587, + "grad_norm": 0.08637325465679169, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0341, + "step": 5630 + }, + { + "epoch": 1.3211524947294448, + "grad_norm": 0.06880134344100952, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.0229, + "step": 5640 + }, + { + "epoch": 1.323494963691731, + "grad_norm": 0.16358880698680878, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0387, + "step": 5650 + }, + { + "epoch": 1.3258374326540174, + "grad_norm": 0.05642487108707428, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0291, + "step": 5660 + }, + { + "epoch": 1.3281799016163036, + "grad_norm": 0.13398276269435883, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0449, + "step": 5670 + }, + { + "epoch": 1.3305223705785898, + "grad_norm": 0.41730400919914246, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0336, + "step": 5680 + }, + { + "epoch": 1.3328648395408762, + "grad_norm": 0.1082252785563469, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0159, + "step": 5690 + }, + { + "epoch": 1.3352073085031624, + "grad_norm": 0.5044443607330322, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.026, + "step": 5700 + }, + { + "epoch": 1.3375497774654486, + "grad_norm": 0.19207948446273804, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0205, + "step": 5710 + }, + { + "epoch": 1.339892246427735, + "grad_norm": 0.14319565892219543, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0174, + "step": 5720 + }, + { + "epoch": 1.3422347153900211, + "grad_norm": 0.0638875886797905, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0272, + "step": 5730 + }, + { + "epoch": 1.3445771843523073, + "grad_norm": 0.5683619379997253, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0265, + "step": 5740 + }, + { + "epoch": 1.3469196533145935, + "grad_norm": 0.1282253861427307, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0285, + "step": 5750 + }, + { + "epoch": 1.3492621222768797, + "grad_norm": 0.2435198575258255, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0348, + "step": 5760 + }, + { + "epoch": 1.351604591239166, + "grad_norm": 0.10652618855237961, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0409, + "step": 5770 + }, + { + "epoch": 1.3539470602014523, + "grad_norm": 0.06271979957818985, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0559, + "step": 5780 + }, + { + "epoch": 1.3562895291637385, + "grad_norm": 0.05037263408303261, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0263, + "step": 5790 + }, + { + "epoch": 1.3586319981260249, + "grad_norm": 0.2569263279438019, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0259, + "step": 5800 + }, + { + "epoch": 1.360974467088311, + "grad_norm": 0.39117732644081116, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0413, + "step": 5810 + }, + { + "epoch": 1.3633169360505972, + "grad_norm": 0.0529431588947773, + "learning_rate": 4.063093427071376e-05, + "loss": 0.0615, + "step": 5820 + }, + { + "epoch": 1.3656594050128836, + "grad_norm": 0.18688374757766724, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0184, + "step": 5830 + }, + { + "epoch": 1.3680018739751698, + "grad_norm": 0.08132046461105347, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0203, + "step": 5840 + }, + { + "epoch": 1.370344342937456, + "grad_norm": 0.2862519323825836, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0497, + "step": 5850 + }, + { + "epoch": 1.3726868118997424, + "grad_norm": 0.12356792390346527, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0202, + "step": 5860 + }, + { + "epoch": 1.3750292808620286, + "grad_norm": 0.43368279933929443, + "learning_rate": 3.982016081781189e-05, + "loss": 0.0305, + "step": 5870 + }, + { + "epoch": 1.3773717498243148, + "grad_norm": 0.03974668309092522, + "learning_rate": 3.965833301517017e-05, + "loss": 0.0262, + "step": 5880 + }, + { + "epoch": 1.3797142187866012, + "grad_norm": 0.16461171209812164, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0375, + "step": 5890 + }, + { + "epoch": 1.3820566877488873, + "grad_norm": 0.06088129058480263, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0192, + "step": 5900 + }, + { + "epoch": 1.3843991567111735, + "grad_norm": 0.2690442204475403, + "learning_rate": 3.917353524881302e-05, + "loss": 0.0159, + "step": 5910 + }, + { + "epoch": 1.38674162567346, + "grad_norm": 0.09126674383878708, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0334, + "step": 5920 + }, + { + "epoch": 1.3890840946357461, + "grad_norm": 0.11212047934532166, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0208, + "step": 5930 + }, + { + "epoch": 1.3914265635980323, + "grad_norm": 0.03019798919558525, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0218, + "step": 5940 + }, + { + "epoch": 1.3937690325603187, + "grad_norm": 0.11158014088869095, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0327, + "step": 5950 + }, + { + "epoch": 1.3961115015226049, + "grad_norm": 0.1466631442308426, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0316, + "step": 5960 + }, + { + "epoch": 1.398453970484891, + "grad_norm": 0.04972492530941963, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0204, + "step": 5970 + }, + { + "epoch": 1.4007964394471772, + "grad_norm": 0.18622121214866638, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0446, + "step": 5980 + }, + { + "epoch": 1.4031389084094636, + "grad_norm": 0.4047488868236542, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0422, + "step": 5990 + }, + { + "epoch": 1.4054813773717498, + "grad_norm": 0.043907005339860916, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0279, + "step": 6000 + }, + { + "epoch": 1.407823846334036, + "grad_norm": 0.2679661214351654, + "learning_rate": 3.756550564175727e-05, + "loss": 0.0209, + "step": 6010 + }, + { + "epoch": 1.4101663152963222, + "grad_norm": 0.0252488162368536, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0187, + "step": 6020 + }, + { + "epoch": 1.4125087842586086, + "grad_norm": 0.03220526501536369, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0235, + "step": 6030 + }, + { + "epoch": 1.4148512532208948, + "grad_norm": 0.132725328207016, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0231, + "step": 6040 + }, + { + "epoch": 1.417193722183181, + "grad_norm": 0.17545637488365173, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0259, + "step": 6050 + }, + { + "epoch": 1.4195361911454674, + "grad_norm": 0.197429358959198, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0274, + "step": 6060 + }, + { + "epoch": 1.4218786601077535, + "grad_norm": 0.06819231063127518, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0355, + "step": 6070 + }, + { + "epoch": 1.4242211290700397, + "grad_norm": 0.16003242135047913, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0376, + "step": 6080 + }, + { + "epoch": 1.4265635980323261, + "grad_norm": 0.13673585653305054, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0213, + "step": 6090 + }, + { + "epoch": 1.4289060669946123, + "grad_norm": 0.15434902906417847, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0126, + "step": 6100 + }, + { + "epoch": 1.4312485359568985, + "grad_norm": 0.17395956814289093, + "learning_rate": 3.597107297172084e-05, + "loss": 0.084, + "step": 6110 + }, + { + "epoch": 1.433591004919185, + "grad_norm": 0.04844974726438522, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0219, + "step": 6120 + }, + { + "epoch": 1.435933473881471, + "grad_norm": 0.04163607209920883, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0279, + "step": 6130 + }, + { + "epoch": 1.4382759428437573, + "grad_norm": 0.11247994005680084, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0388, + "step": 6140 + }, + { + "epoch": 1.4406184118060437, + "grad_norm": 0.10106071829795837, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0197, + "step": 6150 + }, + { + "epoch": 1.4429608807683298, + "grad_norm": 0.3352503776550293, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0306, + "step": 6160 + }, + { + "epoch": 1.445303349730616, + "grad_norm": 0.23961161077022552, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0361, + "step": 6170 + }, + { + "epoch": 1.4476458186929024, + "grad_norm": 0.27124881744384766, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0215, + "step": 6180 + }, + { + "epoch": 1.4499882876551886, + "grad_norm": 0.19891873002052307, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0356, + "step": 6190 + }, + { + "epoch": 1.4523307566174748, + "grad_norm": 0.2036479115486145, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.024, + "step": 6200 + }, + { + "epoch": 1.4546732255797612, + "grad_norm": 0.20012417435646057, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0601, + "step": 6210 + }, + { + "epoch": 1.4570156945420474, + "grad_norm": 0.09231871366500854, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0264, + "step": 6220 + }, + { + "epoch": 1.4593581635043336, + "grad_norm": 0.08506989479064941, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0326, + "step": 6230 + }, + { + "epoch": 1.4617006324666197, + "grad_norm": 0.07338278740644455, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0261, + "step": 6240 + }, + { + "epoch": 1.4640431014289061, + "grad_norm": 0.04994959384202957, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0155, + "step": 6250 + }, + { + "epoch": 1.4663855703911923, + "grad_norm": 0.14995336532592773, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0215, + "step": 6260 + }, + { + "epoch": 1.4687280393534785, + "grad_norm": 0.08906183391809464, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0165, + "step": 6270 + }, + { + "epoch": 1.4710705083157647, + "grad_norm": 0.07266787439584732, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0368, + "step": 6280 + }, + { + "epoch": 1.473412977278051, + "grad_norm": 0.5040260553359985, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0261, + "step": 6290 + }, + { + "epoch": 1.4757554462403373, + "grad_norm": 0.1433238685131073, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0175, + "step": 6300 + }, + { + "epoch": 1.4780979152026235, + "grad_norm": 0.3917306363582611, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0235, + "step": 6310 + }, + { + "epoch": 1.4804403841649099, + "grad_norm": 0.07920818775892258, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0428, + "step": 6320 + }, + { + "epoch": 1.482782853127196, + "grad_norm": 0.07408647239208221, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0255, + "step": 6330 + }, + { + "epoch": 1.4851253220894822, + "grad_norm": 0.0957607626914978, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0279, + "step": 6340 + }, + { + "epoch": 1.4874677910517686, + "grad_norm": 0.372249037027359, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0302, + "step": 6350 + }, + { + "epoch": 1.4898102600140548, + "grad_norm": 0.20557020604610443, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0374, + "step": 6360 + }, + { + "epoch": 1.492152728976341, + "grad_norm": 0.2854403257369995, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0234, + "step": 6370 + }, + { + "epoch": 1.4944951979386274, + "grad_norm": 0.023650668561458588, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.028, + "step": 6380 + }, + { + "epoch": 1.4968376669009136, + "grad_norm": 0.3256511390209198, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0349, + "step": 6390 + }, + { + "epoch": 1.4991801358631998, + "grad_norm": 0.10362248122692108, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0181, + "step": 6400 + }, + { + "epoch": 1.5015226048254862, + "grad_norm": 0.22891394793987274, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0139, + "step": 6410 + }, + { + "epoch": 1.5038650737877723, + "grad_norm": 0.3262953460216522, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0276, + "step": 6420 + }, + { + "epoch": 1.5062075427500585, + "grad_norm": 0.04172496870160103, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0151, + "step": 6430 + }, + { + "epoch": 1.508550011712345, + "grad_norm": 0.10430093109607697, + "learning_rate": 3.082764475205442e-05, + "loss": 0.0151, + "step": 6440 + }, + { + "epoch": 1.510892480674631, + "grad_norm": 0.061427149921655655, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0257, + "step": 6450 + }, + { + "epoch": 1.5132349496369173, + "grad_norm": 0.3583897054195404, + "learning_rate": 3.052264965335e-05, + "loss": 0.0228, + "step": 6460 + }, + { + "epoch": 1.5155774185992037, + "grad_norm": 0.3000676929950714, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0319, + "step": 6470 + }, + { + "epoch": 1.5179198875614897, + "grad_norm": 0.054317738860845566, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0258, + "step": 6480 + }, + { + "epoch": 1.520262356523776, + "grad_norm": 0.554436981678009, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0423, + "step": 6490 + }, + { + "epoch": 1.5226048254860625, + "grad_norm": 0.047464508563280106, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0181, + "step": 6500 + }, + { + "epoch": 1.5249472944483484, + "grad_norm": 0.0966537818312645, + "learning_rate": 2.976391850971065e-05, + "loss": 0.059, + "step": 6510 + }, + { + "epoch": 1.5272897634106348, + "grad_norm": 0.2305019199848175, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0525, + "step": 6520 + }, + { + "epoch": 1.529632232372921, + "grad_norm": 0.0933275818824768, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.0181, + "step": 6530 + }, + { + "epoch": 1.5319747013352072, + "grad_norm": 0.06837865710258484, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0305, + "step": 6540 + }, + { + "epoch": 1.5343171702974936, + "grad_norm": 0.3509468138217926, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0246, + "step": 6550 + }, + { + "epoch": 1.5366596392597798, + "grad_norm": 0.09399297833442688, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0571, + "step": 6560 + }, + { + "epoch": 1.539002108222066, + "grad_norm": 0.04773678630590439, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.0219, + "step": 6570 + }, + { + "epoch": 1.5413445771843524, + "grad_norm": 0.5464432835578918, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.033, + "step": 6580 + }, + { + "epoch": 1.5436870461466385, + "grad_norm": 0.03716734051704407, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0238, + "step": 6590 + }, + { + "epoch": 1.5460295151089247, + "grad_norm": 0.07261854410171509, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.0355, + "step": 6600 + }, + { + "epoch": 1.5483719840712111, + "grad_norm": 0.3284207880496979, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0179, + "step": 6610 + }, + { + "epoch": 1.5507144530334973, + "grad_norm": 0.16059207916259766, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0221, + "step": 6620 + }, + { + "epoch": 1.5530569219957835, + "grad_norm": 0.1849690079689026, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0287, + "step": 6630 + }, + { + "epoch": 1.55539939095807, + "grad_norm": 0.39172595739364624, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0475, + "step": 6640 + }, + { + "epoch": 1.557741859920356, + "grad_norm": 0.30010920763015747, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0312, + "step": 6650 + }, + { + "epoch": 1.5600843288826423, + "grad_norm": 0.4200305640697479, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0276, + "step": 6660 + }, + { + "epoch": 1.5624267978449287, + "grad_norm": 0.06515451520681381, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0165, + "step": 6670 + }, + { + "epoch": 1.5647692668072148, + "grad_norm": 0.19618399441242218, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.019, + "step": 6680 + }, + { + "epoch": 1.567111735769501, + "grad_norm": 0.4492703378200531, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0244, + "step": 6690 + }, + { + "epoch": 1.5694542047317874, + "grad_norm": 0.18303832411766052, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0172, + "step": 6700 + }, + { + "epoch": 1.5717966736940734, + "grad_norm": 0.10762883722782135, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.0286, + "step": 6710 + }, + { + "epoch": 1.5741391426563598, + "grad_norm": 0.1200929656624794, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0264, + "step": 6720 + }, + { + "epoch": 1.5764816116186462, + "grad_norm": 0.025387238711118698, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0172, + "step": 6730 + }, + { + "epoch": 1.5788240805809322, + "grad_norm": 0.2033502608537674, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0206, + "step": 6740 + }, + { + "epoch": 1.5811665495432186, + "grad_norm": 0.1873401701450348, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0228, + "step": 6750 + }, + { + "epoch": 1.583509018505505, + "grad_norm": 0.3058501183986664, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.0178, + "step": 6760 + }, + { + "epoch": 1.585851487467791, + "grad_norm": 0.07519169896841049, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0242, + "step": 6770 + }, + { + "epoch": 1.5881939564300773, + "grad_norm": 0.11511756479740143, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0146, + "step": 6780 + }, + { + "epoch": 1.5905364253923635, + "grad_norm": 0.3080747425556183, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0226, + "step": 6790 + }, + { + "epoch": 1.5928788943546497, + "grad_norm": 0.04100322350859642, + "learning_rate": 2.547882480847461e-05, + "loss": 0.0201, + "step": 6800 + }, + { + "epoch": 1.595221363316936, + "grad_norm": 0.04346079006791115, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0132, + "step": 6810 + }, + { + "epoch": 1.5975638322792223, + "grad_norm": 0.37990304827690125, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0281, + "step": 6820 + }, + { + "epoch": 1.5999063012415085, + "grad_norm": 0.4856362044811249, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0314, + "step": 6830 + }, + { + "epoch": 1.6022487702037949, + "grad_norm": 0.10676129907369614, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0527, + "step": 6840 + }, + { + "epoch": 1.604591239166081, + "grad_norm": 0.05450962483882904, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0316, + "step": 6850 + }, + { + "epoch": 1.6069337081283672, + "grad_norm": 0.04609961807727814, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.0272, + "step": 6860 + }, + { + "epoch": 1.6092761770906536, + "grad_norm": 0.03132042661309242, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0249, + "step": 6870 + }, + { + "epoch": 1.6116186460529398, + "grad_norm": 0.11953801661729813, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0178, + "step": 6880 + }, + { + "epoch": 1.613961115015226, + "grad_norm": 0.1899009346961975, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0445, + "step": 6890 + }, + { + "epoch": 1.6163035839775124, + "grad_norm": 0.04766709730029106, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0144, + "step": 6900 + }, + { + "epoch": 1.6186460529397986, + "grad_norm": 0.40684422850608826, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.018, + "step": 6910 + }, + { + "epoch": 1.6209885219020848, + "grad_norm": 0.06992173194885254, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0183, + "step": 6920 + }, + { + "epoch": 1.6233309908643712, + "grad_norm": 0.050338905304670334, + "learning_rate": 2.362879302963135e-05, + "loss": 0.0237, + "step": 6930 + }, + { + "epoch": 1.6256734598266573, + "grad_norm": 0.19553512334823608, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0339, + "step": 6940 + }, + { + "epoch": 1.6280159287889435, + "grad_norm": 0.15470145642757416, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.0094, + "step": 6950 + }, + { + "epoch": 1.63035839775123, + "grad_norm": 0.23403486609458923, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.026, + "step": 6960 + }, + { + "epoch": 1.6327008667135159, + "grad_norm": 0.13263070583343506, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0197, + "step": 6970 + }, + { + "epoch": 1.6350433356758023, + "grad_norm": 0.4499634802341461, + "learning_rate": 2.29300341084631e-05, + "loss": 0.0183, + "step": 6980 + }, + { + "epoch": 1.6373858046380887, + "grad_norm": 0.020672103390097618, + "learning_rate": 2.279116432543705e-05, + "loss": 0.019, + "step": 6990 + }, + { + "epoch": 1.6397282736003747, + "grad_norm": 0.08733947575092316, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0317, + "step": 7000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.841140693728e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}