{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984532159196, "eval_steps": 500, "global_step": 122000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.3454886674880981, "learning_rate": 9.991815957246561e-06, "loss": 5.7653, "step": 100 }, { "epoch": 0.0, "grad_norm": 0.9953828454017639, "learning_rate": 9.983631914493123e-06, "loss": 5.6192, "step": 200 }, { "epoch": 0.0, "grad_norm": 0.7844876646995544, "learning_rate": 9.975447871739683e-06, "loss": 5.4452, "step": 300 }, { "epoch": 0.0, "grad_norm": 0.567190945148468, "learning_rate": 9.967263828986244e-06, "loss": 5.3183, "step": 400 }, { "epoch": 0.0, "grad_norm": 0.5304737687110901, "learning_rate": 9.959079786232804e-06, "loss": 5.2156, "step": 500 }, { "epoch": 0.0, "grad_norm": 0.3840586841106415, "learning_rate": 9.950895743479366e-06, "loss": 5.1203, "step": 600 }, { "epoch": 0.01, "grad_norm": 0.34571415185928345, "learning_rate": 9.942711700725926e-06, "loss": 5.0732, "step": 700 }, { "epoch": 0.01, "grad_norm": 0.32429179549217224, "learning_rate": 9.934527657972486e-06, "loss": 5.0458, "step": 800 }, { "epoch": 0.01, "grad_norm": 0.4024583101272583, "learning_rate": 9.926343615219047e-06, "loss": 5.0157, "step": 900 }, { "epoch": 0.01, "grad_norm": 0.3507966101169586, "learning_rate": 9.918159572465607e-06, "loss": 4.9921, "step": 1000 }, { "epoch": 0.01, "grad_norm": 0.37788429856300354, "learning_rate": 9.909975529712169e-06, "loss": 4.9803, "step": 1100 }, { "epoch": 0.01, "grad_norm": 0.3670382797718048, "learning_rate": 9.901791486958729e-06, "loss": 4.9651, "step": 1200 }, { "epoch": 0.01, "grad_norm": 0.2876070737838745, "learning_rate": 9.89360744420529e-06, "loss": 4.9585, "step": 1300 }, { "epoch": 0.01, "grad_norm": 0.3649226725101471, "learning_rate": 9.88542340145185e-06, "loss": 4.9491, "step": 1400 }, { "epoch": 0.01, "grad_norm": 0.39525964856147766, "learning_rate": 9.87723935869841e-06, "loss": 4.9416, "step": 1500 }, { "epoch": 0.01, "grad_norm": 0.30799126625061035, "learning_rate": 9.86905531594497e-06, "loss": 4.9318, "step": 1600 }, { "epoch": 0.01, "grad_norm": 0.2949569523334503, "learning_rate": 9.860871273191532e-06, "loss": 4.9201, "step": 1700 }, { "epoch": 0.01, "grad_norm": 0.3036958575248718, "learning_rate": 9.852687230438092e-06, "loss": 4.9212, "step": 1800 }, { "epoch": 0.02, "grad_norm": 0.7450295686721802, "learning_rate": 9.844503187684653e-06, "loss": 4.9166, "step": 1900 }, { "epoch": 0.02, "grad_norm": 0.439261794090271, "learning_rate": 9.836319144931213e-06, "loss": 4.9028, "step": 2000 }, { "epoch": 0.02, "grad_norm": 0.28415539860725403, "learning_rate": 9.828135102177775e-06, "loss": 4.8989, "step": 2100 }, { "epoch": 0.02, "grad_norm": 0.3490685522556305, "learning_rate": 9.819951059424335e-06, "loss": 4.9008, "step": 2200 }, { "epoch": 0.02, "grad_norm": 0.4549264907836914, "learning_rate": 9.811767016670895e-06, "loss": 4.8968, "step": 2300 }, { "epoch": 0.02, "grad_norm": 0.3350650668144226, "learning_rate": 9.803582973917457e-06, "loss": 4.8886, "step": 2400 }, { "epoch": 0.02, "grad_norm": 0.41439351439476013, "learning_rate": 9.795398931164018e-06, "loss": 4.8869, "step": 2500 }, { "epoch": 0.02, "grad_norm": 0.47745582461357117, "learning_rate": 9.787214888410578e-06, "loss": 4.8864, "step": 2600 }, { "epoch": 0.02, "grad_norm": 0.4192076027393341, "learning_rate": 9.779030845657138e-06, "loss": 4.886, "step": 2700 }, { "epoch": 0.02, "grad_norm": 0.38134631514549255, "learning_rate": 9.7708468029037e-06, "loss": 4.8803, "step": 2800 }, { "epoch": 0.02, "grad_norm": 1.0664669275283813, "learning_rate": 9.76266276015026e-06, "loss": 4.8764, "step": 2900 }, { "epoch": 0.02, "grad_norm": 0.6459155678749084, "learning_rate": 9.75447871739682e-06, "loss": 4.8721, "step": 3000 }, { "epoch": 0.03, "grad_norm": 0.4217374622821808, "learning_rate": 9.746294674643381e-06, "loss": 4.8758, "step": 3100 }, { "epoch": 0.03, "grad_norm": 1.4077221155166626, "learning_rate": 9.738110631889943e-06, "loss": 4.8694, "step": 3200 }, { "epoch": 0.03, "grad_norm": 0.793872058391571, "learning_rate": 9.729926589136503e-06, "loss": 4.872, "step": 3300 }, { "epoch": 0.03, "grad_norm": 0.6410694718360901, "learning_rate": 9.721742546383063e-06, "loss": 4.8697, "step": 3400 }, { "epoch": 0.03, "grad_norm": 0.9800311923027039, "learning_rate": 9.713558503629624e-06, "loss": 4.86, "step": 3500 }, { "epoch": 0.03, "grad_norm": 0.999591052532196, "learning_rate": 9.705374460876184e-06, "loss": 4.8601, "step": 3600 }, { "epoch": 0.03, "grad_norm": 1.2334632873535156, "learning_rate": 9.697190418122744e-06, "loss": 4.8603, "step": 3700 }, { "epoch": 0.03, "grad_norm": 0.6848945617675781, "learning_rate": 9.689006375369305e-06, "loss": 4.8537, "step": 3800 }, { "epoch": 0.03, "grad_norm": 0.4569337069988251, "learning_rate": 9.680822332615866e-06, "loss": 4.859, "step": 3900 }, { "epoch": 0.03, "grad_norm": 1.2655045986175537, "learning_rate": 9.672638289862427e-06, "loss": 4.8556, "step": 4000 }, { "epoch": 0.03, "grad_norm": 1.2269976139068604, "learning_rate": 9.664454247108987e-06, "loss": 4.8569, "step": 4100 }, { "epoch": 0.03, "grad_norm": 0.5749172568321228, "learning_rate": 9.656270204355547e-06, "loss": 4.8549, "step": 4200 }, { "epoch": 0.04, "grad_norm": 0.5450171232223511, "learning_rate": 9.64808616160211e-06, "loss": 4.853, "step": 4300 }, { "epoch": 0.04, "grad_norm": 0.6002165675163269, "learning_rate": 9.63990211884867e-06, "loss": 4.8495, "step": 4400 }, { "epoch": 0.04, "grad_norm": 0.4352850914001465, "learning_rate": 9.63171807609523e-06, "loss": 4.8471, "step": 4500 }, { "epoch": 0.04, "grad_norm": 0.8037896752357483, "learning_rate": 9.623534033341792e-06, "loss": 4.841, "step": 4600 }, { "epoch": 0.04, "grad_norm": 0.7141408324241638, "learning_rate": 9.615349990588352e-06, "loss": 4.8433, "step": 4700 }, { "epoch": 0.04, "grad_norm": 1.3830125331878662, "learning_rate": 9.607165947834912e-06, "loss": 4.84, "step": 4800 }, { "epoch": 0.04, "grad_norm": 1.2149487733840942, "learning_rate": 9.598981905081473e-06, "loss": 4.8414, "step": 4900 }, { "epoch": 0.04, "grad_norm": 1.5489853620529175, "learning_rate": 9.590797862328034e-06, "loss": 4.8425, "step": 5000 }, { "epoch": 0.04, "grad_norm": 1.0431714057922363, "learning_rate": 9.582613819574595e-06, "loss": 4.8369, "step": 5100 }, { "epoch": 0.04, "grad_norm": 1.2853955030441284, "learning_rate": 9.574429776821155e-06, "loss": 4.8411, "step": 5200 }, { "epoch": 0.04, "grad_norm": 0.6190789937973022, "learning_rate": 9.566245734067715e-06, "loss": 4.8381, "step": 5300 }, { "epoch": 0.04, "grad_norm": 0.9078882336616516, "learning_rate": 9.558061691314277e-06, "loss": 4.8355, "step": 5400 }, { "epoch": 0.05, "grad_norm": 0.6225169897079468, "learning_rate": 9.549877648560838e-06, "loss": 4.8382, "step": 5500 }, { "epoch": 0.05, "grad_norm": 0.5269715189933777, "learning_rate": 9.541693605807398e-06, "loss": 4.8368, "step": 5600 }, { "epoch": 0.05, "grad_norm": 0.5707417130470276, "learning_rate": 9.533509563053958e-06, "loss": 4.8331, "step": 5700 }, { "epoch": 0.05, "grad_norm": 1.0135433673858643, "learning_rate": 9.525325520300518e-06, "loss": 4.8372, "step": 5800 }, { "epoch": 0.05, "grad_norm": 0.9796397686004639, "learning_rate": 9.517141477547079e-06, "loss": 4.8301, "step": 5900 }, { "epoch": 0.05, "grad_norm": 0.6885799765586853, "learning_rate": 9.50895743479364e-06, "loss": 4.8309, "step": 6000 }, { "epoch": 0.05, "grad_norm": 0.8593583703041077, "learning_rate": 9.5007733920402e-06, "loss": 4.829, "step": 6100 }, { "epoch": 0.05, "grad_norm": 1.0259130001068115, "learning_rate": 9.492589349286761e-06, "loss": 4.8298, "step": 6200 }, { "epoch": 0.05, "grad_norm": 0.5682145357131958, "learning_rate": 9.484405306533321e-06, "loss": 4.8283, "step": 6300 }, { "epoch": 0.05, "grad_norm": 1.9293478727340698, "learning_rate": 9.476221263779882e-06, "loss": 4.8321, "step": 6400 }, { "epoch": 0.05, "grad_norm": 1.0222758054733276, "learning_rate": 9.468037221026444e-06, "loss": 4.8286, "step": 6500 }, { "epoch": 0.05, "grad_norm": 0.748030960559845, "learning_rate": 9.459853178273004e-06, "loss": 4.8277, "step": 6600 }, { "epoch": 0.05, "grad_norm": 1.7026931047439575, "learning_rate": 9.451669135519564e-06, "loss": 4.8227, "step": 6700 }, { "epoch": 0.06, "grad_norm": 0.8596793413162231, "learning_rate": 9.443485092766126e-06, "loss": 4.8296, "step": 6800 }, { "epoch": 0.06, "grad_norm": 1.9839978218078613, "learning_rate": 9.435301050012686e-06, "loss": 4.8298, "step": 6900 }, { "epoch": 0.06, "grad_norm": 2.7572035789489746, "learning_rate": 9.427117007259247e-06, "loss": 4.8285, "step": 7000 }, { "epoch": 0.06, "grad_norm": 1.3615273237228394, "learning_rate": 9.418932964505807e-06, "loss": 4.8263, "step": 7100 }, { "epoch": 0.06, "grad_norm": 0.8525121808052063, "learning_rate": 9.410748921752369e-06, "loss": 4.8233, "step": 7200 }, { "epoch": 0.06, "grad_norm": 0.6726747751235962, "learning_rate": 9.402564878998929e-06, "loss": 4.8232, "step": 7300 }, { "epoch": 0.06, "grad_norm": 2.404517412185669, "learning_rate": 9.39438083624549e-06, "loss": 4.8222, "step": 7400 }, { "epoch": 0.06, "grad_norm": 2.3438527584075928, "learning_rate": 9.38619679349205e-06, "loss": 4.8275, "step": 7500 }, { "epoch": 0.06, "grad_norm": 0.9558647871017456, "learning_rate": 9.378012750738612e-06, "loss": 4.8188, "step": 7600 }, { "epoch": 0.06, "grad_norm": 1.5687648057937622, "learning_rate": 9.369828707985172e-06, "loss": 4.8203, "step": 7700 }, { "epoch": 0.06, "grad_norm": 4.528586387634277, "learning_rate": 9.361644665231732e-06, "loss": 4.8237, "step": 7800 }, { "epoch": 0.06, "grad_norm": 1.5821335315704346, "learning_rate": 9.353460622478292e-06, "loss": 4.8213, "step": 7900 }, { "epoch": 0.07, "grad_norm": 1.3139538764953613, "learning_rate": 9.345276579724853e-06, "loss": 4.8241, "step": 8000 }, { "epoch": 0.07, "grad_norm": 0.6411435008049011, "learning_rate": 9.337092536971415e-06, "loss": 4.8176, "step": 8100 }, { "epoch": 0.07, "grad_norm": 2.011660575866699, "learning_rate": 9.328908494217975e-06, "loss": 4.8142, "step": 8200 }, { "epoch": 0.07, "grad_norm": 0.9855501055717468, "learning_rate": 9.320724451464535e-06, "loss": 4.816, "step": 8300 }, { "epoch": 0.07, "grad_norm": 0.8996455073356628, "learning_rate": 9.312540408711095e-06, "loss": 4.8162, "step": 8400 }, { "epoch": 0.07, "grad_norm": 1.75440514087677, "learning_rate": 9.304356365957656e-06, "loss": 4.8174, "step": 8500 }, { "epoch": 0.07, "grad_norm": 0.5616024136543274, "learning_rate": 9.296172323204216e-06, "loss": 4.8137, "step": 8600 }, { "epoch": 0.07, "grad_norm": 1.5236667394638062, "learning_rate": 9.287988280450778e-06, "loss": 4.8156, "step": 8700 }, { "epoch": 0.07, "grad_norm": 0.869793176651001, "learning_rate": 9.279804237697338e-06, "loss": 4.8201, "step": 8800 }, { "epoch": 0.07, "grad_norm": 2.2847354412078857, "learning_rate": 9.271620194943898e-06, "loss": 4.816, "step": 8900 }, { "epoch": 0.07, "grad_norm": 1.1544415950775146, "learning_rate": 9.26343615219046e-06, "loss": 4.8174, "step": 9000 }, { "epoch": 0.07, "grad_norm": 0.6653382182121277, "learning_rate": 9.25525210943702e-06, "loss": 4.8136, "step": 9100 }, { "epoch": 0.08, "grad_norm": 1.0501242876052856, "learning_rate": 9.247068066683581e-06, "loss": 4.8104, "step": 9200 }, { "epoch": 0.08, "grad_norm": 0.6880828142166138, "learning_rate": 9.238884023930141e-06, "loss": 4.8093, "step": 9300 }, { "epoch": 0.08, "grad_norm": 0.9488196969032288, "learning_rate": 9.230699981176703e-06, "loss": 4.8121, "step": 9400 }, { "epoch": 0.08, "grad_norm": 4.403717041015625, "learning_rate": 9.222515938423263e-06, "loss": 4.8113, "step": 9500 }, { "epoch": 0.08, "grad_norm": 2.2665512561798096, "learning_rate": 9.214331895669824e-06, "loss": 4.8092, "step": 9600 }, { "epoch": 0.08, "grad_norm": 0.7057957053184509, "learning_rate": 9.206147852916384e-06, "loss": 4.8108, "step": 9700 }, { "epoch": 0.08, "grad_norm": 3.9034390449523926, "learning_rate": 9.197963810162946e-06, "loss": 4.808, "step": 9800 }, { "epoch": 0.08, "grad_norm": 0.9307584762573242, "learning_rate": 9.189779767409506e-06, "loss": 4.809, "step": 9900 }, { "epoch": 0.08, "grad_norm": 3.995647668838501, "learning_rate": 9.181595724656066e-06, "loss": 4.8079, "step": 10000 }, { "epoch": 0.08, "grad_norm": 2.2378621101379395, "learning_rate": 9.173411681902627e-06, "loss": 4.81, "step": 10100 }, { "epoch": 0.08, "grad_norm": 1.424851417541504, "learning_rate": 9.165227639149189e-06, "loss": 4.8045, "step": 10200 }, { "epoch": 0.08, "grad_norm": 1.8927842378616333, "learning_rate": 9.157043596395749e-06, "loss": 4.8085, "step": 10300 }, { "epoch": 0.09, "grad_norm": 3.9594876766204834, "learning_rate": 9.14885955364231e-06, "loss": 4.808, "step": 10400 }, { "epoch": 0.09, "grad_norm": 2.6992530822753906, "learning_rate": 9.14067551088887e-06, "loss": 4.8099, "step": 10500 }, { "epoch": 0.09, "grad_norm": 1.1311434507369995, "learning_rate": 9.13249146813543e-06, "loss": 4.8071, "step": 10600 }, { "epoch": 0.09, "grad_norm": 0.8243028521537781, "learning_rate": 9.12430742538199e-06, "loss": 4.8056, "step": 10700 }, { "epoch": 0.09, "grad_norm": 1.7261260747909546, "learning_rate": 9.11612338262855e-06, "loss": 4.8076, "step": 10800 }, { "epoch": 0.09, "grad_norm": 4.485065460205078, "learning_rate": 9.107939339875112e-06, "loss": 4.8072, "step": 10900 }, { "epoch": 0.09, "grad_norm": 1.044236660003662, "learning_rate": 9.099755297121673e-06, "loss": 4.8076, "step": 11000 }, { "epoch": 0.09, "grad_norm": 2.155515670776367, "learning_rate": 9.091571254368233e-06, "loss": 4.8061, "step": 11100 }, { "epoch": 0.09, "grad_norm": 1.4560600519180298, "learning_rate": 9.083387211614795e-06, "loss": 4.8079, "step": 11200 }, { "epoch": 0.09, "grad_norm": 2.577777624130249, "learning_rate": 9.075203168861355e-06, "loss": 4.8058, "step": 11300 }, { "epoch": 0.09, "grad_norm": 2.1256022453308105, "learning_rate": 9.067019126107915e-06, "loss": 4.8047, "step": 11400 }, { "epoch": 0.09, "grad_norm": 0.9217951893806458, "learning_rate": 9.058835083354476e-06, "loss": 4.8047, "step": 11500 }, { "epoch": 0.09, "grad_norm": 1.4003558158874512, "learning_rate": 9.050651040601038e-06, "loss": 4.8006, "step": 11600 }, { "epoch": 0.1, "grad_norm": 1.1220279932022095, "learning_rate": 9.042466997847598e-06, "loss": 4.8061, "step": 11700 }, { "epoch": 0.1, "grad_norm": 3.371317148208618, "learning_rate": 9.034282955094158e-06, "loss": 4.8034, "step": 11800 }, { "epoch": 0.1, "grad_norm": 1.5556405782699585, "learning_rate": 9.026098912340718e-06, "loss": 4.8022, "step": 11900 }, { "epoch": 0.1, "grad_norm": 0.9803473949432373, "learning_rate": 9.01791486958728e-06, "loss": 4.8029, "step": 12000 }, { "epoch": 0.1, "grad_norm": 0.7116793990135193, "learning_rate": 9.00973082683384e-06, "loss": 4.8047, "step": 12100 }, { "epoch": 0.1, "grad_norm": 0.8312737941741943, "learning_rate": 9.0015467840804e-06, "loss": 4.8004, "step": 12200 }, { "epoch": 0.1, "grad_norm": 2.492680072784424, "learning_rate": 8.993362741326963e-06, "loss": 4.8031, "step": 12300 }, { "epoch": 0.1, "grad_norm": 1.0807607173919678, "learning_rate": 8.985178698573523e-06, "loss": 4.7997, "step": 12400 }, { "epoch": 0.1, "grad_norm": 4.5318403244018555, "learning_rate": 8.976994655820083e-06, "loss": 4.8052, "step": 12500 }, { "epoch": 0.1, "grad_norm": 6.294131278991699, "learning_rate": 8.968810613066644e-06, "loss": 4.8025, "step": 12600 }, { "epoch": 0.1, "grad_norm": 1.2838228940963745, "learning_rate": 8.960626570313204e-06, "loss": 4.8014, "step": 12700 }, { "epoch": 0.1, "grad_norm": 3.4684486389160156, "learning_rate": 8.952442527559764e-06, "loss": 4.7991, "step": 12800 }, { "epoch": 0.11, "grad_norm": 0.9761227965354919, "learning_rate": 8.944258484806324e-06, "loss": 4.8013, "step": 12900 }, { "epoch": 0.11, "grad_norm": 5.355940341949463, "learning_rate": 8.936074442052886e-06, "loss": 4.8014, "step": 13000 }, { "epoch": 0.11, "grad_norm": 3.334829568862915, "learning_rate": 8.927890399299447e-06, "loss": 4.7994, "step": 13100 }, { "epoch": 0.11, "grad_norm": 4.577433109283447, "learning_rate": 8.919706356546007e-06, "loss": 4.7931, "step": 13200 }, { "epoch": 0.11, "grad_norm": 4.9149322509765625, "learning_rate": 8.911522313792567e-06, "loss": 4.8025, "step": 13300 }, { "epoch": 0.11, "grad_norm": 1.5174356698989868, "learning_rate": 8.903338271039129e-06, "loss": 4.7982, "step": 13400 }, { "epoch": 0.11, "grad_norm": 4.420125961303711, "learning_rate": 8.89515422828569e-06, "loss": 4.7976, "step": 13500 }, { "epoch": 0.11, "grad_norm": 4.246557712554932, "learning_rate": 8.88697018553225e-06, "loss": 4.7954, "step": 13600 }, { "epoch": 0.11, "grad_norm": 1.887624979019165, "learning_rate": 8.87878614277881e-06, "loss": 4.7956, "step": 13700 }, { "epoch": 0.11, "grad_norm": 1.2699388265609741, "learning_rate": 8.870602100025372e-06, "loss": 4.7993, "step": 13800 }, { "epoch": 0.11, "grad_norm": 2.5432891845703125, "learning_rate": 8.862418057271932e-06, "loss": 4.8017, "step": 13900 }, { "epoch": 0.11, "grad_norm": 3.0773308277130127, "learning_rate": 8.854234014518492e-06, "loss": 4.8002, "step": 14000 }, { "epoch": 0.12, "grad_norm": 2.0699973106384277, "learning_rate": 8.846049971765053e-06, "loss": 4.7958, "step": 14100 }, { "epoch": 0.12, "grad_norm": 0.9238589406013489, "learning_rate": 8.837865929011615e-06, "loss": 4.7907, "step": 14200 }, { "epoch": 0.12, "grad_norm": 1.0784467458724976, "learning_rate": 8.829681886258175e-06, "loss": 4.797, "step": 14300 }, { "epoch": 0.12, "grad_norm": 1.1584768295288086, "learning_rate": 8.821497843504735e-06, "loss": 4.7947, "step": 14400 }, { "epoch": 0.12, "grad_norm": 1.0812736749649048, "learning_rate": 8.813313800751297e-06, "loss": 4.7894, "step": 14500 }, { "epoch": 0.12, "grad_norm": 1.7061692476272583, "learning_rate": 8.805129757997857e-06, "loss": 4.7946, "step": 14600 }, { "epoch": 0.12, "grad_norm": 2.771735429763794, "learning_rate": 8.796945715244418e-06, "loss": 4.7973, "step": 14700 }, { "epoch": 0.12, "grad_norm": 2.1708245277404785, "learning_rate": 8.788761672490978e-06, "loss": 4.7953, "step": 14800 }, { "epoch": 0.12, "grad_norm": 2.4777073860168457, "learning_rate": 8.780577629737538e-06, "loss": 4.7917, "step": 14900 }, { "epoch": 0.12, "grad_norm": 1.5953549146652222, "learning_rate": 8.772393586984098e-06, "loss": 4.7917, "step": 15000 }, { "epoch": 0.12, "grad_norm": 0.7896368503570557, "learning_rate": 8.76420954423066e-06, "loss": 4.7923, "step": 15100 }, { "epoch": 0.12, "grad_norm": 0.7050272822380066, "learning_rate": 8.75602550147722e-06, "loss": 4.7977, "step": 15200 }, { "epoch": 0.13, "grad_norm": 4.6306023597717285, "learning_rate": 8.747841458723781e-06, "loss": 4.7911, "step": 15300 }, { "epoch": 0.13, "grad_norm": 2.3503997325897217, "learning_rate": 8.739657415970341e-06, "loss": 4.7895, "step": 15400 }, { "epoch": 0.13, "grad_norm": 4.242427825927734, "learning_rate": 8.731473373216901e-06, "loss": 4.7902, "step": 15500 }, { "epoch": 0.13, "grad_norm": 0.8283806443214417, "learning_rate": 8.723289330463463e-06, "loss": 4.7938, "step": 15600 }, { "epoch": 0.13, "grad_norm": 3.901630401611328, "learning_rate": 8.715105287710024e-06, "loss": 4.7936, "step": 15700 }, { "epoch": 0.13, "grad_norm": 6.563976764678955, "learning_rate": 8.706921244956584e-06, "loss": 4.7907, "step": 15800 }, { "epoch": 0.13, "grad_norm": 2.2243263721466064, "learning_rate": 8.698737202203144e-06, "loss": 4.7852, "step": 15900 }, { "epoch": 0.13, "grad_norm": 2.0529608726501465, "learning_rate": 8.690553159449706e-06, "loss": 4.7916, "step": 16000 }, { "epoch": 0.13, "grad_norm": 3.0728704929351807, "learning_rate": 8.682369116696266e-06, "loss": 4.7947, "step": 16100 }, { "epoch": 0.13, "grad_norm": 0.8949060440063477, "learning_rate": 8.674185073942827e-06, "loss": 4.7935, "step": 16200 }, { "epoch": 0.13, "grad_norm": 0.9687060713768005, "learning_rate": 8.666001031189387e-06, "loss": 4.7939, "step": 16300 }, { "epoch": 0.13, "grad_norm": 1.091792345046997, "learning_rate": 8.657816988435949e-06, "loss": 4.7941, "step": 16400 }, { "epoch": 0.14, "grad_norm": 0.7492835521697998, "learning_rate": 8.64963294568251e-06, "loss": 4.7897, "step": 16500 }, { "epoch": 0.14, "grad_norm": 1.856666088104248, "learning_rate": 8.64144890292907e-06, "loss": 4.7917, "step": 16600 }, { "epoch": 0.14, "grad_norm": 2.1373181343078613, "learning_rate": 8.633264860175631e-06, "loss": 4.7875, "step": 16700 }, { "epoch": 0.14, "grad_norm": 3.3695366382598877, "learning_rate": 8.625080817422192e-06, "loss": 4.7935, "step": 16800 }, { "epoch": 0.14, "grad_norm": 3.0197083950042725, "learning_rate": 8.616896774668752e-06, "loss": 4.7896, "step": 16900 }, { "epoch": 0.14, "grad_norm": 2.717109441757202, "learning_rate": 8.608712731915312e-06, "loss": 4.7898, "step": 17000 }, { "epoch": 0.14, "grad_norm": 2.4040961265563965, "learning_rate": 8.600528689161873e-06, "loss": 4.7919, "step": 17100 }, { "epoch": 0.14, "grad_norm": 7.917181968688965, "learning_rate": 8.592344646408434e-06, "loss": 4.7904, "step": 17200 }, { "epoch": 0.14, "grad_norm": 1.2932788133621216, "learning_rate": 8.584160603654995e-06, "loss": 4.7856, "step": 17300 }, { "epoch": 0.14, "grad_norm": 4.480923175811768, "learning_rate": 8.575976560901555e-06, "loss": 4.7952, "step": 17400 }, { "epoch": 0.14, "grad_norm": 1.211406946182251, "learning_rate": 8.567792518148115e-06, "loss": 4.7903, "step": 17500 }, { "epoch": 0.14, "grad_norm": 1.0489588975906372, "learning_rate": 8.559608475394676e-06, "loss": 4.7941, "step": 17600 }, { "epoch": 0.14, "grad_norm": 3.5961625576019287, "learning_rate": 8.551424432641236e-06, "loss": 4.7845, "step": 17700 }, { "epoch": 0.15, "grad_norm": 1.3053170442581177, "learning_rate": 8.543240389887798e-06, "loss": 4.7919, "step": 17800 }, { "epoch": 0.15, "grad_norm": 0.7930333614349365, "learning_rate": 8.535056347134358e-06, "loss": 4.7893, "step": 17900 }, { "epoch": 0.15, "grad_norm": 0.9989149570465088, "learning_rate": 8.526872304380918e-06, "loss": 4.7879, "step": 18000 }, { "epoch": 0.15, "grad_norm": 7.412938594818115, "learning_rate": 8.518688261627479e-06, "loss": 4.7876, "step": 18100 }, { "epoch": 0.15, "grad_norm": 0.9795100688934326, "learning_rate": 8.51050421887404e-06, "loss": 4.7895, "step": 18200 }, { "epoch": 0.15, "grad_norm": 1.8868602514266968, "learning_rate": 8.5023201761206e-06, "loss": 4.7895, "step": 18300 }, { "epoch": 0.15, "grad_norm": 0.9831721186637878, "learning_rate": 8.494136133367161e-06, "loss": 4.7874, "step": 18400 }, { "epoch": 0.15, "grad_norm": 3.5411503314971924, "learning_rate": 8.485952090613721e-06, "loss": 4.786, "step": 18500 }, { "epoch": 0.15, "grad_norm": 2.617466926574707, "learning_rate": 8.477768047860283e-06, "loss": 4.7832, "step": 18600 }, { "epoch": 0.15, "grad_norm": 2.1326940059661865, "learning_rate": 8.469584005106844e-06, "loss": 4.789, "step": 18700 }, { "epoch": 0.15, "grad_norm": 2.020888328552246, "learning_rate": 8.461399962353404e-06, "loss": 4.7867, "step": 18800 }, { "epoch": 0.15, "grad_norm": 2.366783618927002, "learning_rate": 8.453215919599966e-06, "loss": 4.7871, "step": 18900 }, { "epoch": 0.16, "grad_norm": 2.739832878112793, "learning_rate": 8.445031876846526e-06, "loss": 4.7829, "step": 19000 }, { "epoch": 0.16, "grad_norm": 1.9564040899276733, "learning_rate": 8.436847834093086e-06, "loss": 4.7901, "step": 19100 }, { "epoch": 0.16, "grad_norm": 4.063528537750244, "learning_rate": 8.428663791339647e-06, "loss": 4.7839, "step": 19200 }, { "epoch": 0.16, "grad_norm": 1.9199309349060059, "learning_rate": 8.420479748586209e-06, "loss": 4.7919, "step": 19300 }, { "epoch": 0.16, "grad_norm": 0.8330548405647278, "learning_rate": 8.412295705832769e-06, "loss": 4.7877, "step": 19400 }, { "epoch": 0.16, "grad_norm": 2.459280490875244, "learning_rate": 8.404111663079329e-06, "loss": 4.7854, "step": 19500 }, { "epoch": 0.16, "grad_norm": 5.8329572677612305, "learning_rate": 8.39592762032589e-06, "loss": 4.7864, "step": 19600 }, { "epoch": 0.16, "grad_norm": 1.6338814496994019, "learning_rate": 8.38774357757245e-06, "loss": 4.7855, "step": 19700 }, { "epoch": 0.16, "grad_norm": 0.8343069553375244, "learning_rate": 8.37955953481901e-06, "loss": 4.786, "step": 19800 }, { "epoch": 0.16, "grad_norm": 1.5812463760375977, "learning_rate": 8.37137549206557e-06, "loss": 4.784, "step": 19900 }, { "epoch": 0.16, "grad_norm": 1.3143149614334106, "learning_rate": 8.363191449312132e-06, "loss": 4.7761, "step": 20000 }, { "epoch": 0.16, "grad_norm": 2.27919340133667, "learning_rate": 8.355007406558692e-06, "loss": 4.7841, "step": 20100 }, { "epoch": 0.17, "grad_norm": 2.111786365509033, "learning_rate": 8.346823363805253e-06, "loss": 4.7902, "step": 20200 }, { "epoch": 0.17, "grad_norm": 1.4459553956985474, "learning_rate": 8.338639321051813e-06, "loss": 4.7827, "step": 20300 }, { "epoch": 0.17, "grad_norm": 5.489434242248535, "learning_rate": 8.330455278298375e-06, "loss": 4.7804, "step": 20400 }, { "epoch": 0.17, "grad_norm": 0.8438489437103271, "learning_rate": 8.322271235544935e-06, "loss": 4.7826, "step": 20500 }, { "epoch": 0.17, "grad_norm": 0.8289409875869751, "learning_rate": 8.314087192791495e-06, "loss": 4.788, "step": 20600 }, { "epoch": 0.17, "grad_norm": 2.2698094844818115, "learning_rate": 8.305903150038056e-06, "loss": 4.785, "step": 20700 }, { "epoch": 0.17, "grad_norm": 6.737819194793701, "learning_rate": 8.297719107284618e-06, "loss": 4.7879, "step": 20800 }, { "epoch": 0.17, "grad_norm": 1.1006556749343872, "learning_rate": 8.289535064531178e-06, "loss": 4.7861, "step": 20900 }, { "epoch": 0.17, "grad_norm": 0.757918119430542, "learning_rate": 8.281351021777738e-06, "loss": 4.7838, "step": 21000 }, { "epoch": 0.17, "grad_norm": 1.443129539489746, "learning_rate": 8.273166979024298e-06, "loss": 4.7815, "step": 21100 }, { "epoch": 0.17, "grad_norm": 1.2434520721435547, "learning_rate": 8.26498293627086e-06, "loss": 4.7837, "step": 21200 }, { "epoch": 0.17, "grad_norm": 0.9488204717636108, "learning_rate": 8.25679889351742e-06, "loss": 4.7826, "step": 21300 }, { "epoch": 0.18, "grad_norm": 1.4644496440887451, "learning_rate": 8.248614850763981e-06, "loss": 4.7805, "step": 21400 }, { "epoch": 0.18, "grad_norm": 1.1131641864776611, "learning_rate": 8.240430808010543e-06, "loss": 4.7848, "step": 21500 }, { "epoch": 0.18, "grad_norm": 0.9792927503585815, "learning_rate": 8.232246765257103e-06, "loss": 4.7844, "step": 21600 }, { "epoch": 0.18, "grad_norm": 6.909112453460693, "learning_rate": 8.224062722503663e-06, "loss": 4.7791, "step": 21700 }, { "epoch": 0.18, "grad_norm": 1.1648467779159546, "learning_rate": 8.215878679750224e-06, "loss": 4.7834, "step": 21800 }, { "epoch": 0.18, "grad_norm": 1.4722325801849365, "learning_rate": 8.207694636996784e-06, "loss": 4.7775, "step": 21900 }, { "epoch": 0.18, "grad_norm": 0.8322229385375977, "learning_rate": 8.199510594243344e-06, "loss": 4.7848, "step": 22000 }, { "epoch": 0.18, "grad_norm": 1.4244656562805176, "learning_rate": 8.191326551489906e-06, "loss": 4.7784, "step": 22100 }, { "epoch": 0.18, "grad_norm": 3.1128671169281006, "learning_rate": 8.183142508736466e-06, "loss": 4.7819, "step": 22200 }, { "epoch": 0.18, "grad_norm": 2.6106808185577393, "learning_rate": 8.174958465983027e-06, "loss": 4.7791, "step": 22300 }, { "epoch": 0.18, "grad_norm": 5.204963207244873, "learning_rate": 8.166774423229587e-06, "loss": 4.7841, "step": 22400 }, { "epoch": 0.18, "grad_norm": 0.9447894096374512, "learning_rate": 8.158590380476147e-06, "loss": 4.7776, "step": 22500 }, { "epoch": 0.18, "grad_norm": 1.2476049661636353, "learning_rate": 8.15040633772271e-06, "loss": 4.7759, "step": 22600 }, { "epoch": 0.19, "grad_norm": 0.7892690896987915, "learning_rate": 8.14222229496927e-06, "loss": 4.7797, "step": 22700 }, { "epoch": 0.19, "grad_norm": 2.874403476715088, "learning_rate": 8.13403825221583e-06, "loss": 4.7807, "step": 22800 }, { "epoch": 0.19, "grad_norm": 2.8661632537841797, "learning_rate": 8.12585420946239e-06, "loss": 4.7772, "step": 22900 }, { "epoch": 0.19, "grad_norm": 0.8099445104598999, "learning_rate": 8.117670166708952e-06, "loss": 4.7814, "step": 23000 }, { "epoch": 0.19, "grad_norm": 1.0212429761886597, "learning_rate": 8.109486123955512e-06, "loss": 4.7802, "step": 23100 }, { "epoch": 0.19, "grad_norm": 1.0928311347961426, "learning_rate": 8.101302081202072e-06, "loss": 4.777, "step": 23200 }, { "epoch": 0.19, "grad_norm": 1.8933788537979126, "learning_rate": 8.093118038448633e-06, "loss": 4.7832, "step": 23300 }, { "epoch": 0.19, "grad_norm": 0.9154900908470154, "learning_rate": 8.084933995695195e-06, "loss": 4.7854, "step": 23400 }, { "epoch": 0.19, "grad_norm": 4.510463714599609, "learning_rate": 8.076749952941755e-06, "loss": 4.7869, "step": 23500 }, { "epoch": 0.19, "grad_norm": 2.9989612102508545, "learning_rate": 8.068565910188315e-06, "loss": 4.785, "step": 23600 }, { "epoch": 0.19, "grad_norm": 1.1799613237380981, "learning_rate": 8.060381867434877e-06, "loss": 4.7845, "step": 23700 }, { "epoch": 0.19, "grad_norm": 0.8453476428985596, "learning_rate": 8.052197824681437e-06, "loss": 4.7849, "step": 23800 }, { "epoch": 0.2, "grad_norm": 1.3734474182128906, "learning_rate": 8.044013781927998e-06, "loss": 4.7799, "step": 23900 }, { "epoch": 0.2, "grad_norm": 7.692543983459473, "learning_rate": 8.035829739174558e-06, "loss": 4.7787, "step": 24000 }, { "epoch": 0.2, "grad_norm": 4.369629859924316, "learning_rate": 8.027645696421118e-06, "loss": 4.779, "step": 24100 }, { "epoch": 0.2, "grad_norm": 1.2958143949508667, "learning_rate": 8.01946165366768e-06, "loss": 4.7801, "step": 24200 }, { "epoch": 0.2, "grad_norm": 0.838066041469574, "learning_rate": 8.01127761091424e-06, "loss": 4.78, "step": 24300 }, { "epoch": 0.2, "grad_norm": 2.3854634761810303, "learning_rate": 8.0030935681608e-06, "loss": 4.7797, "step": 24400 }, { "epoch": 0.2, "grad_norm": 1.1053544282913208, "learning_rate": 7.994909525407361e-06, "loss": 4.7816, "step": 24500 }, { "epoch": 0.2, "grad_norm": 1.4308538436889648, "learning_rate": 7.986725482653921e-06, "loss": 4.7819, "step": 24600 }, { "epoch": 0.2, "grad_norm": 3.392876148223877, "learning_rate": 7.978541439900482e-06, "loss": 4.7782, "step": 24700 }, { "epoch": 0.2, "grad_norm": 1.5023071765899658, "learning_rate": 7.970357397147044e-06, "loss": 4.7828, "step": 24800 }, { "epoch": 0.2, "grad_norm": 0.7376088500022888, "learning_rate": 7.962173354393604e-06, "loss": 4.7816, "step": 24900 }, { "epoch": 0.2, "grad_norm": 3.311122417449951, "learning_rate": 7.953989311640164e-06, "loss": 4.7819, "step": 25000 }, { "epoch": 0.21, "grad_norm": 1.5772916078567505, "learning_rate": 7.945805268886724e-06, "loss": 4.7779, "step": 25100 }, { "epoch": 0.21, "grad_norm": 0.9813963174819946, "learning_rate": 7.937621226133286e-06, "loss": 4.781, "step": 25200 }, { "epoch": 0.21, "grad_norm": 0.7451574802398682, "learning_rate": 7.929437183379847e-06, "loss": 4.7817, "step": 25300 }, { "epoch": 0.21, "grad_norm": 1.064371943473816, "learning_rate": 7.921253140626407e-06, "loss": 4.7794, "step": 25400 }, { "epoch": 0.21, "grad_norm": 1.4085423946380615, "learning_rate": 7.913069097872967e-06, "loss": 4.7838, "step": 25500 }, { "epoch": 0.21, "grad_norm": 0.9586715698242188, "learning_rate": 7.904885055119529e-06, "loss": 4.7799, "step": 25600 }, { "epoch": 0.21, "grad_norm": 2.1608307361602783, "learning_rate": 7.89670101236609e-06, "loss": 4.7803, "step": 25700 }, { "epoch": 0.21, "grad_norm": 0.8091859221458435, "learning_rate": 7.88851696961265e-06, "loss": 4.784, "step": 25800 }, { "epoch": 0.21, "grad_norm": 0.7846326231956482, "learning_rate": 7.880332926859212e-06, "loss": 4.778, "step": 25900 }, { "epoch": 0.21, "grad_norm": 2.0327844619750977, "learning_rate": 7.872148884105772e-06, "loss": 4.7825, "step": 26000 }, { "epoch": 0.21, "grad_norm": 1.0779097080230713, "learning_rate": 7.863964841352332e-06, "loss": 4.7765, "step": 26100 }, { "epoch": 0.21, "grad_norm": 1.2201029062271118, "learning_rate": 7.855780798598892e-06, "loss": 4.776, "step": 26200 }, { "epoch": 0.22, "grad_norm": 1.4344494342803955, "learning_rate": 7.847596755845454e-06, "loss": 4.7781, "step": 26300 }, { "epoch": 0.22, "grad_norm": 1.114273190498352, "learning_rate": 7.839412713092015e-06, "loss": 4.7764, "step": 26400 }, { "epoch": 0.22, "grad_norm": 0.8567458391189575, "learning_rate": 7.831228670338575e-06, "loss": 4.7831, "step": 26500 }, { "epoch": 0.22, "grad_norm": 4.325285911560059, "learning_rate": 7.823044627585135e-06, "loss": 4.7811, "step": 26600 }, { "epoch": 0.22, "grad_norm": 1.2828031778335571, "learning_rate": 7.814860584831695e-06, "loss": 4.7795, "step": 26700 }, { "epoch": 0.22, "grad_norm": 2.048180341720581, "learning_rate": 7.806676542078256e-06, "loss": 4.7782, "step": 26800 }, { "epoch": 0.22, "grad_norm": 0.9398120045661926, "learning_rate": 7.798492499324818e-06, "loss": 4.778, "step": 26900 }, { "epoch": 0.22, "grad_norm": 0.8492949604988098, "learning_rate": 7.790308456571378e-06, "loss": 4.776, "step": 27000 }, { "epoch": 0.22, "grad_norm": 1.9857730865478516, "learning_rate": 7.782124413817938e-06, "loss": 4.7741, "step": 27100 }, { "epoch": 0.22, "grad_norm": 1.0787758827209473, "learning_rate": 7.773940371064498e-06, "loss": 4.7738, "step": 27200 }, { "epoch": 0.22, "grad_norm": 1.2202094793319702, "learning_rate": 7.765756328311059e-06, "loss": 4.777, "step": 27300 }, { "epoch": 0.22, "grad_norm": 6.543772220611572, "learning_rate": 7.75757228555762e-06, "loss": 4.778, "step": 27400 }, { "epoch": 0.23, "grad_norm": 0.9749574065208435, "learning_rate": 7.749388242804181e-06, "loss": 4.7761, "step": 27500 }, { "epoch": 0.23, "grad_norm": 1.2425750494003296, "learning_rate": 7.741204200050741e-06, "loss": 4.7799, "step": 27600 }, { "epoch": 0.23, "grad_norm": 2.1919734477996826, "learning_rate": 7.733020157297301e-06, "loss": 4.7781, "step": 27700 }, { "epoch": 0.23, "grad_norm": 1.6858105659484863, "learning_rate": 7.724836114543863e-06, "loss": 4.7761, "step": 27800 }, { "epoch": 0.23, "grad_norm": 1.082306146621704, "learning_rate": 7.716652071790424e-06, "loss": 4.7727, "step": 27900 }, { "epoch": 0.23, "grad_norm": 1.4394657611846924, "learning_rate": 7.708468029036984e-06, "loss": 4.7773, "step": 28000 }, { "epoch": 0.23, "grad_norm": 3.90745210647583, "learning_rate": 7.700283986283546e-06, "loss": 4.7771, "step": 28100 }, { "epoch": 0.23, "grad_norm": 1.1074409484863281, "learning_rate": 7.692099943530106e-06, "loss": 4.7813, "step": 28200 }, { "epoch": 0.23, "grad_norm": 0.775972306728363, "learning_rate": 7.683915900776666e-06, "loss": 4.7793, "step": 28300 }, { "epoch": 0.23, "grad_norm": 0.8545662760734558, "learning_rate": 7.675731858023227e-06, "loss": 4.7745, "step": 28400 }, { "epoch": 0.23, "grad_norm": 0.9553655385971069, "learning_rate": 7.667547815269789e-06, "loss": 4.7741, "step": 28500 }, { "epoch": 0.23, "grad_norm": 0.9458415508270264, "learning_rate": 7.659363772516349e-06, "loss": 4.7757, "step": 28600 }, { "epoch": 0.23, "grad_norm": 1.3447215557098389, "learning_rate": 7.65117972976291e-06, "loss": 4.7721, "step": 28700 }, { "epoch": 0.24, "grad_norm": 0.8614388108253479, "learning_rate": 7.64299568700947e-06, "loss": 4.7728, "step": 28800 }, { "epoch": 0.24, "grad_norm": 1.083814263343811, "learning_rate": 7.63481164425603e-06, "loss": 4.7714, "step": 28900 }, { "epoch": 0.24, "grad_norm": 0.9633229374885559, "learning_rate": 7.626627601502591e-06, "loss": 4.7736, "step": 29000 }, { "epoch": 0.24, "grad_norm": 1.1467323303222656, "learning_rate": 7.618443558749151e-06, "loss": 4.7771, "step": 29100 }, { "epoch": 0.24, "grad_norm": 2.2095165252685547, "learning_rate": 7.610259515995713e-06, "loss": 4.7712, "step": 29200 }, { "epoch": 0.24, "grad_norm": 0.9931008815765381, "learning_rate": 7.602075473242273e-06, "loss": 4.7776, "step": 29300 }, { "epoch": 0.24, "grad_norm": 12.287310600280762, "learning_rate": 7.593891430488834e-06, "loss": 4.7797, "step": 29400 }, { "epoch": 0.24, "grad_norm": 3.100107192993164, "learning_rate": 7.585707387735394e-06, "loss": 4.7759, "step": 29500 }, { "epoch": 0.24, "grad_norm": 2.249577045440674, "learning_rate": 7.577523344981955e-06, "loss": 4.7708, "step": 29600 }, { "epoch": 0.24, "grad_norm": 3.1430749893188477, "learning_rate": 7.569339302228515e-06, "loss": 4.7742, "step": 29700 }, { "epoch": 0.24, "grad_norm": 0.9823663830757141, "learning_rate": 7.5611552594750755e-06, "loss": 4.7739, "step": 29800 }, { "epoch": 0.24, "grad_norm": 4.1920166015625, "learning_rate": 7.552971216721637e-06, "loss": 4.778, "step": 29900 }, { "epoch": 0.25, "grad_norm": 2.579256534576416, "learning_rate": 7.544787173968198e-06, "loss": 4.7784, "step": 30000 }, { "epoch": 0.25, "grad_norm": 1.4403995275497437, "learning_rate": 7.536603131214758e-06, "loss": 4.7701, "step": 30100 }, { "epoch": 0.25, "grad_norm": 3.48757266998291, "learning_rate": 7.528419088461318e-06, "loss": 4.7705, "step": 30200 }, { "epoch": 0.25, "grad_norm": 4.121363162994385, "learning_rate": 7.52023504570788e-06, "loss": 4.7704, "step": 30300 }, { "epoch": 0.25, "grad_norm": 1.439936876296997, "learning_rate": 7.5120510029544405e-06, "loss": 4.7711, "step": 30400 }, { "epoch": 0.25, "grad_norm": 1.546036720275879, "learning_rate": 7.503866960201001e-06, "loss": 4.7691, "step": 30500 }, { "epoch": 0.25, "grad_norm": 5.736770153045654, "learning_rate": 7.495682917447561e-06, "loss": 4.7802, "step": 30600 }, { "epoch": 0.25, "grad_norm": 1.7358049154281616, "learning_rate": 7.487498874694122e-06, "loss": 4.771, "step": 30700 }, { "epoch": 0.25, "grad_norm": 0.9707129001617432, "learning_rate": 7.479314831940682e-06, "loss": 4.7691, "step": 30800 }, { "epoch": 0.25, "grad_norm": 2.6308770179748535, "learning_rate": 7.471130789187243e-06, "loss": 4.7706, "step": 30900 }, { "epoch": 0.25, "grad_norm": 2.7046618461608887, "learning_rate": 7.462946746433804e-06, "loss": 4.7767, "step": 31000 }, { "epoch": 0.25, "grad_norm": 1.627031922340393, "learning_rate": 7.454762703680365e-06, "loss": 4.7743, "step": 31100 }, { "epoch": 0.26, "grad_norm": 1.203337550163269, "learning_rate": 7.446578660926925e-06, "loss": 4.7764, "step": 31200 }, { "epoch": 0.26, "grad_norm": 1.2097506523132324, "learning_rate": 7.4383946181734854e-06, "loss": 4.7711, "step": 31300 }, { "epoch": 0.26, "grad_norm": 1.1853944063186646, "learning_rate": 7.430210575420047e-06, "loss": 4.7699, "step": 31400 }, { "epoch": 0.26, "grad_norm": 0.8438019752502441, "learning_rate": 7.422026532666608e-06, "loss": 4.7732, "step": 31500 }, { "epoch": 0.26, "grad_norm": 1.5300862789154053, "learning_rate": 7.413842489913168e-06, "loss": 4.7747, "step": 31600 }, { "epoch": 0.26, "grad_norm": 1.4141496419906616, "learning_rate": 7.405658447159728e-06, "loss": 4.7711, "step": 31700 }, { "epoch": 0.26, "grad_norm": 1.673567533493042, "learning_rate": 7.397474404406289e-06, "loss": 4.777, "step": 31800 }, { "epoch": 0.26, "grad_norm": 2.1357908248901367, "learning_rate": 7.38929036165285e-06, "loss": 4.7723, "step": 31900 }, { "epoch": 0.26, "grad_norm": 4.316195964813232, "learning_rate": 7.381106318899411e-06, "loss": 4.7678, "step": 32000 }, { "epoch": 0.26, "grad_norm": 1.7443231344223022, "learning_rate": 7.372922276145971e-06, "loss": 4.7707, "step": 32100 }, { "epoch": 0.26, "grad_norm": 2.7221803665161133, "learning_rate": 7.364738233392532e-06, "loss": 4.7708, "step": 32200 }, { "epoch": 0.26, "grad_norm": 0.8770394921302795, "learning_rate": 7.356554190639092e-06, "loss": 4.7721, "step": 32300 }, { "epoch": 0.27, "grad_norm": 0.9764662384986877, "learning_rate": 7.348370147885653e-06, "loss": 4.7756, "step": 32400 }, { "epoch": 0.27, "grad_norm": 3.6692917346954346, "learning_rate": 7.3401861051322146e-06, "loss": 4.7773, "step": 32500 }, { "epoch": 0.27, "grad_norm": 1.8451597690582275, "learning_rate": 7.332002062378775e-06, "loss": 4.7742, "step": 32600 }, { "epoch": 0.27, "grad_norm": 0.9092561602592468, "learning_rate": 7.323818019625335e-06, "loss": 4.7666, "step": 32700 }, { "epoch": 0.27, "grad_norm": 0.9218481779098511, "learning_rate": 7.315633976871895e-06, "loss": 4.7717, "step": 32800 }, { "epoch": 0.27, "grad_norm": 0.8461436033248901, "learning_rate": 7.3074499341184565e-06, "loss": 4.7752, "step": 32900 }, { "epoch": 0.27, "grad_norm": 0.879395067691803, "learning_rate": 7.299265891365017e-06, "loss": 4.7736, "step": 33000 }, { "epoch": 0.27, "grad_norm": 0.9056565165519714, "learning_rate": 7.291081848611578e-06, "loss": 4.7671, "step": 33100 }, { "epoch": 0.27, "grad_norm": 2.0318729877471924, "learning_rate": 7.282897805858138e-06, "loss": 4.7672, "step": 33200 }, { "epoch": 0.27, "grad_norm": 1.0115817785263062, "learning_rate": 7.274713763104699e-06, "loss": 4.7782, "step": 33300 }, { "epoch": 0.27, "grad_norm": 1.3754358291625977, "learning_rate": 7.2665297203512595e-06, "loss": 4.7711, "step": 33400 }, { "epoch": 0.27, "grad_norm": 1.6245019435882568, "learning_rate": 7.25834567759782e-06, "loss": 4.7716, "step": 33500 }, { "epoch": 0.27, "grad_norm": 1.5871042013168335, "learning_rate": 7.250161634844382e-06, "loss": 4.7689, "step": 33600 }, { "epoch": 0.28, "grad_norm": 1.6681900024414062, "learning_rate": 7.241977592090942e-06, "loss": 4.7734, "step": 33700 }, { "epoch": 0.28, "grad_norm": 1.2739856243133545, "learning_rate": 7.233793549337502e-06, "loss": 4.7746, "step": 33800 }, { "epoch": 0.28, "grad_norm": 1.8256975412368774, "learning_rate": 7.2256095065840625e-06, "loss": 4.7683, "step": 33900 }, { "epoch": 0.28, "grad_norm": 2.5301151275634766, "learning_rate": 7.217425463830624e-06, "loss": 4.7715, "step": 34000 }, { "epoch": 0.28, "grad_norm": 1.9835234880447388, "learning_rate": 7.209241421077185e-06, "loss": 4.7675, "step": 34100 }, { "epoch": 0.28, "grad_norm": 1.1152873039245605, "learning_rate": 7.201057378323745e-06, "loss": 4.767, "step": 34200 }, { "epoch": 0.28, "grad_norm": 4.025820732116699, "learning_rate": 7.192873335570305e-06, "loss": 4.7739, "step": 34300 }, { "epoch": 0.28, "grad_norm": 4.386086463928223, "learning_rate": 7.184689292816866e-06, "loss": 4.7716, "step": 34400 }, { "epoch": 0.28, "grad_norm": 3.9003477096557617, "learning_rate": 7.176505250063427e-06, "loss": 4.7691, "step": 34500 }, { "epoch": 0.28, "grad_norm": 1.3681395053863525, "learning_rate": 7.168321207309987e-06, "loss": 4.7747, "step": 34600 }, { "epoch": 0.28, "grad_norm": 12.061843872070312, "learning_rate": 7.160137164556549e-06, "loss": 4.7714, "step": 34700 }, { "epoch": 0.28, "grad_norm": 1.0302884578704834, "learning_rate": 7.151953121803109e-06, "loss": 4.7711, "step": 34800 }, { "epoch": 0.29, "grad_norm": 1.5776879787445068, "learning_rate": 7.1437690790496694e-06, "loss": 4.7682, "step": 34900 }, { "epoch": 0.29, "grad_norm": 1.52390718460083, "learning_rate": 7.13558503629623e-06, "loss": 4.7712, "step": 35000 }, { "epoch": 0.29, "grad_norm": 2.443223237991333, "learning_rate": 7.127400993542791e-06, "loss": 4.7696, "step": 35100 }, { "epoch": 0.29, "grad_norm": 1.4894499778747559, "learning_rate": 7.119216950789352e-06, "loss": 4.7739, "step": 35200 }, { "epoch": 0.29, "grad_norm": 1.0729478597640991, "learning_rate": 7.111032908035912e-06, "loss": 4.7728, "step": 35300 }, { "epoch": 0.29, "grad_norm": 1.0306514501571655, "learning_rate": 7.1028488652824725e-06, "loss": 4.7739, "step": 35400 }, { "epoch": 0.29, "grad_norm": 2.0036280155181885, "learning_rate": 7.094664822529034e-06, "loss": 4.7714, "step": 35500 }, { "epoch": 0.29, "grad_norm": 0.9095188975334167, "learning_rate": 7.086480779775594e-06, "loss": 4.7647, "step": 35600 }, { "epoch": 0.29, "grad_norm": 1.0899231433868408, "learning_rate": 7.078296737022154e-06, "loss": 4.7637, "step": 35700 }, { "epoch": 0.29, "grad_norm": 5.961045742034912, "learning_rate": 7.070112694268716e-06, "loss": 4.7722, "step": 35800 }, { "epoch": 0.29, "grad_norm": 0.9270179867744446, "learning_rate": 7.061928651515276e-06, "loss": 4.7682, "step": 35900 }, { "epoch": 0.29, "grad_norm": 1.7026699781417847, "learning_rate": 7.053744608761837e-06, "loss": 4.7697, "step": 36000 }, { "epoch": 0.3, "grad_norm": 1.5568283796310425, "learning_rate": 7.045560566008397e-06, "loss": 4.7695, "step": 36100 }, { "epoch": 0.3, "grad_norm": 4.148453712463379, "learning_rate": 7.037376523254959e-06, "loss": 4.7667, "step": 36200 }, { "epoch": 0.3, "grad_norm": 2.0565524101257324, "learning_rate": 7.029192480501519e-06, "loss": 4.7728, "step": 36300 }, { "epoch": 0.3, "grad_norm": 1.3457672595977783, "learning_rate": 7.021008437748079e-06, "loss": 4.7719, "step": 36400 }, { "epoch": 0.3, "grad_norm": 1.2144137620925903, "learning_rate": 7.01282439499464e-06, "loss": 4.7714, "step": 36500 }, { "epoch": 0.3, "grad_norm": 0.7697350978851318, "learning_rate": 7.004640352241201e-06, "loss": 4.7712, "step": 36600 }, { "epoch": 0.3, "grad_norm": 0.9844958782196045, "learning_rate": 6.996456309487761e-06, "loss": 4.7723, "step": 36700 }, { "epoch": 0.3, "grad_norm": 3.750422716140747, "learning_rate": 6.988272266734321e-06, "loss": 4.7717, "step": 36800 }, { "epoch": 0.3, "grad_norm": 2.6416289806365967, "learning_rate": 6.980088223980883e-06, "loss": 4.7696, "step": 36900 }, { "epoch": 0.3, "grad_norm": 1.745283842086792, "learning_rate": 6.9719041812274435e-06, "loss": 4.7681, "step": 37000 }, { "epoch": 0.3, "grad_norm": 1.1269015073776245, "learning_rate": 6.963720138474004e-06, "loss": 4.7659, "step": 37100 }, { "epoch": 0.3, "grad_norm": 4.63193941116333, "learning_rate": 6.955536095720564e-06, "loss": 4.7676, "step": 37200 }, { "epoch": 0.31, "grad_norm": 1.586788296699524, "learning_rate": 6.947352052967126e-06, "loss": 4.7729, "step": 37300 }, { "epoch": 0.31, "grad_norm": 1.2502408027648926, "learning_rate": 6.939168010213686e-06, "loss": 4.7718, "step": 37400 }, { "epoch": 0.31, "grad_norm": 1.620895504951477, "learning_rate": 6.9309839674602465e-06, "loss": 4.7703, "step": 37500 }, { "epoch": 0.31, "grad_norm": 0.9904155731201172, "learning_rate": 6.922799924706807e-06, "loss": 4.77, "step": 37600 }, { "epoch": 0.31, "grad_norm": 1.9440988302230835, "learning_rate": 6.914615881953368e-06, "loss": 4.7688, "step": 37700 }, { "epoch": 0.31, "grad_norm": 2.9490699768066406, "learning_rate": 6.906431839199928e-06, "loss": 4.7718, "step": 37800 }, { "epoch": 0.31, "grad_norm": 1.7235488891601562, "learning_rate": 6.898247796446489e-06, "loss": 4.7685, "step": 37900 }, { "epoch": 0.31, "grad_norm": 1.076762318611145, "learning_rate": 6.89006375369305e-06, "loss": 4.7708, "step": 38000 }, { "epoch": 0.31, "grad_norm": 0.8281159400939941, "learning_rate": 6.881879710939611e-06, "loss": 4.7697, "step": 38100 }, { "epoch": 0.31, "grad_norm": 1.1257106065750122, "learning_rate": 6.873695668186171e-06, "loss": 4.7727, "step": 38200 }, { "epoch": 0.31, "grad_norm": 4.7161760330200195, "learning_rate": 6.865511625432731e-06, "loss": 4.7695, "step": 38300 }, { "epoch": 0.31, "grad_norm": 2.1901695728302, "learning_rate": 6.857327582679293e-06, "loss": 4.7712, "step": 38400 }, { "epoch": 0.32, "grad_norm": 1.988418459892273, "learning_rate": 6.8491435399258535e-06, "loss": 4.7691, "step": 38500 }, { "epoch": 0.32, "grad_norm": 1.4924893379211426, "learning_rate": 6.840959497172414e-06, "loss": 4.7705, "step": 38600 }, { "epoch": 0.32, "grad_norm": 2.154937744140625, "learning_rate": 6.832775454418974e-06, "loss": 4.7652, "step": 38700 }, { "epoch": 0.32, "grad_norm": 1.1525272130966187, "learning_rate": 6.824591411665535e-06, "loss": 4.7705, "step": 38800 }, { "epoch": 0.32, "grad_norm": 0.9818894863128662, "learning_rate": 6.816407368912095e-06, "loss": 4.767, "step": 38900 }, { "epoch": 0.32, "grad_norm": 4.012678146362305, "learning_rate": 6.8082233261586565e-06, "loss": 4.7657, "step": 39000 }, { "epoch": 0.32, "grad_norm": 3.9307897090911865, "learning_rate": 6.800039283405218e-06, "loss": 4.7629, "step": 39100 }, { "epoch": 0.32, "grad_norm": 2.6684534549713135, "learning_rate": 6.791855240651778e-06, "loss": 4.7695, "step": 39200 }, { "epoch": 0.32, "grad_norm": 3.2453012466430664, "learning_rate": 6.783671197898338e-06, "loss": 4.7733, "step": 39300 }, { "epoch": 0.32, "grad_norm": 1.1828510761260986, "learning_rate": 6.775487155144898e-06, "loss": 4.7696, "step": 39400 }, { "epoch": 0.32, "grad_norm": 1.2635380029678345, "learning_rate": 6.76730311239146e-06, "loss": 4.7726, "step": 39500 }, { "epoch": 0.32, "grad_norm": 0.8589321970939636, "learning_rate": 6.759119069638021e-06, "loss": 4.768, "step": 39600 }, { "epoch": 0.32, "grad_norm": 1.4459155797958374, "learning_rate": 6.750935026884581e-06, "loss": 4.7664, "step": 39700 }, { "epoch": 0.33, "grad_norm": 3.3713083267211914, "learning_rate": 6.742750984131141e-06, "loss": 4.7684, "step": 39800 }, { "epoch": 0.33, "grad_norm": 3.4334757328033447, "learning_rate": 6.734566941377702e-06, "loss": 4.767, "step": 39900 }, { "epoch": 0.33, "grad_norm": 1.072906255722046, "learning_rate": 6.726382898624263e-06, "loss": 4.7623, "step": 40000 }, { "epoch": 0.33, "grad_norm": 1.8414703607559204, "learning_rate": 6.718198855870824e-06, "loss": 4.7657, "step": 40100 }, { "epoch": 0.33, "grad_norm": 1.423050045967102, "learning_rate": 6.710014813117385e-06, "loss": 4.7684, "step": 40200 }, { "epoch": 0.33, "grad_norm": 1.1521214246749878, "learning_rate": 6.701830770363945e-06, "loss": 4.7682, "step": 40300 }, { "epoch": 0.33, "grad_norm": 1.2841377258300781, "learning_rate": 6.693646727610505e-06, "loss": 4.7661, "step": 40400 }, { "epoch": 0.33, "grad_norm": 2.815016508102417, "learning_rate": 6.6854626848570656e-06, "loss": 4.7724, "step": 40500 }, { "epoch": 0.33, "grad_norm": 0.9150479435920715, "learning_rate": 6.6772786421036275e-06, "loss": 4.7667, "step": 40600 }, { "epoch": 0.33, "grad_norm": 1.2387914657592773, "learning_rate": 6.669094599350188e-06, "loss": 4.7695, "step": 40700 }, { "epoch": 0.33, "grad_norm": 3.5044503211975098, "learning_rate": 6.660910556596748e-06, "loss": 4.7671, "step": 40800 }, { "epoch": 0.33, "grad_norm": 1.1371254920959473, "learning_rate": 6.652726513843308e-06, "loss": 4.7652, "step": 40900 }, { "epoch": 0.34, "grad_norm": 1.3063691854476929, "learning_rate": 6.6445424710898694e-06, "loss": 4.7674, "step": 41000 }, { "epoch": 0.34, "grad_norm": 1.7822604179382324, "learning_rate": 6.6363584283364306e-06, "loss": 4.7692, "step": 41100 }, { "epoch": 0.34, "grad_norm": 0.9057841300964355, "learning_rate": 6.628174385582991e-06, "loss": 4.7676, "step": 41200 }, { "epoch": 0.34, "grad_norm": 1.9072014093399048, "learning_rate": 6.619990342829552e-06, "loss": 4.7625, "step": 41300 }, { "epoch": 0.34, "grad_norm": 0.9912985563278198, "learning_rate": 6.611806300076112e-06, "loss": 4.7709, "step": 41400 }, { "epoch": 0.34, "grad_norm": 2.278571605682373, "learning_rate": 6.6036222573226725e-06, "loss": 4.7725, "step": 41500 }, { "epoch": 0.34, "grad_norm": 2.2209765911102295, "learning_rate": 6.595438214569233e-06, "loss": 4.7693, "step": 41600 }, { "epoch": 0.34, "grad_norm": 1.4683972597122192, "learning_rate": 6.587254171815795e-06, "loss": 4.7715, "step": 41700 }, { "epoch": 0.34, "grad_norm": 2.1982457637786865, "learning_rate": 6.579070129062355e-06, "loss": 4.7629, "step": 41800 }, { "epoch": 0.34, "grad_norm": 3.916114568710327, "learning_rate": 6.570886086308915e-06, "loss": 4.7628, "step": 41900 }, { "epoch": 0.34, "grad_norm": 4.219468116760254, "learning_rate": 6.5627020435554755e-06, "loss": 4.766, "step": 42000 }, { "epoch": 0.34, "grad_norm": 1.3876959085464478, "learning_rate": 6.5545180008020375e-06, "loss": 4.7689, "step": 42100 }, { "epoch": 0.35, "grad_norm": 4.21665620803833, "learning_rate": 6.546333958048598e-06, "loss": 4.767, "step": 42200 }, { "epoch": 0.35, "grad_norm": 0.8205769658088684, "learning_rate": 6.538149915295158e-06, "loss": 4.7673, "step": 42300 }, { "epoch": 0.35, "grad_norm": 2.474163293838501, "learning_rate": 6.529965872541719e-06, "loss": 4.7662, "step": 42400 }, { "epoch": 0.35, "grad_norm": 2.4474871158599854, "learning_rate": 6.521781829788279e-06, "loss": 4.7672, "step": 42500 }, { "epoch": 0.35, "grad_norm": 1.422839879989624, "learning_rate": 6.51359778703484e-06, "loss": 4.7639, "step": 42600 }, { "epoch": 0.35, "grad_norm": 1.002898097038269, "learning_rate": 6.5054137442814e-06, "loss": 4.7707, "step": 42700 }, { "epoch": 0.35, "grad_norm": 1.21475350856781, "learning_rate": 6.497229701527962e-06, "loss": 4.7643, "step": 42800 }, { "epoch": 0.35, "grad_norm": 1.273319959640503, "learning_rate": 6.489045658774522e-06, "loss": 4.764, "step": 42900 }, { "epoch": 0.35, "grad_norm": 1.963934063911438, "learning_rate": 6.480861616021082e-06, "loss": 4.7675, "step": 43000 }, { "epoch": 0.35, "grad_norm": 2.1018524169921875, "learning_rate": 6.472677573267643e-06, "loss": 4.7722, "step": 43100 }, { "epoch": 0.35, "grad_norm": 1.1742087602615356, "learning_rate": 6.464493530514205e-06, "loss": 4.7697, "step": 43200 }, { "epoch": 0.35, "grad_norm": 1.3663380146026611, "learning_rate": 6.456309487760765e-06, "loss": 4.7654, "step": 43300 }, { "epoch": 0.36, "grad_norm": 1.2402204275131226, "learning_rate": 6.448125445007325e-06, "loss": 4.7657, "step": 43400 }, { "epoch": 0.36, "grad_norm": 1.8193938732147217, "learning_rate": 6.439941402253886e-06, "loss": 4.7631, "step": 43500 }, { "epoch": 0.36, "grad_norm": 0.9568284749984741, "learning_rate": 6.4317573595004465e-06, "loss": 4.7687, "step": 43600 }, { "epoch": 0.36, "grad_norm": 0.9550923109054565, "learning_rate": 6.423573316747007e-06, "loss": 4.766, "step": 43700 }, { "epoch": 0.36, "grad_norm": 1.496541976928711, "learning_rate": 6.415389273993567e-06, "loss": 4.7674, "step": 43800 }, { "epoch": 0.36, "grad_norm": 7.160891532897949, "learning_rate": 6.407205231240129e-06, "loss": 4.7667, "step": 43900 }, { "epoch": 0.36, "grad_norm": 1.1409087181091309, "learning_rate": 6.399021188486689e-06, "loss": 4.7657, "step": 44000 }, { "epoch": 0.36, "grad_norm": 1.3777464628219604, "learning_rate": 6.3908371457332496e-06, "loss": 4.7657, "step": 44100 }, { "epoch": 0.36, "grad_norm": 1.171425223350525, "learning_rate": 6.38265310297981e-06, "loss": 4.7645, "step": 44200 }, { "epoch": 0.36, "grad_norm": 1.4748115539550781, "learning_rate": 6.374469060226372e-06, "loss": 4.7622, "step": 44300 }, { "epoch": 0.36, "grad_norm": 4.880084991455078, "learning_rate": 6.366285017472932e-06, "loss": 4.766, "step": 44400 }, { "epoch": 0.36, "grad_norm": 1.557969331741333, "learning_rate": 6.358100974719492e-06, "loss": 4.7688, "step": 44500 }, { "epoch": 0.37, "grad_norm": 2.078839063644409, "learning_rate": 6.3499169319660534e-06, "loss": 4.7652, "step": 44600 }, { "epoch": 0.37, "grad_norm": 5.024000644683838, "learning_rate": 6.341732889212614e-06, "loss": 4.767, "step": 44700 }, { "epoch": 0.37, "grad_norm": 1.7246519327163696, "learning_rate": 6.333548846459174e-06, "loss": 4.7631, "step": 44800 }, { "epoch": 0.37, "grad_norm": 0.9058144688606262, "learning_rate": 6.325364803705735e-06, "loss": 4.7641, "step": 44900 }, { "epoch": 0.37, "grad_norm": 1.1066261529922485, "learning_rate": 6.317180760952296e-06, "loss": 4.7657, "step": 45000 }, { "epoch": 0.37, "grad_norm": 1.048341989517212, "learning_rate": 6.3089967181988565e-06, "loss": 4.7573, "step": 45100 }, { "epoch": 0.37, "grad_norm": 2.8883166313171387, "learning_rate": 6.300812675445417e-06, "loss": 4.7681, "step": 45200 }, { "epoch": 0.37, "grad_norm": 3.5398635864257812, "learning_rate": 6.292628632691977e-06, "loss": 4.7622, "step": 45300 }, { "epoch": 0.37, "grad_norm": 4.646157741546631, "learning_rate": 6.284444589938539e-06, "loss": 4.7675, "step": 45400 }, { "epoch": 0.37, "grad_norm": 1.9889692068099976, "learning_rate": 6.276260547185099e-06, "loss": 4.7616, "step": 45500 }, { "epoch": 0.37, "grad_norm": 1.8759746551513672, "learning_rate": 6.2680765044316595e-06, "loss": 4.7674, "step": 45600 }, { "epoch": 0.37, "grad_norm": 1.6846829652786255, "learning_rate": 6.259892461678221e-06, "loss": 4.762, "step": 45700 }, { "epoch": 0.37, "grad_norm": 0.9014168381690979, "learning_rate": 6.251708418924781e-06, "loss": 4.7671, "step": 45800 }, { "epoch": 0.38, "grad_norm": 2.2552223205566406, "learning_rate": 6.243524376171341e-06, "loss": 4.7665, "step": 45900 }, { "epoch": 0.38, "grad_norm": 7.994378566741943, "learning_rate": 6.235340333417902e-06, "loss": 4.7679, "step": 46000 }, { "epoch": 0.38, "grad_norm": 1.071326732635498, "learning_rate": 6.227156290664463e-06, "loss": 4.7621, "step": 46100 }, { "epoch": 0.38, "grad_norm": 1.366959810256958, "learning_rate": 6.218972247911024e-06, "loss": 4.7714, "step": 46200 }, { "epoch": 0.38, "grad_norm": 1.2230727672576904, "learning_rate": 6.210788205157584e-06, "loss": 4.7671, "step": 46300 }, { "epoch": 0.38, "grad_norm": 3.3939905166625977, "learning_rate": 6.202604162404144e-06, "loss": 4.7652, "step": 46400 }, { "epoch": 0.38, "grad_norm": 1.4360277652740479, "learning_rate": 6.194420119650706e-06, "loss": 4.7613, "step": 46500 }, { "epoch": 0.38, "grad_norm": 2.0407369136810303, "learning_rate": 6.186236076897266e-06, "loss": 4.7704, "step": 46600 }, { "epoch": 0.38, "grad_norm": 0.9247026443481445, "learning_rate": 6.178052034143827e-06, "loss": 4.7611, "step": 46700 }, { "epoch": 0.38, "grad_norm": 1.4636540412902832, "learning_rate": 6.169867991390388e-06, "loss": 4.7658, "step": 46800 }, { "epoch": 0.38, "grad_norm": 1.443965196609497, "learning_rate": 6.161683948636948e-06, "loss": 4.7674, "step": 46900 }, { "epoch": 0.38, "grad_norm": 1.4305676221847534, "learning_rate": 6.153499905883509e-06, "loss": 4.7653, "step": 47000 }, { "epoch": 0.39, "grad_norm": 0.9233379364013672, "learning_rate": 6.1453158631300694e-06, "loss": 4.7685, "step": 47100 }, { "epoch": 0.39, "grad_norm": 0.9005165100097656, "learning_rate": 6.1371318203766305e-06, "loss": 4.7605, "step": 47200 }, { "epoch": 0.39, "grad_norm": 2.1827316284179688, "learning_rate": 6.128947777623191e-06, "loss": 4.7639, "step": 47300 }, { "epoch": 0.39, "grad_norm": 1.330802083015442, "learning_rate": 6.120763734869751e-06, "loss": 4.765, "step": 47400 }, { "epoch": 0.39, "grad_norm": 1.0128445625305176, "learning_rate": 6.112579692116311e-06, "loss": 4.7704, "step": 47500 }, { "epoch": 0.39, "grad_norm": 3.6746978759765625, "learning_rate": 6.104395649362873e-06, "loss": 4.7645, "step": 47600 }, { "epoch": 0.39, "grad_norm": 1.546904444694519, "learning_rate": 6.0962116066094336e-06, "loss": 4.764, "step": 47700 }, { "epoch": 0.39, "grad_norm": 3.3686161041259766, "learning_rate": 6.088027563855994e-06, "loss": 4.7679, "step": 47800 }, { "epoch": 0.39, "grad_norm": 1.7452073097229004, "learning_rate": 6.079843521102555e-06, "loss": 4.7597, "step": 47900 }, { "epoch": 0.39, "grad_norm": 1.0575987100601196, "learning_rate": 6.071659478349115e-06, "loss": 4.7656, "step": 48000 }, { "epoch": 0.39, "grad_norm": 5.366837501525879, "learning_rate": 6.063475435595676e-06, "loss": 4.761, "step": 48100 }, { "epoch": 0.39, "grad_norm": 0.977851927280426, "learning_rate": 6.055291392842237e-06, "loss": 4.7644, "step": 48200 }, { "epoch": 0.4, "grad_norm": 4.891844749450684, "learning_rate": 6.047107350088798e-06, "loss": 4.7681, "step": 48300 }, { "epoch": 0.4, "grad_norm": 1.2509273290634155, "learning_rate": 6.038923307335358e-06, "loss": 4.7587, "step": 48400 }, { "epoch": 0.4, "grad_norm": 1.095798373222351, "learning_rate": 6.030739264581918e-06, "loss": 4.7617, "step": 48500 }, { "epoch": 0.4, "grad_norm": 1.093984842300415, "learning_rate": 6.0225552218284785e-06, "loss": 4.7629, "step": 48600 }, { "epoch": 0.4, "grad_norm": 0.8967887759208679, "learning_rate": 6.0143711790750405e-06, "loss": 4.7623, "step": 48700 }, { "epoch": 0.4, "grad_norm": 0.8838722705841064, "learning_rate": 6.006187136321601e-06, "loss": 4.7658, "step": 48800 }, { "epoch": 0.4, "grad_norm": 1.5173430442810059, "learning_rate": 5.998003093568161e-06, "loss": 4.7631, "step": 48900 }, { "epoch": 0.4, "grad_norm": 2.8584578037261963, "learning_rate": 5.989819050814722e-06, "loss": 4.7597, "step": 49000 }, { "epoch": 0.4, "grad_norm": 2.672813892364502, "learning_rate": 5.981635008061283e-06, "loss": 4.7614, "step": 49100 }, { "epoch": 0.4, "grad_norm": 1.2858943939208984, "learning_rate": 5.9734509653078435e-06, "loss": 4.7585, "step": 49200 }, { "epoch": 0.4, "grad_norm": 2.0808370113372803, "learning_rate": 5.965266922554404e-06, "loss": 4.7652, "step": 49300 }, { "epoch": 0.4, "grad_norm": 1.6658849716186523, "learning_rate": 5.957082879800965e-06, "loss": 4.7622, "step": 49400 }, { "epoch": 0.41, "grad_norm": 1.312129259109497, "learning_rate": 5.948898837047525e-06, "loss": 4.7643, "step": 49500 }, { "epoch": 0.41, "grad_norm": 1.109014868736267, "learning_rate": 5.940714794294085e-06, "loss": 4.7635, "step": 49600 }, { "epoch": 0.41, "grad_norm": 1.059754729270935, "learning_rate": 5.932530751540646e-06, "loss": 4.7635, "step": 49700 }, { "epoch": 0.41, "grad_norm": 1.4219303131103516, "learning_rate": 5.924346708787208e-06, "loss": 4.7644, "step": 49800 }, { "epoch": 0.41, "grad_norm": 4.515329360961914, "learning_rate": 5.916162666033768e-06, "loss": 4.764, "step": 49900 }, { "epoch": 0.41, "grad_norm": 2.7879600524902344, "learning_rate": 5.907978623280328e-06, "loss": 4.7629, "step": 50000 }, { "epoch": 0.41, "grad_norm": 1.4872534275054932, "learning_rate": 5.899794580526889e-06, "loss": 4.7627, "step": 50100 }, { "epoch": 0.41, "grad_norm": 1.7036062479019165, "learning_rate": 5.89161053777345e-06, "loss": 4.7602, "step": 50200 }, { "epoch": 0.41, "grad_norm": 1.1334501504898071, "learning_rate": 5.883426495020011e-06, "loss": 4.7649, "step": 50300 }, { "epoch": 0.41, "grad_norm": 1.8255513906478882, "learning_rate": 5.875242452266571e-06, "loss": 4.7662, "step": 50400 }, { "epoch": 0.41, "grad_norm": 0.9870157241821289, "learning_rate": 5.867058409513132e-06, "loss": 4.7613, "step": 50500 }, { "epoch": 0.41, "grad_norm": 1.7012158632278442, "learning_rate": 5.858874366759692e-06, "loss": 4.7609, "step": 50600 }, { "epoch": 0.41, "grad_norm": 0.9148879051208496, "learning_rate": 5.850690324006253e-06, "loss": 4.7601, "step": 50700 }, { "epoch": 0.42, "grad_norm": 1.1383265256881714, "learning_rate": 5.842506281252813e-06, "loss": 4.7583, "step": 50800 }, { "epoch": 0.42, "grad_norm": 0.8608391284942627, "learning_rate": 5.834322238499375e-06, "loss": 4.7639, "step": 50900 }, { "epoch": 0.42, "grad_norm": 2.019380807876587, "learning_rate": 5.826138195745935e-06, "loss": 4.7638, "step": 51000 }, { "epoch": 0.42, "grad_norm": 2.069756031036377, "learning_rate": 5.817954152992495e-06, "loss": 4.7691, "step": 51100 }, { "epoch": 0.42, "grad_norm": 1.2474431991577148, "learning_rate": 5.809770110239057e-06, "loss": 4.7668, "step": 51200 }, { "epoch": 0.42, "grad_norm": 1.1330208778381348, "learning_rate": 5.8015860674856176e-06, "loss": 4.7647, "step": 51300 }, { "epoch": 0.42, "grad_norm": 1.4908925294876099, "learning_rate": 5.793402024732178e-06, "loss": 4.759, "step": 51400 }, { "epoch": 0.42, "grad_norm": 1.1022288799285889, "learning_rate": 5.785217981978738e-06, "loss": 4.7583, "step": 51500 }, { "epoch": 0.42, "grad_norm": 1.2714922428131104, "learning_rate": 5.777033939225299e-06, "loss": 4.7619, "step": 51600 }, { "epoch": 0.42, "grad_norm": 1.3751145601272583, "learning_rate": 5.7688498964718595e-06, "loss": 4.7607, "step": 51700 }, { "epoch": 0.42, "grad_norm": 0.9955260753631592, "learning_rate": 5.76066585371842e-06, "loss": 4.7612, "step": 51800 }, { "epoch": 0.42, "grad_norm": 1.48429274559021, "learning_rate": 5.752481810964981e-06, "loss": 4.7631, "step": 51900 }, { "epoch": 0.43, "grad_norm": 3.2120578289031982, "learning_rate": 5.744297768211542e-06, "loss": 4.7652, "step": 52000 }, { "epoch": 0.43, "grad_norm": 1.1913766860961914, "learning_rate": 5.736113725458102e-06, "loss": 4.7651, "step": 52100 }, { "epoch": 0.43, "grad_norm": 3.3173420429229736, "learning_rate": 5.7279296827046625e-06, "loss": 4.7583, "step": 52200 }, { "epoch": 0.43, "grad_norm": 2.184640407562256, "learning_rate": 5.7197456399512245e-06, "loss": 4.7674, "step": 52300 }, { "epoch": 0.43, "grad_norm": 1.406989574432373, "learning_rate": 5.711561597197785e-06, "loss": 4.7624, "step": 52400 }, { "epoch": 0.43, "grad_norm": 4.420721530914307, "learning_rate": 5.703377554444345e-06, "loss": 4.7584, "step": 52500 }, { "epoch": 0.43, "grad_norm": 1.6438698768615723, "learning_rate": 5.695193511690905e-06, "loss": 4.759, "step": 52600 }, { "epoch": 0.43, "grad_norm": 1.0008363723754883, "learning_rate": 5.687009468937466e-06, "loss": 4.7622, "step": 52700 }, { "epoch": 0.43, "grad_norm": 0.9954501986503601, "learning_rate": 5.678825426184027e-06, "loss": 4.7599, "step": 52800 }, { "epoch": 0.43, "grad_norm": 2.2024335861206055, "learning_rate": 5.670641383430587e-06, "loss": 4.757, "step": 52900 }, { "epoch": 0.43, "grad_norm": 0.8690518140792847, "learning_rate": 5.662457340677148e-06, "loss": 4.7638, "step": 53000 }, { "epoch": 0.43, "grad_norm": 4.789288520812988, "learning_rate": 5.654273297923709e-06, "loss": 4.7651, "step": 53100 }, { "epoch": 0.44, "grad_norm": 1.9161509275436401, "learning_rate": 5.646089255170269e-06, "loss": 4.762, "step": 53200 }, { "epoch": 0.44, "grad_norm": 1.1791253089904785, "learning_rate": 5.63790521241683e-06, "loss": 4.7629, "step": 53300 }, { "epoch": 0.44, "grad_norm": 3.780832529067993, "learning_rate": 5.629721169663392e-06, "loss": 4.7553, "step": 53400 }, { "epoch": 0.44, "grad_norm": 1.1403292417526245, "learning_rate": 5.621537126909952e-06, "loss": 4.7615, "step": 53500 }, { "epoch": 0.44, "grad_norm": 1.2787580490112305, "learning_rate": 5.613353084156512e-06, "loss": 4.7623, "step": 53600 }, { "epoch": 0.44, "grad_norm": 2.774376153945923, "learning_rate": 5.6051690414030724e-06, "loss": 4.7626, "step": 53700 }, { "epoch": 0.44, "grad_norm": 3.4365766048431396, "learning_rate": 5.5969849986496336e-06, "loss": 4.7573, "step": 53800 }, { "epoch": 0.44, "grad_norm": 1.0744109153747559, "learning_rate": 5.588800955896194e-06, "loss": 4.7621, "step": 53900 }, { "epoch": 0.44, "grad_norm": 0.9712745547294617, "learning_rate": 5.580616913142755e-06, "loss": 4.7605, "step": 54000 }, { "epoch": 0.44, "grad_norm": 1.24153470993042, "learning_rate": 5.572432870389315e-06, "loss": 4.7654, "step": 54100 }, { "epoch": 0.44, "grad_norm": 1.9112986326217651, "learning_rate": 5.564248827635876e-06, "loss": 4.7666, "step": 54200 }, { "epoch": 0.44, "grad_norm": 2.2796552181243896, "learning_rate": 5.556064784882437e-06, "loss": 4.7594, "step": 54300 }, { "epoch": 0.45, "grad_norm": 1.3988897800445557, "learning_rate": 5.547880742128997e-06, "loss": 4.7596, "step": 54400 }, { "epoch": 0.45, "grad_norm": 1.2597512006759644, "learning_rate": 5.539696699375559e-06, "loss": 4.7648, "step": 54500 }, { "epoch": 0.45, "grad_norm": 2.735841989517212, "learning_rate": 5.531512656622119e-06, "loss": 4.766, "step": 54600 }, { "epoch": 0.45, "grad_norm": 1.2517529726028442, "learning_rate": 5.523328613868679e-06, "loss": 4.7585, "step": 54700 }, { "epoch": 0.45, "grad_norm": 1.7145378589630127, "learning_rate": 5.51514457111524e-06, "loss": 4.7575, "step": 54800 }, { "epoch": 0.45, "grad_norm": 1.1680995225906372, "learning_rate": 5.506960528361801e-06, "loss": 4.7611, "step": 54900 }, { "epoch": 0.45, "grad_norm": 3.5684611797332764, "learning_rate": 5.498776485608361e-06, "loss": 4.7579, "step": 55000 }, { "epoch": 0.45, "grad_norm": 1.0898152589797974, "learning_rate": 5.490592442854922e-06, "loss": 4.7587, "step": 55100 }, { "epoch": 0.45, "grad_norm": 1.0716261863708496, "learning_rate": 5.482408400101482e-06, "loss": 4.7605, "step": 55200 }, { "epoch": 0.45, "grad_norm": 2.373514413833618, "learning_rate": 5.4742243573480435e-06, "loss": 4.7626, "step": 55300 }, { "epoch": 0.45, "grad_norm": 3.0533201694488525, "learning_rate": 5.466040314594604e-06, "loss": 4.7601, "step": 55400 }, { "epoch": 0.45, "grad_norm": 0.9586790800094604, "learning_rate": 5.457856271841164e-06, "loss": 4.7607, "step": 55500 }, { "epoch": 0.46, "grad_norm": 1.8159968852996826, "learning_rate": 5.449672229087726e-06, "loss": 4.7613, "step": 55600 }, { "epoch": 0.46, "grad_norm": 1.0811960697174072, "learning_rate": 5.441488186334286e-06, "loss": 4.7613, "step": 55700 }, { "epoch": 0.46, "grad_norm": 1.0583518743515015, "learning_rate": 5.4333041435808465e-06, "loss": 4.7558, "step": 55800 }, { "epoch": 0.46, "grad_norm": 0.993607223033905, "learning_rate": 5.425120100827407e-06, "loss": 4.7577, "step": 55900 }, { "epoch": 0.46, "grad_norm": 1.3715596199035645, "learning_rate": 5.416936058073968e-06, "loss": 4.7565, "step": 56000 }, { "epoch": 0.46, "grad_norm": 1.3586755990982056, "learning_rate": 5.408752015320529e-06, "loss": 4.7627, "step": 56100 }, { "epoch": 0.46, "grad_norm": 1.5538294315338135, "learning_rate": 5.400567972567089e-06, "loss": 4.7616, "step": 56200 }, { "epoch": 0.46, "grad_norm": 1.3117858171463013, "learning_rate": 5.3923839298136495e-06, "loss": 4.7615, "step": 56300 }, { "epoch": 0.46, "grad_norm": 2.4825961589813232, "learning_rate": 5.384199887060211e-06, "loss": 4.7673, "step": 56400 }, { "epoch": 0.46, "grad_norm": 3.6540427207946777, "learning_rate": 5.376015844306771e-06, "loss": 4.7583, "step": 56500 }, { "epoch": 0.46, "grad_norm": 1.217731237411499, "learning_rate": 5.367831801553331e-06, "loss": 4.7647, "step": 56600 }, { "epoch": 0.46, "grad_norm": 1.3690531253814697, "learning_rate": 5.359647758799893e-06, "loss": 4.7607, "step": 56700 }, { "epoch": 0.46, "grad_norm": 2.986236572265625, "learning_rate": 5.351463716046453e-06, "loss": 4.7649, "step": 56800 }, { "epoch": 0.47, "grad_norm": 4.169172763824463, "learning_rate": 5.343279673293014e-06, "loss": 4.7609, "step": 56900 }, { "epoch": 0.47, "grad_norm": 3.941506862640381, "learning_rate": 5.335095630539574e-06, "loss": 4.7588, "step": 57000 }, { "epoch": 0.47, "grad_norm": 1.151626467704773, "learning_rate": 5.326911587786135e-06, "loss": 4.7626, "step": 57100 }, { "epoch": 0.47, "grad_norm": 1.1332154273986816, "learning_rate": 5.318727545032696e-06, "loss": 4.7638, "step": 57200 }, { "epoch": 0.47, "grad_norm": 1.0343974828720093, "learning_rate": 5.3105435022792564e-06, "loss": 4.7591, "step": 57300 }, { "epoch": 0.47, "grad_norm": 1.0772465467453003, "learning_rate": 5.302359459525817e-06, "loss": 4.7594, "step": 57400 }, { "epoch": 0.47, "grad_norm": 5.96359920501709, "learning_rate": 5.294175416772378e-06, "loss": 4.7597, "step": 57500 }, { "epoch": 0.47, "grad_norm": 1.451434850692749, "learning_rate": 5.285991374018938e-06, "loss": 4.7622, "step": 57600 }, { "epoch": 0.47, "grad_norm": 3.3634424209594727, "learning_rate": 5.277807331265498e-06, "loss": 4.7639, "step": 57700 }, { "epoch": 0.47, "grad_norm": 1.3753291368484497, "learning_rate": 5.2696232885120595e-06, "loss": 4.762, "step": 57800 }, { "epoch": 0.47, "grad_norm": 1.0764108896255493, "learning_rate": 5.261439245758621e-06, "loss": 4.7565, "step": 57900 }, { "epoch": 0.47, "grad_norm": 1.7822853326797485, "learning_rate": 5.253255203005181e-06, "loss": 4.76, "step": 58000 }, { "epoch": 0.48, "grad_norm": 1.0505635738372803, "learning_rate": 5.245071160251741e-06, "loss": 4.7563, "step": 58100 }, { "epoch": 0.48, "grad_norm": 2.2111964225769043, "learning_rate": 5.236887117498303e-06, "loss": 4.7613, "step": 58200 }, { "epoch": 0.48, "grad_norm": 1.0773766040802002, "learning_rate": 5.228703074744863e-06, "loss": 4.7645, "step": 58300 }, { "epoch": 0.48, "grad_norm": 1.387395977973938, "learning_rate": 5.220519031991424e-06, "loss": 4.7581, "step": 58400 }, { "epoch": 0.48, "grad_norm": 1.5407155752182007, "learning_rate": 5.212334989237984e-06, "loss": 4.7599, "step": 58500 }, { "epoch": 0.48, "grad_norm": 1.9045255184173584, "learning_rate": 5.204150946484545e-06, "loss": 4.7632, "step": 58600 }, { "epoch": 0.48, "grad_norm": 1.1140589714050293, "learning_rate": 5.195966903731105e-06, "loss": 4.758, "step": 58700 }, { "epoch": 0.48, "grad_norm": 3.0775272846221924, "learning_rate": 5.1877828609776655e-06, "loss": 4.7591, "step": 58800 }, { "epoch": 0.48, "grad_norm": 1.5358976125717163, "learning_rate": 5.179598818224227e-06, "loss": 4.7613, "step": 58900 }, { "epoch": 0.48, "grad_norm": 2.506425380706787, "learning_rate": 5.171414775470788e-06, "loss": 4.7584, "step": 59000 }, { "epoch": 0.48, "grad_norm": 1.562016248703003, "learning_rate": 5.163230732717348e-06, "loss": 4.7603, "step": 59100 }, { "epoch": 0.48, "grad_norm": 3.894599437713623, "learning_rate": 5.155046689963908e-06, "loss": 4.7582, "step": 59200 }, { "epoch": 0.49, "grad_norm": 1.7475073337554932, "learning_rate": 5.14686264721047e-06, "loss": 4.7552, "step": 59300 }, { "epoch": 0.49, "grad_norm": 2.168311357498169, "learning_rate": 5.1386786044570305e-06, "loss": 4.7546, "step": 59400 }, { "epoch": 0.49, "grad_norm": 1.3866901397705078, "learning_rate": 5.130494561703591e-06, "loss": 4.7595, "step": 59500 }, { "epoch": 0.49, "grad_norm": 1.5141569375991821, "learning_rate": 5.122310518950151e-06, "loss": 4.7594, "step": 59600 }, { "epoch": 0.49, "grad_norm": 1.22174072265625, "learning_rate": 5.114126476196712e-06, "loss": 4.7591, "step": 59700 }, { "epoch": 0.49, "grad_norm": 1.8501654863357544, "learning_rate": 5.1059424334432724e-06, "loss": 4.7566, "step": 59800 }, { "epoch": 0.49, "grad_norm": 0.8938846588134766, "learning_rate": 5.0977583906898336e-06, "loss": 4.7601, "step": 59900 }, { "epoch": 0.49, "grad_norm": 3.8000481128692627, "learning_rate": 5.089574347936394e-06, "loss": 4.7645, "step": 60000 }, { "epoch": 0.49, "grad_norm": 2.3883254528045654, "learning_rate": 5.081390305182955e-06, "loss": 4.759, "step": 60100 }, { "epoch": 0.49, "grad_norm": 1.5231989622116089, "learning_rate": 5.073206262429515e-06, "loss": 4.7591, "step": 60200 }, { "epoch": 0.49, "grad_norm": 2.584988832473755, "learning_rate": 5.0650222196760755e-06, "loss": 4.7614, "step": 60300 }, { "epoch": 0.49, "grad_norm": 1.4782294034957886, "learning_rate": 5.0568381769226374e-06, "loss": 4.7577, "step": 60400 }, { "epoch": 0.5, "grad_norm": 2.2520744800567627, "learning_rate": 5.048654134169198e-06, "loss": 4.7576, "step": 60500 }, { "epoch": 0.5, "grad_norm": 1.1555761098861694, "learning_rate": 5.040470091415758e-06, "loss": 4.7613, "step": 60600 }, { "epoch": 0.5, "grad_norm": 1.0871673822402954, "learning_rate": 5.032286048662318e-06, "loss": 4.7534, "step": 60700 }, { "epoch": 0.5, "grad_norm": 0.9198378324508667, "learning_rate": 5.024102005908879e-06, "loss": 4.7599, "step": 60800 }, { "epoch": 0.5, "grad_norm": 1.8254669904708862, "learning_rate": 5.01591796315544e-06, "loss": 4.7577, "step": 60900 }, { "epoch": 0.5, "grad_norm": 2.592374563217163, "learning_rate": 5.007733920402001e-06, "loss": 4.7596, "step": 61000 }, { "epoch": 0.5, "grad_norm": 1.4294451475143433, "learning_rate": 4.999549877648561e-06, "loss": 4.7587, "step": 61100 }, { "epoch": 0.5, "grad_norm": 1.3959410190582275, "learning_rate": 4.991365834895122e-06, "loss": 4.7587, "step": 61200 }, { "epoch": 0.5, "grad_norm": 1.02007257938385, "learning_rate": 4.983181792141682e-06, "loss": 4.7591, "step": 61300 }, { "epoch": 0.5, "grad_norm": 1.9636856317520142, "learning_rate": 4.9749977493882435e-06, "loss": 4.7586, "step": 61400 }, { "epoch": 0.5, "grad_norm": 1.2003675699234009, "learning_rate": 4.966813706634804e-06, "loss": 4.7578, "step": 61500 }, { "epoch": 0.5, "grad_norm": 1.2069259881973267, "learning_rate": 4.958629663881365e-06, "loss": 4.7594, "step": 61600 }, { "epoch": 0.5, "grad_norm": 1.160438895225525, "learning_rate": 4.950445621127925e-06, "loss": 4.7537, "step": 61700 }, { "epoch": 0.51, "grad_norm": 1.4509081840515137, "learning_rate": 4.942261578374486e-06, "loss": 4.7566, "step": 61800 }, { "epoch": 0.51, "grad_norm": 1.4008055925369263, "learning_rate": 4.9340775356210465e-06, "loss": 4.7596, "step": 61900 }, { "epoch": 0.51, "grad_norm": 2.445732593536377, "learning_rate": 4.925893492867608e-06, "loss": 4.7585, "step": 62000 }, { "epoch": 0.51, "grad_norm": 2.155773162841797, "learning_rate": 4.917709450114168e-06, "loss": 4.7594, "step": 62100 }, { "epoch": 0.51, "grad_norm": 1.3300435543060303, "learning_rate": 4.909525407360728e-06, "loss": 4.7599, "step": 62200 }, { "epoch": 0.51, "grad_norm": 1.1474816799163818, "learning_rate": 4.901341364607289e-06, "loss": 4.7575, "step": 62300 }, { "epoch": 0.51, "grad_norm": 1.1090396642684937, "learning_rate": 4.8931573218538495e-06, "loss": 4.7556, "step": 62400 }, { "epoch": 0.51, "grad_norm": 1.4400147199630737, "learning_rate": 4.884973279100411e-06, "loss": 4.7543, "step": 62500 }, { "epoch": 0.51, "grad_norm": 1.534568190574646, "learning_rate": 4.876789236346971e-06, "loss": 4.7542, "step": 62600 }, { "epoch": 0.51, "grad_norm": 1.1177995204925537, "learning_rate": 4.868605193593532e-06, "loss": 4.7557, "step": 62700 }, { "epoch": 0.51, "grad_norm": 1.2250655889511108, "learning_rate": 4.860421150840092e-06, "loss": 4.7588, "step": 62800 }, { "epoch": 0.51, "grad_norm": 1.2332854270935059, "learning_rate": 4.852237108086653e-06, "loss": 4.7623, "step": 62900 }, { "epoch": 0.52, "grad_norm": 1.0365347862243652, "learning_rate": 4.844053065333214e-06, "loss": 4.7591, "step": 63000 }, { "epoch": 0.52, "grad_norm": 3.2265894412994385, "learning_rate": 4.835869022579775e-06, "loss": 4.7589, "step": 63100 }, { "epoch": 0.52, "grad_norm": 1.374605417251587, "learning_rate": 4.827684979826335e-06, "loss": 4.7574, "step": 63200 }, { "epoch": 0.52, "grad_norm": 1.239890694618225, "learning_rate": 4.819500937072895e-06, "loss": 4.7622, "step": 63300 }, { "epoch": 0.52, "grad_norm": 4.042061805725098, "learning_rate": 4.8113168943194564e-06, "loss": 4.7591, "step": 63400 }, { "epoch": 0.52, "grad_norm": 2.166978597640991, "learning_rate": 4.803132851566017e-06, "loss": 4.7594, "step": 63500 }, { "epoch": 0.52, "grad_norm": 1.0814965963363647, "learning_rate": 4.794948808812578e-06, "loss": 4.759, "step": 63600 }, { "epoch": 0.52, "grad_norm": 1.7993803024291992, "learning_rate": 4.786764766059138e-06, "loss": 4.7574, "step": 63700 }, { "epoch": 0.52, "grad_norm": 1.1397624015808105, "learning_rate": 4.778580723305699e-06, "loss": 4.7568, "step": 63800 }, { "epoch": 0.52, "grad_norm": 1.801677942276001, "learning_rate": 4.7703966805522595e-06, "loss": 4.7561, "step": 63900 }, { "epoch": 0.52, "grad_norm": 1.5364161729812622, "learning_rate": 4.762212637798821e-06, "loss": 4.7584, "step": 64000 }, { "epoch": 0.52, "grad_norm": 1.0341291427612305, "learning_rate": 4.754028595045381e-06, "loss": 4.7581, "step": 64100 }, { "epoch": 0.53, "grad_norm": 1.0642578601837158, "learning_rate": 4.745844552291942e-06, "loss": 4.7566, "step": 64200 }, { "epoch": 0.53, "grad_norm": 1.6422146558761597, "learning_rate": 4.737660509538502e-06, "loss": 4.7583, "step": 64300 }, { "epoch": 0.53, "grad_norm": 1.8048427104949951, "learning_rate": 4.7294764667850625e-06, "loss": 4.7575, "step": 64400 }, { "epoch": 0.53, "grad_norm": 1.5397706031799316, "learning_rate": 4.721292424031624e-06, "loss": 4.7603, "step": 64500 }, { "epoch": 0.53, "grad_norm": 1.1673585176467896, "learning_rate": 4.713108381278184e-06, "loss": 4.7603, "step": 64600 }, { "epoch": 0.53, "grad_norm": 1.130509376525879, "learning_rate": 4.704924338524745e-06, "loss": 4.7582, "step": 64700 }, { "epoch": 0.53, "grad_norm": 1.8829139471054077, "learning_rate": 4.696740295771305e-06, "loss": 4.7559, "step": 64800 }, { "epoch": 0.53, "grad_norm": 1.014657974243164, "learning_rate": 4.688556253017866e-06, "loss": 4.7585, "step": 64900 }, { "epoch": 0.53, "grad_norm": 1.3047457933425903, "learning_rate": 4.680372210264427e-06, "loss": 4.7582, "step": 65000 }, { "epoch": 0.53, "grad_norm": 5.515758514404297, "learning_rate": 4.672188167510988e-06, "loss": 4.7581, "step": 65100 }, { "epoch": 0.53, "grad_norm": 1.061341643333435, "learning_rate": 4.664004124757548e-06, "loss": 4.7537, "step": 65200 }, { "epoch": 0.53, "grad_norm": 1.1620299816131592, "learning_rate": 4.655820082004109e-06, "loss": 4.7589, "step": 65300 }, { "epoch": 0.54, "grad_norm": 0.9839982390403748, "learning_rate": 4.647636039250669e-06, "loss": 4.7602, "step": 65400 }, { "epoch": 0.54, "grad_norm": 1.1345555782318115, "learning_rate": 4.6394519964972305e-06, "loss": 4.7595, "step": 65500 }, { "epoch": 0.54, "grad_norm": 1.2723256349563599, "learning_rate": 4.631267953743791e-06, "loss": 4.7631, "step": 65600 }, { "epoch": 0.54, "grad_norm": 1.2903733253479004, "learning_rate": 4.623083910990351e-06, "loss": 4.7572, "step": 65700 }, { "epoch": 0.54, "grad_norm": 1.849879503250122, "learning_rate": 4.614899868236912e-06, "loss": 4.7536, "step": 65800 }, { "epoch": 0.54, "grad_norm": 0.9856821298599243, "learning_rate": 4.6067158254834724e-06, "loss": 4.7566, "step": 65900 }, { "epoch": 0.54, "grad_norm": 1.0516011714935303, "learning_rate": 4.5985317827300335e-06, "loss": 4.7577, "step": 66000 }, { "epoch": 0.54, "grad_norm": 1.0833971500396729, "learning_rate": 4.590347739976594e-06, "loss": 4.7553, "step": 66100 }, { "epoch": 0.54, "grad_norm": 3.236478805541992, "learning_rate": 4.582163697223155e-06, "loss": 4.7524, "step": 66200 }, { "epoch": 0.54, "grad_norm": 4.561278820037842, "learning_rate": 4.573979654469715e-06, "loss": 4.7598, "step": 66300 }, { "epoch": 0.54, "grad_norm": 1.0862793922424316, "learning_rate": 4.565795611716276e-06, "loss": 4.756, "step": 66400 }, { "epoch": 0.54, "grad_norm": 1.248744249343872, "learning_rate": 4.5576115689628366e-06, "loss": 4.7579, "step": 66500 }, { "epoch": 0.55, "grad_norm": 1.2721776962280273, "learning_rate": 4.549427526209398e-06, "loss": 4.7549, "step": 66600 }, { "epoch": 0.55, "grad_norm": 1.6902192831039429, "learning_rate": 4.541243483455958e-06, "loss": 4.7605, "step": 66700 }, { "epoch": 0.55, "grad_norm": 2.721341609954834, "learning_rate": 4.533059440702518e-06, "loss": 4.7619, "step": 66800 }, { "epoch": 0.55, "grad_norm": 1.5549167394638062, "learning_rate": 4.524875397949079e-06, "loss": 4.7581, "step": 66900 }, { "epoch": 0.55, "grad_norm": 4.686578273773193, "learning_rate": 4.51669135519564e-06, "loss": 4.757, "step": 67000 }, { "epoch": 0.55, "grad_norm": 1.2544666528701782, "learning_rate": 4.508507312442201e-06, "loss": 4.7614, "step": 67100 }, { "epoch": 0.55, "grad_norm": 1.3165531158447266, "learning_rate": 4.500323269688761e-06, "loss": 4.7549, "step": 67200 }, { "epoch": 0.55, "grad_norm": 1.4608168601989746, "learning_rate": 4.492139226935322e-06, "loss": 4.7553, "step": 67300 }, { "epoch": 0.55, "grad_norm": 1.003299593925476, "learning_rate": 4.483955184181882e-06, "loss": 4.7638, "step": 67400 }, { "epoch": 0.55, "grad_norm": 1.496551752090454, "learning_rate": 4.4757711414284435e-06, "loss": 4.759, "step": 67500 }, { "epoch": 0.55, "grad_norm": 1.3934059143066406, "learning_rate": 4.467587098675004e-06, "loss": 4.7566, "step": 67600 }, { "epoch": 0.55, "grad_norm": 2.459867238998413, "learning_rate": 4.459403055921565e-06, "loss": 4.7567, "step": 67700 }, { "epoch": 0.55, "grad_norm": 1.2848294973373413, "learning_rate": 4.451219013168125e-06, "loss": 4.7562, "step": 67800 }, { "epoch": 0.56, "grad_norm": 1.5182512998580933, "learning_rate": 4.443034970414685e-06, "loss": 4.7595, "step": 67900 }, { "epoch": 0.56, "grad_norm": 1.2391725778579712, "learning_rate": 4.4348509276612465e-06, "loss": 4.7538, "step": 68000 }, { "epoch": 0.56, "grad_norm": 3.008521318435669, "learning_rate": 4.426666884907807e-06, "loss": 4.7568, "step": 68100 }, { "epoch": 0.56, "grad_norm": 1.6599717140197754, "learning_rate": 4.418482842154368e-06, "loss": 4.7598, "step": 68200 }, { "epoch": 0.56, "grad_norm": 2.2164254188537598, "learning_rate": 4.410298799400928e-06, "loss": 4.7545, "step": 68300 }, { "epoch": 0.56, "grad_norm": 3.473665237426758, "learning_rate": 4.402114756647489e-06, "loss": 4.7601, "step": 68400 }, { "epoch": 0.56, "grad_norm": 1.9182640314102173, "learning_rate": 4.3939307138940495e-06, "loss": 4.7559, "step": 68500 }, { "epoch": 0.56, "grad_norm": 2.2187399864196777, "learning_rate": 4.385746671140611e-06, "loss": 4.7611, "step": 68600 }, { "epoch": 0.56, "grad_norm": 2.2415308952331543, "learning_rate": 4.377562628387171e-06, "loss": 4.7572, "step": 68700 }, { "epoch": 0.56, "grad_norm": 1.0853921175003052, "learning_rate": 4.369378585633732e-06, "loss": 4.7522, "step": 68800 }, { "epoch": 0.56, "grad_norm": 2.0470669269561768, "learning_rate": 4.361194542880292e-06, "loss": 4.7567, "step": 68900 }, { "epoch": 0.56, "grad_norm": 1.5501480102539062, "learning_rate": 4.353010500126853e-06, "loss": 4.7548, "step": 69000 }, { "epoch": 0.57, "grad_norm": 1.0756503343582153, "learning_rate": 4.344826457373414e-06, "loss": 4.7556, "step": 69100 }, { "epoch": 0.57, "grad_norm": 1.0396485328674316, "learning_rate": 4.336642414619974e-06, "loss": 4.756, "step": 69200 }, { "epoch": 0.57, "grad_norm": 1.5130740404129028, "learning_rate": 4.328458371866535e-06, "loss": 4.7538, "step": 69300 }, { "epoch": 0.57, "grad_norm": 1.2191152572631836, "learning_rate": 4.320274329113095e-06, "loss": 4.7594, "step": 69400 }, { "epoch": 0.57, "grad_norm": 1.1031177043914795, "learning_rate": 4.3120902863596564e-06, "loss": 4.7585, "step": 69500 }, { "epoch": 0.57, "grad_norm": 3.345165967941284, "learning_rate": 4.303906243606217e-06, "loss": 4.7601, "step": 69600 }, { "epoch": 0.57, "grad_norm": 1.058370590209961, "learning_rate": 4.295722200852778e-06, "loss": 4.7575, "step": 69700 }, { "epoch": 0.57, "grad_norm": 1.364247441291809, "learning_rate": 4.287538158099338e-06, "loss": 4.7592, "step": 69800 }, { "epoch": 0.57, "grad_norm": 2.520071029663086, "learning_rate": 4.279354115345899e-06, "loss": 4.7597, "step": 69900 }, { "epoch": 0.57, "grad_norm": 1.4218943119049072, "learning_rate": 4.2711700725924595e-06, "loss": 4.7525, "step": 70000 }, { "epoch": 0.57, "grad_norm": 2.9582276344299316, "learning_rate": 4.2629860298390206e-06, "loss": 4.7611, "step": 70100 }, { "epoch": 0.57, "grad_norm": 2.1016061305999756, "learning_rate": 4.254801987085581e-06, "loss": 4.7557, "step": 70200 }, { "epoch": 0.58, "grad_norm": 1.1501710414886475, "learning_rate": 4.246617944332141e-06, "loss": 4.7582, "step": 70300 }, { "epoch": 0.58, "grad_norm": 2.3157947063446045, "learning_rate": 4.238433901578702e-06, "loss": 4.7574, "step": 70400 }, { "epoch": 0.58, "grad_norm": 1.0421010255813599, "learning_rate": 4.2302498588252625e-06, "loss": 4.7583, "step": 70500 }, { "epoch": 0.58, "grad_norm": 1.3601773977279663, "learning_rate": 4.222065816071824e-06, "loss": 4.7567, "step": 70600 }, { "epoch": 0.58, "grad_norm": 1.1386362314224243, "learning_rate": 4.213881773318384e-06, "loss": 4.7535, "step": 70700 }, { "epoch": 0.58, "grad_norm": 1.3439152240753174, "learning_rate": 4.205697730564945e-06, "loss": 4.7595, "step": 70800 }, { "epoch": 0.58, "grad_norm": 1.9923715591430664, "learning_rate": 4.197513687811505e-06, "loss": 4.7561, "step": 70900 }, { "epoch": 0.58, "grad_norm": 1.0728856325149536, "learning_rate": 4.189329645058066e-06, "loss": 4.7528, "step": 71000 }, { "epoch": 0.58, "grad_norm": 1.5504320859909058, "learning_rate": 4.181145602304627e-06, "loss": 4.7555, "step": 71100 }, { "epoch": 0.58, "grad_norm": 3.476879358291626, "learning_rate": 4.172961559551188e-06, "loss": 4.7571, "step": 71200 }, { "epoch": 0.58, "grad_norm": 1.4305684566497803, "learning_rate": 4.164777516797748e-06, "loss": 4.7501, "step": 71300 }, { "epoch": 0.58, "grad_norm": 1.2255500555038452, "learning_rate": 4.156593474044308e-06, "loss": 4.7591, "step": 71400 }, { "epoch": 0.59, "grad_norm": 2.292752742767334, "learning_rate": 4.148409431290869e-06, "loss": 4.7576, "step": 71500 }, { "epoch": 0.59, "grad_norm": 1.9670140743255615, "learning_rate": 4.14022538853743e-06, "loss": 4.7605, "step": 71600 }, { "epoch": 0.59, "grad_norm": 2.035198450088501, "learning_rate": 4.132041345783991e-06, "loss": 4.7564, "step": 71700 }, { "epoch": 0.59, "grad_norm": 1.918428659439087, "learning_rate": 4.123857303030551e-06, "loss": 4.7538, "step": 71800 }, { "epoch": 0.59, "grad_norm": 4.245315074920654, "learning_rate": 4.115673260277112e-06, "loss": 4.7585, "step": 71900 }, { "epoch": 0.59, "grad_norm": 3.4246652126312256, "learning_rate": 4.107489217523672e-06, "loss": 4.7507, "step": 72000 }, { "epoch": 0.59, "grad_norm": 1.2266836166381836, "learning_rate": 4.0993051747702335e-06, "loss": 4.7602, "step": 72100 }, { "epoch": 0.59, "grad_norm": 1.9559603929519653, "learning_rate": 4.091121132016794e-06, "loss": 4.7555, "step": 72200 }, { "epoch": 0.59, "grad_norm": 2.2520241737365723, "learning_rate": 4.082937089263355e-06, "loss": 4.7551, "step": 72300 }, { "epoch": 0.59, "grad_norm": 2.5236260890960693, "learning_rate": 4.074753046509915e-06, "loss": 4.7552, "step": 72400 }, { "epoch": 0.59, "grad_norm": 1.0424671173095703, "learning_rate": 4.066569003756476e-06, "loss": 4.7548, "step": 72500 }, { "epoch": 0.59, "grad_norm": 1.1566205024719238, "learning_rate": 4.0583849610030366e-06, "loss": 4.7522, "step": 72600 }, { "epoch": 0.59, "grad_norm": 1.9585150480270386, "learning_rate": 4.050200918249597e-06, "loss": 4.7609, "step": 72700 }, { "epoch": 0.6, "grad_norm": 6.500349044799805, "learning_rate": 4.042016875496158e-06, "loss": 4.7562, "step": 72800 }, { "epoch": 0.6, "grad_norm": 1.1571673154830933, "learning_rate": 4.033832832742718e-06, "loss": 4.7571, "step": 72900 }, { "epoch": 0.6, "grad_norm": 1.7180365324020386, "learning_rate": 4.025648789989279e-06, "loss": 4.7614, "step": 73000 }, { "epoch": 0.6, "grad_norm": 1.5343599319458008, "learning_rate": 4.01746474723584e-06, "loss": 4.7588, "step": 73100 }, { "epoch": 0.6, "grad_norm": 1.5855658054351807, "learning_rate": 4.009280704482401e-06, "loss": 4.7603, "step": 73200 }, { "epoch": 0.6, "grad_norm": 1.0107240676879883, "learning_rate": 4.001096661728961e-06, "loss": 4.7576, "step": 73300 }, { "epoch": 0.6, "grad_norm": 1.6505345106124878, "learning_rate": 3.992912618975522e-06, "loss": 4.7562, "step": 73400 }, { "epoch": 0.6, "grad_norm": 2.6212563514709473, "learning_rate": 3.984728576222082e-06, "loss": 4.7516, "step": 73500 }, { "epoch": 0.6, "grad_norm": 1.5305055379867554, "learning_rate": 3.9765445334686435e-06, "loss": 4.7533, "step": 73600 }, { "epoch": 0.6, "grad_norm": 3.195974826812744, "learning_rate": 3.968360490715204e-06, "loss": 4.7509, "step": 73700 }, { "epoch": 0.6, "grad_norm": 1.088273286819458, "learning_rate": 3.960176447961764e-06, "loss": 4.7559, "step": 73800 }, { "epoch": 0.6, "grad_norm": 2.045375108718872, "learning_rate": 3.951992405208325e-06, "loss": 4.7516, "step": 73900 }, { "epoch": 0.61, "grad_norm": 1.5279570817947388, "learning_rate": 3.943808362454885e-06, "loss": 4.7566, "step": 74000 }, { "epoch": 0.61, "grad_norm": 1.987199306488037, "learning_rate": 3.9356243197014465e-06, "loss": 4.7531, "step": 74100 }, { "epoch": 0.61, "grad_norm": 1.7594095468521118, "learning_rate": 3.927440276948007e-06, "loss": 4.7568, "step": 74200 }, { "epoch": 0.61, "grad_norm": 1.9795931577682495, "learning_rate": 3.919256234194568e-06, "loss": 4.7606, "step": 74300 }, { "epoch": 0.61, "grad_norm": 1.383016586303711, "learning_rate": 3.911072191441128e-06, "loss": 4.7559, "step": 74400 }, { "epoch": 0.61, "grad_norm": 1.4739179611206055, "learning_rate": 3.902888148687689e-06, "loss": 4.7553, "step": 74500 }, { "epoch": 0.61, "grad_norm": 1.1515618562698364, "learning_rate": 3.89470410593425e-06, "loss": 4.7608, "step": 74600 }, { "epoch": 0.61, "grad_norm": 1.4380580186843872, "learning_rate": 3.886520063180811e-06, "loss": 4.7578, "step": 74700 }, { "epoch": 0.61, "grad_norm": 1.187768578529358, "learning_rate": 3.878336020427371e-06, "loss": 4.7533, "step": 74800 }, { "epoch": 0.61, "grad_norm": 4.768670082092285, "learning_rate": 3.870151977673931e-06, "loss": 4.7573, "step": 74900 }, { "epoch": 0.61, "grad_norm": 1.2797937393188477, "learning_rate": 3.861967934920492e-06, "loss": 4.7546, "step": 75000 }, { "epoch": 0.61, "grad_norm": 2.6686596870422363, "learning_rate": 3.8537838921670525e-06, "loss": 4.7562, "step": 75100 }, { "epoch": 0.62, "grad_norm": 2.777021646499634, "learning_rate": 3.845599849413614e-06, "loss": 4.7578, "step": 75200 }, { "epoch": 0.62, "grad_norm": 1.4774482250213623, "learning_rate": 3.837415806660174e-06, "loss": 4.7553, "step": 75300 }, { "epoch": 0.62, "grad_norm": 1.154783010482788, "learning_rate": 3.829231763906735e-06, "loss": 4.7536, "step": 75400 }, { "epoch": 0.62, "grad_norm": 1.1363816261291504, "learning_rate": 3.821047721153295e-06, "loss": 4.7553, "step": 75500 }, { "epoch": 0.62, "grad_norm": 1.314833402633667, "learning_rate": 3.8128636783998564e-06, "loss": 4.754, "step": 75600 }, { "epoch": 0.62, "grad_norm": 2.0026655197143555, "learning_rate": 3.804679635646417e-06, "loss": 4.7564, "step": 75700 }, { "epoch": 0.62, "grad_norm": 2.530662775039673, "learning_rate": 3.7964955928929774e-06, "loss": 4.7621, "step": 75800 }, { "epoch": 0.62, "grad_norm": 1.7200578451156616, "learning_rate": 3.7883115501395385e-06, "loss": 4.7534, "step": 75900 }, { "epoch": 0.62, "grad_norm": 1.1230708360671997, "learning_rate": 3.7801275073860988e-06, "loss": 4.7573, "step": 76000 }, { "epoch": 0.62, "grad_norm": 1.1518919467926025, "learning_rate": 3.77194346463266e-06, "loss": 4.756, "step": 76100 }, { "epoch": 0.62, "grad_norm": 4.6440582275390625, "learning_rate": 3.76375942187922e-06, "loss": 4.7538, "step": 76200 }, { "epoch": 0.62, "grad_norm": 1.2866283655166626, "learning_rate": 3.755575379125781e-06, "loss": 4.7572, "step": 76300 }, { "epoch": 0.63, "grad_norm": 1.0763148069381714, "learning_rate": 3.7473913363723415e-06, "loss": 4.7568, "step": 76400 }, { "epoch": 0.63, "grad_norm": 1.0883269309997559, "learning_rate": 3.739207293618902e-06, "loss": 4.7562, "step": 76500 }, { "epoch": 0.63, "grad_norm": 1.50298011302948, "learning_rate": 3.7310232508654625e-06, "loss": 4.7512, "step": 76600 }, { "epoch": 0.63, "grad_norm": 1.144468069076538, "learning_rate": 3.7228392081120236e-06, "loss": 4.7571, "step": 76700 }, { "epoch": 0.63, "grad_norm": 1.2953712940216064, "learning_rate": 3.7146551653585843e-06, "loss": 4.7502, "step": 76800 }, { "epoch": 0.63, "grad_norm": 3.047788143157959, "learning_rate": 3.7064711226051445e-06, "loss": 4.7561, "step": 76900 }, { "epoch": 0.63, "grad_norm": 1.7507997751235962, "learning_rate": 3.6982870798517057e-06, "loss": 4.7547, "step": 77000 }, { "epoch": 0.63, "grad_norm": 1.189469814300537, "learning_rate": 3.690103037098266e-06, "loss": 4.7554, "step": 77100 }, { "epoch": 0.63, "grad_norm": 1.2107117176055908, "learning_rate": 3.681918994344827e-06, "loss": 4.7544, "step": 77200 }, { "epoch": 0.63, "grad_norm": 1.0349069833755493, "learning_rate": 3.6737349515913873e-06, "loss": 4.7603, "step": 77300 }, { "epoch": 0.63, "grad_norm": 1.1641030311584473, "learning_rate": 3.665550908837948e-06, "loss": 4.7566, "step": 77400 }, { "epoch": 0.63, "grad_norm": 1.1426669359207153, "learning_rate": 3.6573668660845087e-06, "loss": 4.7535, "step": 77500 }, { "epoch": 0.64, "grad_norm": 2.1960771083831787, "learning_rate": 3.6491828233310694e-06, "loss": 4.7559, "step": 77600 }, { "epoch": 0.64, "grad_norm": 1.1451727151870728, "learning_rate": 3.6409987805776296e-06, "loss": 4.7591, "step": 77700 }, { "epoch": 0.64, "grad_norm": 1.3886533975601196, "learning_rate": 3.6328147378241908e-06, "loss": 4.7553, "step": 77800 }, { "epoch": 0.64, "grad_norm": 1.1393433809280396, "learning_rate": 3.6246306950707514e-06, "loss": 4.7581, "step": 77900 }, { "epoch": 0.64, "grad_norm": 1.2284306287765503, "learning_rate": 3.616446652317312e-06, "loss": 4.7566, "step": 78000 }, { "epoch": 0.64, "grad_norm": 1.0621302127838135, "learning_rate": 3.608262609563873e-06, "loss": 4.7532, "step": 78100 }, { "epoch": 0.64, "grad_norm": 1.1183658838272095, "learning_rate": 3.600078566810433e-06, "loss": 4.7583, "step": 78200 }, { "epoch": 0.64, "grad_norm": 1.1688071489334106, "learning_rate": 3.591894524056994e-06, "loss": 4.7542, "step": 78300 }, { "epoch": 0.64, "grad_norm": 3.1448984146118164, "learning_rate": 3.5837104813035545e-06, "loss": 4.7592, "step": 78400 }, { "epoch": 0.64, "grad_norm": 1.1687928438186646, "learning_rate": 3.5755264385501156e-06, "loss": 4.7553, "step": 78500 }, { "epoch": 0.64, "grad_norm": 1.1805286407470703, "learning_rate": 3.567342395796676e-06, "loss": 4.7544, "step": 78600 }, { "epoch": 0.64, "grad_norm": 2.4032955169677734, "learning_rate": 3.5591583530432365e-06, "loss": 4.7569, "step": 78700 }, { "epoch": 0.64, "grad_norm": 3.784090757369995, "learning_rate": 3.550974310289797e-06, "loss": 4.7565, "step": 78800 }, { "epoch": 0.65, "grad_norm": 1.2469580173492432, "learning_rate": 3.542790267536358e-06, "loss": 4.7485, "step": 78900 }, { "epoch": 0.65, "grad_norm": 2.6100597381591797, "learning_rate": 3.5346062247829186e-06, "loss": 4.7561, "step": 79000 }, { "epoch": 0.65, "grad_norm": 1.4072147607803345, "learning_rate": 3.5264221820294793e-06, "loss": 4.7556, "step": 79100 }, { "epoch": 0.65, "grad_norm": 1.2158293724060059, "learning_rate": 3.51823813927604e-06, "loss": 4.7503, "step": 79200 }, { "epoch": 0.65, "grad_norm": 1.4874674081802368, "learning_rate": 3.5100540965226003e-06, "loss": 4.7547, "step": 79300 }, { "epoch": 0.65, "grad_norm": 1.221482515335083, "learning_rate": 3.5018700537691614e-06, "loss": 4.7559, "step": 79400 }, { "epoch": 0.65, "grad_norm": 1.1589709520339966, "learning_rate": 3.4936860110157216e-06, "loss": 4.7545, "step": 79500 }, { "epoch": 0.65, "grad_norm": 1.2871575355529785, "learning_rate": 3.4855019682622828e-06, "loss": 4.7522, "step": 79600 }, { "epoch": 0.65, "grad_norm": 1.7240387201309204, "learning_rate": 3.477317925508843e-06, "loss": 4.7566, "step": 79700 }, { "epoch": 0.65, "grad_norm": 6.059484004974365, "learning_rate": 3.4691338827554037e-06, "loss": 4.7566, "step": 79800 }, { "epoch": 0.65, "grad_norm": 1.1639505624771118, "learning_rate": 3.4609498400019644e-06, "loss": 4.7521, "step": 79900 }, { "epoch": 0.65, "grad_norm": 1.1786649227142334, "learning_rate": 3.452765797248525e-06, "loss": 4.7475, "step": 80000 }, { "epoch": 0.66, "grad_norm": 1.275763988494873, "learning_rate": 3.4445817544950862e-06, "loss": 4.7512, "step": 80100 }, { "epoch": 0.66, "grad_norm": 2.3286848068237305, "learning_rate": 3.4363977117416465e-06, "loss": 4.7573, "step": 80200 }, { "epoch": 0.66, "grad_norm": 1.2990089654922485, "learning_rate": 3.428213668988207e-06, "loss": 4.752, "step": 80300 }, { "epoch": 0.66, "grad_norm": 2.1798534393310547, "learning_rate": 3.4200296262347674e-06, "loss": 4.7566, "step": 80400 }, { "epoch": 0.66, "grad_norm": 2.029482841491699, "learning_rate": 3.4118455834813286e-06, "loss": 4.7528, "step": 80500 }, { "epoch": 0.66, "grad_norm": 1.3646825551986694, "learning_rate": 3.403661540727889e-06, "loss": 4.7567, "step": 80600 }, { "epoch": 0.66, "grad_norm": 1.1107501983642578, "learning_rate": 3.39547749797445e-06, "loss": 4.7518, "step": 80700 }, { "epoch": 0.66, "grad_norm": 1.7238624095916748, "learning_rate": 3.38729345522101e-06, "loss": 4.7535, "step": 80800 }, { "epoch": 0.66, "grad_norm": 1.2147496938705444, "learning_rate": 3.379109412467571e-06, "loss": 4.7529, "step": 80900 }, { "epoch": 0.66, "grad_norm": 1.3453848361968994, "learning_rate": 3.3709253697141316e-06, "loss": 4.7549, "step": 81000 }, { "epoch": 0.66, "grad_norm": 1.3312301635742188, "learning_rate": 3.3627413269606923e-06, "loss": 4.7533, "step": 81100 }, { "epoch": 0.66, "grad_norm": 1.0629857778549194, "learning_rate": 3.3545572842072534e-06, "loss": 4.754, "step": 81200 }, { "epoch": 0.67, "grad_norm": 1.145863652229309, "learning_rate": 3.3463732414538136e-06, "loss": 4.7561, "step": 81300 }, { "epoch": 0.67, "grad_norm": 1.1477543115615845, "learning_rate": 3.3381891987003743e-06, "loss": 4.7546, "step": 81400 }, { "epoch": 0.67, "grad_norm": 2.111903190612793, "learning_rate": 3.330005155946935e-06, "loss": 4.7523, "step": 81500 }, { "epoch": 0.67, "grad_norm": 2.4378135204315186, "learning_rate": 3.3218211131934957e-06, "loss": 4.7524, "step": 81600 }, { "epoch": 0.67, "grad_norm": 1.0920718908309937, "learning_rate": 3.313637070440056e-06, "loss": 4.7579, "step": 81700 }, { "epoch": 0.67, "grad_norm": 1.142166018486023, "learning_rate": 3.305453027686617e-06, "loss": 4.751, "step": 81800 }, { "epoch": 0.67, "grad_norm": 2.497532844543457, "learning_rate": 3.2972689849331774e-06, "loss": 4.7587, "step": 81900 }, { "epoch": 0.67, "grad_norm": 1.1811760663986206, "learning_rate": 3.2890849421797385e-06, "loss": 4.7565, "step": 82000 }, { "epoch": 0.67, "grad_norm": 1.4381294250488281, "learning_rate": 3.2809008994262987e-06, "loss": 4.7523, "step": 82100 }, { "epoch": 0.67, "grad_norm": 1.1105141639709473, "learning_rate": 3.2727168566728594e-06, "loss": 4.755, "step": 82200 }, { "epoch": 0.67, "grad_norm": 1.0709772109985352, "learning_rate": 3.2645328139194206e-06, "loss": 4.7518, "step": 82300 }, { "epoch": 0.67, "grad_norm": 1.3575836420059204, "learning_rate": 3.256348771165981e-06, "loss": 4.7538, "step": 82400 }, { "epoch": 0.68, "grad_norm": 1.1453856229782104, "learning_rate": 3.2481647284125415e-06, "loss": 4.7567, "step": 82500 }, { "epoch": 0.68, "grad_norm": 1.2590690851211548, "learning_rate": 3.239980685659102e-06, "loss": 4.7556, "step": 82600 }, { "epoch": 0.68, "grad_norm": 1.3445689678192139, "learning_rate": 3.231796642905663e-06, "loss": 4.7529, "step": 82700 }, { "epoch": 0.68, "grad_norm": 1.1240034103393555, "learning_rate": 3.223612600152223e-06, "loss": 4.7595, "step": 82800 }, { "epoch": 0.68, "grad_norm": 1.2913769483566284, "learning_rate": 3.2154285573987843e-06, "loss": 4.7523, "step": 82900 }, { "epoch": 0.68, "grad_norm": 1.2136105298995972, "learning_rate": 3.2072445146453445e-06, "loss": 4.7548, "step": 83000 }, { "epoch": 0.68, "grad_norm": 1.0630725622177124, "learning_rate": 3.1990604718919057e-06, "loss": 4.7551, "step": 83100 }, { "epoch": 0.68, "grad_norm": 1.495082139968872, "learning_rate": 3.190876429138466e-06, "loss": 4.7553, "step": 83200 }, { "epoch": 0.68, "grad_norm": 0.9895689487457275, "learning_rate": 3.1826923863850266e-06, "loss": 4.759, "step": 83300 }, { "epoch": 0.68, "grad_norm": 1.4668093919754028, "learning_rate": 3.1745083436315877e-06, "loss": 4.7561, "step": 83400 }, { "epoch": 0.68, "grad_norm": 1.5256825685501099, "learning_rate": 3.166324300878148e-06, "loss": 4.7573, "step": 83500 }, { "epoch": 0.68, "grad_norm": 3.0631277561187744, "learning_rate": 3.158140258124709e-06, "loss": 4.7543, "step": 83600 }, { "epoch": 0.69, "grad_norm": 1.171787977218628, "learning_rate": 3.1499562153712694e-06, "loss": 4.7508, "step": 83700 }, { "epoch": 0.69, "grad_norm": 7.035879611968994, "learning_rate": 3.14177217261783e-06, "loss": 4.7519, "step": 83800 }, { "epoch": 0.69, "grad_norm": 2.3109359741210938, "learning_rate": 3.1335881298643903e-06, "loss": 4.7573, "step": 83900 }, { "epoch": 0.69, "grad_norm": 2.3658266067504883, "learning_rate": 3.1254040871109514e-06, "loss": 4.754, "step": 84000 }, { "epoch": 0.69, "grad_norm": 1.6524670124053955, "learning_rate": 3.1172200443575117e-06, "loss": 4.7549, "step": 84100 }, { "epoch": 0.69, "grad_norm": 1.797340989112854, "learning_rate": 3.109036001604073e-06, "loss": 4.7499, "step": 84200 }, { "epoch": 0.69, "grad_norm": 3.3878042697906494, "learning_rate": 3.100851958850633e-06, "loss": 4.7518, "step": 84300 }, { "epoch": 0.69, "grad_norm": 1.5656503438949585, "learning_rate": 3.0926679160971938e-06, "loss": 4.7588, "step": 84400 }, { "epoch": 0.69, "grad_norm": 1.4081205129623413, "learning_rate": 3.084483873343755e-06, "loss": 4.7587, "step": 84500 }, { "epoch": 0.69, "grad_norm": 2.011707305908203, "learning_rate": 3.076299830590315e-06, "loss": 4.7525, "step": 84600 }, { "epoch": 0.69, "grad_norm": 1.1103359460830688, "learning_rate": 3.0681157878368763e-06, "loss": 4.7547, "step": 84700 }, { "epoch": 0.69, "grad_norm": 1.331764578819275, "learning_rate": 3.0599317450834365e-06, "loss": 4.7544, "step": 84800 }, { "epoch": 0.69, "grad_norm": 1.1731749773025513, "learning_rate": 3.0517477023299972e-06, "loss": 4.7527, "step": 84900 }, { "epoch": 0.7, "grad_norm": 2.4476029872894287, "learning_rate": 3.043563659576558e-06, "loss": 4.7529, "step": 85000 }, { "epoch": 0.7, "grad_norm": 2.6501026153564453, "learning_rate": 3.0353796168231186e-06, "loss": 4.7495, "step": 85100 }, { "epoch": 0.7, "grad_norm": 1.4330114126205444, "learning_rate": 3.027195574069679e-06, "loss": 4.7551, "step": 85200 }, { "epoch": 0.7, "grad_norm": 1.794797420501709, "learning_rate": 3.01901153131624e-06, "loss": 4.7554, "step": 85300 }, { "epoch": 0.7, "grad_norm": 1.1396609544754028, "learning_rate": 3.0108274885628003e-06, "loss": 4.7521, "step": 85400 }, { "epoch": 0.7, "grad_norm": 1.5291541814804077, "learning_rate": 3.0026434458093614e-06, "loss": 4.7538, "step": 85500 }, { "epoch": 0.7, "grad_norm": 0.9390245676040649, "learning_rate": 2.9944594030559216e-06, "loss": 4.7499, "step": 85600 }, { "epoch": 0.7, "grad_norm": 4.141879558563232, "learning_rate": 2.9862753603024823e-06, "loss": 4.7587, "step": 85700 }, { "epoch": 0.7, "grad_norm": 2.151954412460327, "learning_rate": 2.9780913175490434e-06, "loss": 4.7525, "step": 85800 }, { "epoch": 0.7, "grad_norm": 1.340173363685608, "learning_rate": 2.9699072747956037e-06, "loss": 4.7519, "step": 85900 }, { "epoch": 0.7, "grad_norm": 1.9204574823379517, "learning_rate": 2.9617232320421644e-06, "loss": 4.7583, "step": 86000 }, { "epoch": 0.7, "grad_norm": 1.391839623451233, "learning_rate": 2.953539189288725e-06, "loss": 4.751, "step": 86100 }, { "epoch": 0.71, "grad_norm": 1.4064441919326782, "learning_rate": 2.9453551465352858e-06, "loss": 4.7506, "step": 86200 }, { "epoch": 0.71, "grad_norm": 1.2319107055664062, "learning_rate": 2.937171103781846e-06, "loss": 4.7551, "step": 86300 }, { "epoch": 0.71, "grad_norm": 2.515320301055908, "learning_rate": 2.928987061028407e-06, "loss": 4.7517, "step": 86400 }, { "epoch": 0.71, "grad_norm": 2.4007177352905273, "learning_rate": 2.9208030182749674e-06, "loss": 4.7513, "step": 86500 }, { "epoch": 0.71, "grad_norm": 1.4867286682128906, "learning_rate": 2.9126189755215285e-06, "loss": 4.7549, "step": 86600 }, { "epoch": 0.71, "grad_norm": 1.2570160627365112, "learning_rate": 2.904434932768089e-06, "loss": 4.753, "step": 86700 }, { "epoch": 0.71, "grad_norm": 2.847069025039673, "learning_rate": 2.8962508900146495e-06, "loss": 4.7555, "step": 86800 }, { "epoch": 0.71, "grad_norm": 1.0997235774993896, "learning_rate": 2.8880668472612106e-06, "loss": 4.7532, "step": 86900 }, { "epoch": 0.71, "grad_norm": 1.8394368886947632, "learning_rate": 2.879882804507771e-06, "loss": 4.7504, "step": 87000 }, { "epoch": 0.71, "grad_norm": 3.6865549087524414, "learning_rate": 2.871698761754332e-06, "loss": 4.7567, "step": 87100 }, { "epoch": 0.71, "grad_norm": 3.022850275039673, "learning_rate": 2.8635147190008923e-06, "loss": 4.7509, "step": 87200 }, { "epoch": 0.71, "grad_norm": 1.7531808614730835, "learning_rate": 2.855330676247453e-06, "loss": 4.7527, "step": 87300 }, { "epoch": 0.72, "grad_norm": 2.0469372272491455, "learning_rate": 2.8471466334940136e-06, "loss": 4.7564, "step": 87400 }, { "epoch": 0.72, "grad_norm": 1.4322601556777954, "learning_rate": 2.8389625907405743e-06, "loss": 4.7552, "step": 87500 }, { "epoch": 0.72, "grad_norm": 1.2034333944320679, "learning_rate": 2.8307785479871346e-06, "loss": 4.7555, "step": 87600 }, { "epoch": 0.72, "grad_norm": 1.0759299993515015, "learning_rate": 2.8225945052336957e-06, "loss": 4.7508, "step": 87700 }, { "epoch": 0.72, "grad_norm": 1.1701573133468628, "learning_rate": 2.814410462480256e-06, "loss": 4.7535, "step": 87800 }, { "epoch": 0.72, "grad_norm": 1.4818124771118164, "learning_rate": 2.8062264197268167e-06, "loss": 4.7528, "step": 87900 }, { "epoch": 0.72, "grad_norm": 1.362298846244812, "learning_rate": 2.7980423769733778e-06, "loss": 4.7488, "step": 88000 }, { "epoch": 0.72, "grad_norm": 0.9951609969139099, "learning_rate": 2.789858334219938e-06, "loss": 4.7509, "step": 88100 }, { "epoch": 0.72, "grad_norm": 1.2555766105651855, "learning_rate": 2.781674291466499e-06, "loss": 4.7559, "step": 88200 }, { "epoch": 0.72, "grad_norm": 1.8623309135437012, "learning_rate": 2.7734902487130594e-06, "loss": 4.7489, "step": 88300 }, { "epoch": 0.72, "grad_norm": 1.2883721590042114, "learning_rate": 2.76530620595962e-06, "loss": 4.751, "step": 88400 }, { "epoch": 0.72, "grad_norm": 1.1867636442184448, "learning_rate": 2.757122163206181e-06, "loss": 4.7524, "step": 88500 }, { "epoch": 0.73, "grad_norm": 1.4036273956298828, "learning_rate": 2.7489381204527415e-06, "loss": 4.755, "step": 88600 }, { "epoch": 0.73, "grad_norm": 1.2148162126541138, "learning_rate": 2.7407540776993018e-06, "loss": 4.7582, "step": 88700 }, { "epoch": 0.73, "grad_norm": 2.2214956283569336, "learning_rate": 2.732570034945863e-06, "loss": 4.7543, "step": 88800 }, { "epoch": 0.73, "grad_norm": 1.103264331817627, "learning_rate": 2.724385992192423e-06, "loss": 4.7468, "step": 88900 }, { "epoch": 0.73, "grad_norm": 1.3318493366241455, "learning_rate": 2.7162019494389843e-06, "loss": 4.7547, "step": 89000 }, { "epoch": 0.73, "grad_norm": 1.7869521379470825, "learning_rate": 2.708017906685545e-06, "loss": 4.7528, "step": 89100 }, { "epoch": 0.73, "grad_norm": 1.0730737447738647, "learning_rate": 2.6998338639321052e-06, "loss": 4.7554, "step": 89200 }, { "epoch": 0.73, "grad_norm": 1.677322268486023, "learning_rate": 2.6916498211786663e-06, "loss": 4.7574, "step": 89300 }, { "epoch": 0.73, "grad_norm": 1.7166889905929565, "learning_rate": 2.6834657784252266e-06, "loss": 4.7563, "step": 89400 }, { "epoch": 0.73, "grad_norm": 1.3023245334625244, "learning_rate": 2.6752817356717877e-06, "loss": 4.7569, "step": 89500 }, { "epoch": 0.73, "grad_norm": 1.2815351486206055, "learning_rate": 2.667097692918348e-06, "loss": 4.7568, "step": 89600 }, { "epoch": 0.73, "grad_norm": 1.1161749362945557, "learning_rate": 2.6589136501649087e-06, "loss": 4.7536, "step": 89700 }, { "epoch": 0.73, "grad_norm": 1.4548900127410889, "learning_rate": 2.650729607411469e-06, "loss": 4.7566, "step": 89800 }, { "epoch": 0.74, "grad_norm": 8.324539184570312, "learning_rate": 2.64254556465803e-06, "loss": 4.7539, "step": 89900 }, { "epoch": 0.74, "grad_norm": 2.0228288173675537, "learning_rate": 2.6343615219045903e-06, "loss": 4.7514, "step": 90000 }, { "epoch": 0.74, "grad_norm": 1.1695142984390259, "learning_rate": 2.6261774791511514e-06, "loss": 4.7519, "step": 90100 }, { "epoch": 0.74, "grad_norm": 1.6865144968032837, "learning_rate": 2.617993436397712e-06, "loss": 4.7557, "step": 90200 }, { "epoch": 0.74, "grad_norm": 0.9601481556892395, "learning_rate": 2.6098093936442724e-06, "loss": 4.7518, "step": 90300 }, { "epoch": 0.74, "grad_norm": 1.0379222631454468, "learning_rate": 2.6016253508908335e-06, "loss": 4.7521, "step": 90400 }, { "epoch": 0.74, "grad_norm": 1.6704763174057007, "learning_rate": 2.5934413081373938e-06, "loss": 4.7526, "step": 90500 }, { "epoch": 0.74, "grad_norm": 1.0544642210006714, "learning_rate": 2.585257265383955e-06, "loss": 4.7529, "step": 90600 }, { "epoch": 0.74, "grad_norm": 1.2152049541473389, "learning_rate": 2.577073222630515e-06, "loss": 4.7557, "step": 90700 }, { "epoch": 0.74, "grad_norm": 1.1299751996994019, "learning_rate": 2.568889179877076e-06, "loss": 4.7552, "step": 90800 }, { "epoch": 0.74, "grad_norm": 1.3130440711975098, "learning_rate": 2.5607051371236365e-06, "loss": 4.7512, "step": 90900 }, { "epoch": 0.74, "grad_norm": 1.1738765239715576, "learning_rate": 2.5525210943701972e-06, "loss": 4.7478, "step": 91000 }, { "epoch": 0.75, "grad_norm": 1.3825798034667969, "learning_rate": 2.5443370516167575e-06, "loss": 4.7545, "step": 91100 }, { "epoch": 0.75, "grad_norm": 1.2850853204727173, "learning_rate": 2.5361530088633186e-06, "loss": 4.7546, "step": 91200 }, { "epoch": 0.75, "grad_norm": 1.215085506439209, "learning_rate": 2.5279689661098793e-06, "loss": 4.7488, "step": 91300 }, { "epoch": 0.75, "grad_norm": 1.4124336242675781, "learning_rate": 2.5197849233564396e-06, "loss": 4.7441, "step": 91400 }, { "epoch": 0.75, "grad_norm": 2.5708861351013184, "learning_rate": 2.5116008806030007e-06, "loss": 4.7553, "step": 91500 }, { "epoch": 0.75, "grad_norm": 1.9249249696731567, "learning_rate": 2.503416837849561e-06, "loss": 4.7565, "step": 91600 }, { "epoch": 0.75, "grad_norm": 1.1398611068725586, "learning_rate": 2.4952327950961216e-06, "loss": 4.7513, "step": 91700 }, { "epoch": 0.75, "grad_norm": 2.037564516067505, "learning_rate": 2.4870487523426827e-06, "loss": 4.7517, "step": 91800 }, { "epoch": 0.75, "grad_norm": 1.4297902584075928, "learning_rate": 2.478864709589243e-06, "loss": 4.7494, "step": 91900 }, { "epoch": 0.75, "grad_norm": 1.369734764099121, "learning_rate": 2.4706806668358037e-06, "loss": 4.7511, "step": 92000 }, { "epoch": 0.75, "grad_norm": 1.1665796041488647, "learning_rate": 2.4624966240823644e-06, "loss": 4.7576, "step": 92100 }, { "epoch": 0.75, "grad_norm": 1.085404396057129, "learning_rate": 2.454312581328925e-06, "loss": 4.7535, "step": 92200 }, { "epoch": 0.76, "grad_norm": 5.764316082000732, "learning_rate": 2.4461285385754858e-06, "loss": 4.7497, "step": 92300 }, { "epoch": 0.76, "grad_norm": 1.3492110967636108, "learning_rate": 2.4379444958220465e-06, "loss": 4.7558, "step": 92400 }, { "epoch": 0.76, "grad_norm": 1.0760524272918701, "learning_rate": 2.429760453068607e-06, "loss": 4.7525, "step": 92500 }, { "epoch": 0.76, "grad_norm": 1.2596811056137085, "learning_rate": 2.4215764103151674e-06, "loss": 4.7532, "step": 92600 }, { "epoch": 0.76, "grad_norm": 1.0836505889892578, "learning_rate": 2.413392367561728e-06, "loss": 4.7506, "step": 92700 }, { "epoch": 0.76, "grad_norm": 2.759760618209839, "learning_rate": 2.405208324808289e-06, "loss": 4.7493, "step": 92800 }, { "epoch": 0.76, "grad_norm": 1.3454488515853882, "learning_rate": 2.39702428205485e-06, "loss": 4.7539, "step": 92900 }, { "epoch": 0.76, "grad_norm": 1.2812906503677368, "learning_rate": 2.3888402393014106e-06, "loss": 4.7509, "step": 93000 }, { "epoch": 0.76, "grad_norm": 1.247383952140808, "learning_rate": 2.380656196547971e-06, "loss": 4.7493, "step": 93100 }, { "epoch": 0.76, "grad_norm": 1.803625226020813, "learning_rate": 2.3724721537945316e-06, "loss": 4.7527, "step": 93200 }, { "epoch": 0.76, "grad_norm": 2.4045066833496094, "learning_rate": 2.3642881110410922e-06, "loss": 4.7578, "step": 93300 }, { "epoch": 0.76, "grad_norm": 2.0578811168670654, "learning_rate": 2.356104068287653e-06, "loss": 4.7539, "step": 93400 }, { "epoch": 0.77, "grad_norm": 2.907444477081299, "learning_rate": 2.3479200255342136e-06, "loss": 4.7534, "step": 93500 }, { "epoch": 0.77, "grad_norm": 1.3155359029769897, "learning_rate": 2.3397359827807743e-06, "loss": 4.7543, "step": 93600 }, { "epoch": 0.77, "grad_norm": 1.2676302194595337, "learning_rate": 2.331551940027335e-06, "loss": 4.7501, "step": 93700 }, { "epoch": 0.77, "grad_norm": 1.1166573762893677, "learning_rate": 2.3233678972738953e-06, "loss": 4.7518, "step": 93800 }, { "epoch": 0.77, "grad_norm": 1.3180181980133057, "learning_rate": 2.315183854520456e-06, "loss": 4.7548, "step": 93900 }, { "epoch": 0.77, "grad_norm": 2.867478132247925, "learning_rate": 2.306999811767017e-06, "loss": 4.7551, "step": 94000 }, { "epoch": 0.77, "grad_norm": 1.3548469543457031, "learning_rate": 2.2988157690135778e-06, "loss": 4.7524, "step": 94100 }, { "epoch": 0.77, "grad_norm": 3.94809627532959, "learning_rate": 2.290631726260138e-06, "loss": 4.7516, "step": 94200 }, { "epoch": 0.77, "grad_norm": 1.0845712423324585, "learning_rate": 2.2824476835066987e-06, "loss": 4.7549, "step": 94300 }, { "epoch": 0.77, "grad_norm": 0.9430265426635742, "learning_rate": 2.2742636407532594e-06, "loss": 4.7552, "step": 94400 }, { "epoch": 0.77, "grad_norm": 1.1491626501083374, "learning_rate": 2.26607959799982e-06, "loss": 4.7533, "step": 94500 }, { "epoch": 0.77, "grad_norm": 1.323564887046814, "learning_rate": 2.257895555246381e-06, "loss": 4.7502, "step": 94600 }, { "epoch": 0.78, "grad_norm": 1.2415287494659424, "learning_rate": 2.2497115124929415e-06, "loss": 4.754, "step": 94700 }, { "epoch": 0.78, "grad_norm": 1.1996134519577026, "learning_rate": 2.241527469739502e-06, "loss": 4.7475, "step": 94800 }, { "epoch": 0.78, "grad_norm": 1.3265007734298706, "learning_rate": 2.2333434269860624e-06, "loss": 4.7504, "step": 94900 }, { "epoch": 0.78, "grad_norm": 2.0656216144561768, "learning_rate": 2.225159384232623e-06, "loss": 4.7536, "step": 95000 }, { "epoch": 0.78, "grad_norm": 1.7077275514602661, "learning_rate": 2.2169753414791843e-06, "loss": 4.7559, "step": 95100 }, { "epoch": 0.78, "grad_norm": 0.9614852070808411, "learning_rate": 2.208791298725745e-06, "loss": 4.753, "step": 95200 }, { "epoch": 0.78, "grad_norm": 1.010793685913086, "learning_rate": 2.2006072559723056e-06, "loss": 4.7531, "step": 95300 }, { "epoch": 0.78, "grad_norm": 1.7269645929336548, "learning_rate": 2.192423213218866e-06, "loss": 4.7525, "step": 95400 }, { "epoch": 0.78, "grad_norm": 1.1839239597320557, "learning_rate": 2.1842391704654266e-06, "loss": 4.7516, "step": 95500 }, { "epoch": 0.78, "grad_norm": 1.0646226406097412, "learning_rate": 2.1760551277119873e-06, "loss": 4.7489, "step": 95600 }, { "epoch": 0.78, "grad_norm": 1.2255668640136719, "learning_rate": 2.167871084958548e-06, "loss": 4.7525, "step": 95700 }, { "epoch": 0.78, "grad_norm": 1.5146337747573853, "learning_rate": 2.1596870422051087e-06, "loss": 4.7524, "step": 95800 }, { "epoch": 0.78, "grad_norm": 2.578728437423706, "learning_rate": 2.1515029994516693e-06, "loss": 4.7537, "step": 95900 }, { "epoch": 0.79, "grad_norm": 1.3910084962844849, "learning_rate": 2.14331895669823e-06, "loss": 4.7557, "step": 96000 }, { "epoch": 0.79, "grad_norm": 1.6304432153701782, "learning_rate": 2.1351349139447903e-06, "loss": 4.7509, "step": 96100 }, { "epoch": 0.79, "grad_norm": 1.6290279626846313, "learning_rate": 2.1269508711913514e-06, "loss": 4.7499, "step": 96200 }, { "epoch": 0.79, "grad_norm": 1.312935471534729, "learning_rate": 2.118766828437912e-06, "loss": 4.7512, "step": 96300 }, { "epoch": 0.79, "grad_norm": 2.8677687644958496, "learning_rate": 2.110582785684473e-06, "loss": 4.7507, "step": 96400 }, { "epoch": 0.79, "grad_norm": 2.544320583343506, "learning_rate": 2.1023987429310335e-06, "loss": 4.7503, "step": 96500 }, { "epoch": 0.79, "grad_norm": 2.5052340030670166, "learning_rate": 2.0942147001775938e-06, "loss": 4.7543, "step": 96600 }, { "epoch": 0.79, "grad_norm": 2.0886638164520264, "learning_rate": 2.0860306574241544e-06, "loss": 4.7513, "step": 96700 }, { "epoch": 0.79, "grad_norm": 1.1290991306304932, "learning_rate": 2.077846614670715e-06, "loss": 4.7486, "step": 96800 }, { "epoch": 0.79, "grad_norm": 0.968976616859436, "learning_rate": 2.069662571917276e-06, "loss": 4.7549, "step": 96900 }, { "epoch": 0.79, "grad_norm": 1.1029621362686157, "learning_rate": 2.0614785291638365e-06, "loss": 4.7545, "step": 97000 }, { "epoch": 0.79, "grad_norm": 1.5654712915420532, "learning_rate": 2.053294486410397e-06, "loss": 4.7516, "step": 97100 }, { "epoch": 0.8, "grad_norm": 1.3423889875411987, "learning_rate": 2.045110443656958e-06, "loss": 4.7529, "step": 97200 }, { "epoch": 0.8, "grad_norm": 1.2194217443466187, "learning_rate": 2.0369264009035186e-06, "loss": 4.7503, "step": 97300 }, { "epoch": 0.8, "grad_norm": 1.0679503679275513, "learning_rate": 2.0287423581500793e-06, "loss": 4.7515, "step": 97400 }, { "epoch": 0.8, "grad_norm": 1.2756659984588623, "learning_rate": 2.02055831539664e-06, "loss": 4.752, "step": 97500 }, { "epoch": 0.8, "grad_norm": 1.315800428390503, "learning_rate": 2.0123742726432007e-06, "loss": 4.7554, "step": 97600 }, { "epoch": 0.8, "grad_norm": 1.2954620122909546, "learning_rate": 2.0041902298897614e-06, "loss": 4.7525, "step": 97700 }, { "epoch": 0.8, "grad_norm": 1.1520215272903442, "learning_rate": 1.9960061871363216e-06, "loss": 4.7548, "step": 97800 }, { "epoch": 0.8, "grad_norm": 1.7471413612365723, "learning_rate": 1.9878221443828823e-06, "loss": 4.7549, "step": 97900 }, { "epoch": 0.8, "grad_norm": 1.0936230421066284, "learning_rate": 1.979638101629443e-06, "loss": 4.753, "step": 98000 }, { "epoch": 0.8, "grad_norm": 1.110677719116211, "learning_rate": 1.9714540588760037e-06, "loss": 4.7524, "step": 98100 }, { "epoch": 0.8, "grad_norm": 1.094068169593811, "learning_rate": 1.9632700161225644e-06, "loss": 4.7508, "step": 98200 }, { "epoch": 0.8, "grad_norm": 1.3435810804367065, "learning_rate": 1.955085973369125e-06, "loss": 4.7504, "step": 98300 }, { "epoch": 0.81, "grad_norm": 1.7671642303466797, "learning_rate": 1.9469019306156858e-06, "loss": 4.7504, "step": 98400 }, { "epoch": 0.81, "grad_norm": 2.0996792316436768, "learning_rate": 1.9387178878622464e-06, "loss": 4.7468, "step": 98500 }, { "epoch": 0.81, "grad_norm": 1.256888508796692, "learning_rate": 1.930533845108807e-06, "loss": 4.7501, "step": 98600 }, { "epoch": 0.81, "grad_norm": 1.4650033712387085, "learning_rate": 1.922349802355368e-06, "loss": 4.7493, "step": 98700 }, { "epoch": 0.81, "grad_norm": 1.3852843046188354, "learning_rate": 1.9141657596019285e-06, "loss": 4.7553, "step": 98800 }, { "epoch": 0.81, "grad_norm": 1.2050074338912964, "learning_rate": 1.905981716848489e-06, "loss": 4.7481, "step": 98900 }, { "epoch": 0.81, "grad_norm": 2.955382823944092, "learning_rate": 1.8977976740950497e-06, "loss": 4.7498, "step": 99000 }, { "epoch": 0.81, "grad_norm": 1.6441978216171265, "learning_rate": 1.8896136313416102e-06, "loss": 4.7498, "step": 99100 }, { "epoch": 0.81, "grad_norm": 1.577948808670044, "learning_rate": 1.8814295885881709e-06, "loss": 4.7552, "step": 99200 }, { "epoch": 0.81, "grad_norm": 1.3677524328231812, "learning_rate": 1.8732455458347315e-06, "loss": 4.7555, "step": 99300 }, { "epoch": 0.81, "grad_norm": 1.4369767904281616, "learning_rate": 1.8650615030812922e-06, "loss": 4.7499, "step": 99400 }, { "epoch": 0.81, "grad_norm": 1.5186824798583984, "learning_rate": 1.8568774603278531e-06, "loss": 4.7539, "step": 99500 }, { "epoch": 0.82, "grad_norm": 1.2422914505004883, "learning_rate": 1.8486934175744136e-06, "loss": 4.7557, "step": 99600 }, { "epoch": 0.82, "grad_norm": 1.3044426441192627, "learning_rate": 1.8405093748209743e-06, "loss": 4.7525, "step": 99700 }, { "epoch": 0.82, "grad_norm": 3.3080742359161377, "learning_rate": 1.832325332067535e-06, "loss": 4.7509, "step": 99800 }, { "epoch": 0.82, "grad_norm": 1.1785410642623901, "learning_rate": 1.8241412893140955e-06, "loss": 4.7519, "step": 99900 }, { "epoch": 0.82, "grad_norm": 1.4587723016738892, "learning_rate": 1.8159572465606562e-06, "loss": 4.7484, "step": 100000 }, { "epoch": 0.82, "grad_norm": 3.0926313400268555, "learning_rate": 1.8077732038072169e-06, "loss": 4.7562, "step": 100100 }, { "epoch": 0.82, "grad_norm": 1.2200719118118286, "learning_rate": 1.7995891610537775e-06, "loss": 4.7584, "step": 100200 }, { "epoch": 0.82, "grad_norm": 1.3386414051055908, "learning_rate": 1.791405118300338e-06, "loss": 4.7458, "step": 100300 }, { "epoch": 0.82, "grad_norm": 1.484997034072876, "learning_rate": 1.7832210755468987e-06, "loss": 4.7449, "step": 100400 }, { "epoch": 0.82, "grad_norm": 2.519181489944458, "learning_rate": 1.7750370327934594e-06, "loss": 4.7529, "step": 100500 }, { "epoch": 0.82, "grad_norm": 2.090131998062134, "learning_rate": 1.7668529900400199e-06, "loss": 4.756, "step": 100600 }, { "epoch": 0.82, "grad_norm": 1.105173110961914, "learning_rate": 1.7586689472865808e-06, "loss": 4.754, "step": 100700 }, { "epoch": 0.82, "grad_norm": 1.1731809377670288, "learning_rate": 1.7504849045331415e-06, "loss": 4.7607, "step": 100800 }, { "epoch": 0.83, "grad_norm": 1.1397889852523804, "learning_rate": 1.7423008617797022e-06, "loss": 4.7506, "step": 100900 }, { "epoch": 0.83, "grad_norm": 1.1067900657653809, "learning_rate": 1.7341168190262629e-06, "loss": 4.7521, "step": 101000 }, { "epoch": 0.83, "grad_norm": 1.0581586360931396, "learning_rate": 1.7259327762728233e-06, "loss": 4.7531, "step": 101100 }, { "epoch": 0.83, "grad_norm": 1.9792087078094482, "learning_rate": 1.717748733519384e-06, "loss": 4.7541, "step": 101200 }, { "epoch": 0.83, "grad_norm": 1.133318305015564, "learning_rate": 1.7095646907659447e-06, "loss": 4.7554, "step": 101300 }, { "epoch": 0.83, "grad_norm": 1.241073489189148, "learning_rate": 1.7013806480125052e-06, "loss": 4.7493, "step": 101400 }, { "epoch": 0.83, "grad_norm": 1.2004437446594238, "learning_rate": 1.6931966052590659e-06, "loss": 4.7515, "step": 101500 }, { "epoch": 0.83, "grad_norm": 1.545440912246704, "learning_rate": 1.6850125625056266e-06, "loss": 4.7498, "step": 101600 }, { "epoch": 0.83, "grad_norm": 1.2501575946807861, "learning_rate": 1.6768285197521873e-06, "loss": 4.7457, "step": 101700 }, { "epoch": 0.83, "grad_norm": 1.2254008054733276, "learning_rate": 1.6686444769987482e-06, "loss": 4.754, "step": 101800 }, { "epoch": 0.83, "grad_norm": 1.8597551584243774, "learning_rate": 1.6604604342453086e-06, "loss": 4.7545, "step": 101900 }, { "epoch": 0.83, "grad_norm": 1.5887017250061035, "learning_rate": 1.6522763914918693e-06, "loss": 4.7491, "step": 102000 }, { "epoch": 0.84, "grad_norm": 1.3773962259292603, "learning_rate": 1.64409234873843e-06, "loss": 4.753, "step": 102100 }, { "epoch": 0.84, "grad_norm": 1.1974895000457764, "learning_rate": 1.6359083059849907e-06, "loss": 4.7563, "step": 102200 }, { "epoch": 0.84, "grad_norm": 1.3141651153564453, "learning_rate": 1.6277242632315512e-06, "loss": 4.7483, "step": 102300 }, { "epoch": 0.84, "grad_norm": 2.256546974182129, "learning_rate": 1.6195402204781119e-06, "loss": 4.747, "step": 102400 }, { "epoch": 0.84, "grad_norm": 2.344313859939575, "learning_rate": 1.6113561777246726e-06, "loss": 4.7458, "step": 102500 }, { "epoch": 0.84, "grad_norm": 1.533346176147461, "learning_rate": 1.603172134971233e-06, "loss": 4.7488, "step": 102600 }, { "epoch": 0.84, "grad_norm": 1.1802254915237427, "learning_rate": 1.5949880922177937e-06, "loss": 4.7503, "step": 102700 }, { "epoch": 0.84, "grad_norm": 1.1822803020477295, "learning_rate": 1.5868040494643544e-06, "loss": 4.7526, "step": 102800 }, { "epoch": 0.84, "grad_norm": 1.3468492031097412, "learning_rate": 1.5786200067109153e-06, "loss": 4.7511, "step": 102900 }, { "epoch": 0.84, "grad_norm": 4.000704765319824, "learning_rate": 1.570435963957476e-06, "loss": 4.754, "step": 103000 }, { "epoch": 0.84, "grad_norm": 1.139367699623108, "learning_rate": 1.5622519212040365e-06, "loss": 4.7488, "step": 103100 }, { "epoch": 0.84, "grad_norm": 4.506742000579834, "learning_rate": 1.5540678784505972e-06, "loss": 4.7518, "step": 103200 }, { "epoch": 0.85, "grad_norm": 1.9105794429779053, "learning_rate": 1.5458838356971579e-06, "loss": 4.751, "step": 103300 }, { "epoch": 0.85, "grad_norm": 1.203366994857788, "learning_rate": 1.5376997929437184e-06, "loss": 4.7505, "step": 103400 }, { "epoch": 0.85, "grad_norm": 1.2069025039672852, "learning_rate": 1.529515750190279e-06, "loss": 4.7502, "step": 103500 }, { "epoch": 0.85, "grad_norm": 1.0046311616897583, "learning_rate": 1.5213317074368397e-06, "loss": 4.7522, "step": 103600 }, { "epoch": 0.85, "grad_norm": 2.022199869155884, "learning_rate": 1.5131476646834004e-06, "loss": 4.7543, "step": 103700 }, { "epoch": 0.85, "grad_norm": 1.0690085887908936, "learning_rate": 1.504963621929961e-06, "loss": 4.7497, "step": 103800 }, { "epoch": 0.85, "grad_norm": 1.2978872060775757, "learning_rate": 1.4967795791765216e-06, "loss": 4.7511, "step": 103900 }, { "epoch": 0.85, "grad_norm": 1.110472321510315, "learning_rate": 1.4885955364230825e-06, "loss": 4.7504, "step": 104000 }, { "epoch": 0.85, "grad_norm": 2.129612684249878, "learning_rate": 1.4804114936696432e-06, "loss": 4.7538, "step": 104100 }, { "epoch": 0.85, "grad_norm": 1.1347908973693848, "learning_rate": 1.4722274509162037e-06, "loss": 4.7535, "step": 104200 }, { "epoch": 0.85, "grad_norm": 1.1420745849609375, "learning_rate": 1.4640434081627644e-06, "loss": 4.7491, "step": 104300 }, { "epoch": 0.85, "grad_norm": 2.80501127243042, "learning_rate": 1.455859365409325e-06, "loss": 4.7507, "step": 104400 }, { "epoch": 0.86, "grad_norm": 1.399776816368103, "learning_rate": 1.4476753226558857e-06, "loss": 4.7523, "step": 104500 }, { "epoch": 0.86, "grad_norm": 1.2114007472991943, "learning_rate": 1.4394912799024462e-06, "loss": 4.7522, "step": 104600 }, { "epoch": 0.86, "grad_norm": 1.078600525856018, "learning_rate": 1.431307237149007e-06, "loss": 4.7535, "step": 104700 }, { "epoch": 0.86, "grad_norm": 1.3322091102600098, "learning_rate": 1.4231231943955676e-06, "loss": 4.7511, "step": 104800 }, { "epoch": 0.86, "grad_norm": 1.2436057329177856, "learning_rate": 1.4149391516421283e-06, "loss": 4.7501, "step": 104900 }, { "epoch": 0.86, "grad_norm": 1.163930058479309, "learning_rate": 1.4067551088886888e-06, "loss": 4.7533, "step": 105000 }, { "epoch": 0.86, "grad_norm": 1.1139936447143555, "learning_rate": 1.3985710661352497e-06, "loss": 4.7474, "step": 105100 }, { "epoch": 0.86, "grad_norm": 1.704499363899231, "learning_rate": 1.3903870233818104e-06, "loss": 4.7524, "step": 105200 }, { "epoch": 0.86, "grad_norm": 1.2708555459976196, "learning_rate": 1.382202980628371e-06, "loss": 4.7558, "step": 105300 }, { "epoch": 0.86, "grad_norm": 2.6546807289123535, "learning_rate": 1.3740189378749315e-06, "loss": 4.7514, "step": 105400 }, { "epoch": 0.86, "grad_norm": 1.196606159210205, "learning_rate": 1.3658348951214922e-06, "loss": 4.7479, "step": 105500 }, { "epoch": 0.86, "grad_norm": 2.2983286380767822, "learning_rate": 1.357650852368053e-06, "loss": 4.7532, "step": 105600 }, { "epoch": 0.87, "grad_norm": 1.0857946872711182, "learning_rate": 1.3494668096146136e-06, "loss": 4.7531, "step": 105700 }, { "epoch": 0.87, "grad_norm": 1.606785535812378, "learning_rate": 1.341282766861174e-06, "loss": 4.7506, "step": 105800 }, { "epoch": 0.87, "grad_norm": 1.9557284116744995, "learning_rate": 1.3330987241077348e-06, "loss": 4.7553, "step": 105900 }, { "epoch": 0.87, "grad_norm": 2.19726824760437, "learning_rate": 1.3249146813542955e-06, "loss": 4.7524, "step": 106000 }, { "epoch": 0.87, "grad_norm": 1.0980935096740723, "learning_rate": 1.316730638600856e-06, "loss": 4.7515, "step": 106100 }, { "epoch": 0.87, "grad_norm": 1.3451943397521973, "learning_rate": 1.3085465958474168e-06, "loss": 4.7547, "step": 106200 }, { "epoch": 0.87, "grad_norm": 1.2886918783187866, "learning_rate": 1.3003625530939775e-06, "loss": 4.7505, "step": 106300 }, { "epoch": 0.87, "grad_norm": 1.1479195356369019, "learning_rate": 1.2921785103405382e-06, "loss": 4.7538, "step": 106400 }, { "epoch": 0.87, "grad_norm": 1.7975718975067139, "learning_rate": 1.283994467587099e-06, "loss": 4.7472, "step": 106500 }, { "epoch": 0.87, "grad_norm": 1.536818027496338, "learning_rate": 1.2758104248336594e-06, "loss": 4.7498, "step": 106600 }, { "epoch": 0.87, "grad_norm": 1.1835073232650757, "learning_rate": 1.26762638208022e-06, "loss": 4.751, "step": 106700 }, { "epoch": 0.87, "grad_norm": 1.418748378753662, "learning_rate": 1.2594423393267808e-06, "loss": 4.7506, "step": 106800 }, { "epoch": 0.87, "grad_norm": 1.7083241939544678, "learning_rate": 1.2512582965733413e-06, "loss": 4.757, "step": 106900 }, { "epoch": 0.88, "grad_norm": 1.9533259868621826, "learning_rate": 1.243074253819902e-06, "loss": 4.7491, "step": 107000 }, { "epoch": 0.88, "grad_norm": 1.549060344696045, "learning_rate": 1.2348902110664629e-06, "loss": 4.7502, "step": 107100 }, { "epoch": 0.88, "grad_norm": 1.22414231300354, "learning_rate": 1.2267061683130233e-06, "loss": 4.7512, "step": 107200 }, { "epoch": 0.88, "grad_norm": 2.5019216537475586, "learning_rate": 1.218522125559584e-06, "loss": 4.7551, "step": 107300 }, { "epoch": 0.88, "grad_norm": 1.4612125158309937, "learning_rate": 1.2103380828061447e-06, "loss": 4.7517, "step": 107400 }, { "epoch": 0.88, "grad_norm": 1.41428542137146, "learning_rate": 1.2021540400527052e-06, "loss": 4.7522, "step": 107500 }, { "epoch": 0.88, "grad_norm": 1.1158181428909302, "learning_rate": 1.193969997299266e-06, "loss": 4.7534, "step": 107600 }, { "epoch": 0.88, "grad_norm": 1.4456719160079956, "learning_rate": 1.1857859545458268e-06, "loss": 4.7515, "step": 107700 }, { "epoch": 0.88, "grad_norm": 2.0484957695007324, "learning_rate": 1.1776019117923873e-06, "loss": 4.7475, "step": 107800 }, { "epoch": 0.88, "grad_norm": 1.0839197635650635, "learning_rate": 1.169417869038948e-06, "loss": 4.7474, "step": 107900 }, { "epoch": 0.88, "grad_norm": 1.4242494106292725, "learning_rate": 1.1612338262855086e-06, "loss": 4.7516, "step": 108000 }, { "epoch": 0.88, "grad_norm": 1.1142181158065796, "learning_rate": 1.1530497835320691e-06, "loss": 4.7513, "step": 108100 }, { "epoch": 0.89, "grad_norm": 1.2992855310440063, "learning_rate": 1.14486574077863e-06, "loss": 4.7497, "step": 108200 }, { "epoch": 0.89, "grad_norm": 1.1050403118133545, "learning_rate": 1.1366816980251905e-06, "loss": 4.7478, "step": 108300 }, { "epoch": 0.89, "grad_norm": 1.6111624240875244, "learning_rate": 1.1284976552717512e-06, "loss": 4.7521, "step": 108400 }, { "epoch": 0.89, "grad_norm": 1.6379482746124268, "learning_rate": 1.1203136125183119e-06, "loss": 4.7456, "step": 108500 }, { "epoch": 0.89, "grad_norm": 1.8351396322250366, "learning_rate": 1.1121295697648726e-06, "loss": 4.7518, "step": 108600 }, { "epoch": 0.89, "grad_norm": 1.1721076965332031, "learning_rate": 1.1039455270114333e-06, "loss": 4.7557, "step": 108700 }, { "epoch": 0.89, "grad_norm": 1.4993492364883423, "learning_rate": 1.095761484257994e-06, "loss": 4.7519, "step": 108800 }, { "epoch": 0.89, "grad_norm": 1.1917214393615723, "learning_rate": 1.0875774415045544e-06, "loss": 4.748, "step": 108900 }, { "epoch": 0.89, "grad_norm": 1.0404828786849976, "learning_rate": 1.0793933987511151e-06, "loss": 4.7565, "step": 109000 }, { "epoch": 0.89, "grad_norm": 1.5994240045547485, "learning_rate": 1.0712093559976758e-06, "loss": 4.7499, "step": 109100 }, { "epoch": 0.89, "grad_norm": 1.197583556175232, "learning_rate": 1.0630253132442365e-06, "loss": 4.7537, "step": 109200 }, { "epoch": 0.89, "grad_norm": 1.6032483577728271, "learning_rate": 1.0548412704907972e-06, "loss": 4.7542, "step": 109300 }, { "epoch": 0.9, "grad_norm": 1.39584481716156, "learning_rate": 1.0466572277373579e-06, "loss": 4.7493, "step": 109400 }, { "epoch": 0.9, "grad_norm": 1.410801649093628, "learning_rate": 1.0384731849839184e-06, "loss": 4.7559, "step": 109500 }, { "epoch": 0.9, "grad_norm": 1.246910810470581, "learning_rate": 1.030289142230479e-06, "loss": 4.751, "step": 109600 }, { "epoch": 0.9, "grad_norm": 4.328908920288086, "learning_rate": 1.0221050994770397e-06, "loss": 4.7496, "step": 109700 }, { "epoch": 0.9, "grad_norm": 1.5280972719192505, "learning_rate": 1.0139210567236004e-06, "loss": 4.7539, "step": 109800 }, { "epoch": 0.9, "grad_norm": 2.1216630935668945, "learning_rate": 1.0057370139701611e-06, "loss": 4.7523, "step": 109900 }, { "epoch": 0.9, "grad_norm": 1.4128057956695557, "learning_rate": 9.975529712167218e-07, "loss": 4.7492, "step": 110000 }, { "epoch": 0.9, "grad_norm": 1.2564375400543213, "learning_rate": 9.893689284632823e-07, "loss": 4.754, "step": 110100 }, { "epoch": 0.9, "grad_norm": 2.3144404888153076, "learning_rate": 9.81184885709843e-07, "loss": 4.7473, "step": 110200 }, { "epoch": 0.9, "grad_norm": 0.9776962399482727, "learning_rate": 9.730008429564037e-07, "loss": 4.7479, "step": 110300 }, { "epoch": 0.9, "grad_norm": 2.479701519012451, "learning_rate": 9.648168002029644e-07, "loss": 4.7514, "step": 110400 }, { "epoch": 0.9, "grad_norm": 3.217172145843506, "learning_rate": 9.56632757449525e-07, "loss": 4.7535, "step": 110500 }, { "epoch": 0.91, "grad_norm": 1.103346824645996, "learning_rate": 9.484487146960856e-07, "loss": 4.7474, "step": 110600 }, { "epoch": 0.91, "grad_norm": 1.1965771913528442, "learning_rate": 9.402646719426463e-07, "loss": 4.7475, "step": 110700 }, { "epoch": 0.91, "grad_norm": 1.2940635681152344, "learning_rate": 9.320806291892069e-07, "loss": 4.7484, "step": 110800 }, { "epoch": 0.91, "grad_norm": 1.06132972240448, "learning_rate": 9.238965864357677e-07, "loss": 4.7547, "step": 110900 }, { "epoch": 0.91, "grad_norm": 1.8715641498565674, "learning_rate": 9.157125436823283e-07, "loss": 4.749, "step": 111000 }, { "epoch": 0.91, "grad_norm": 1.1907116174697876, "learning_rate": 9.07528500928889e-07, "loss": 4.7539, "step": 111100 }, { "epoch": 0.91, "grad_norm": 1.5867308378219604, "learning_rate": 8.993444581754496e-07, "loss": 4.7562, "step": 111200 }, { "epoch": 0.91, "grad_norm": 1.2849870920181274, "learning_rate": 8.911604154220103e-07, "loss": 4.7512, "step": 111300 }, { "epoch": 0.91, "grad_norm": 1.3407094478607178, "learning_rate": 8.829763726685708e-07, "loss": 4.7543, "step": 111400 }, { "epoch": 0.91, "grad_norm": 1.0691750049591064, "learning_rate": 8.747923299151316e-07, "loss": 4.7451, "step": 111500 }, { "epoch": 0.91, "grad_norm": 1.0635693073272705, "learning_rate": 8.666082871616922e-07, "loss": 4.7521, "step": 111600 }, { "epoch": 0.91, "grad_norm": 1.5273666381835938, "learning_rate": 8.584242444082529e-07, "loss": 4.7551, "step": 111700 }, { "epoch": 0.91, "grad_norm": 1.7429158687591553, "learning_rate": 8.502402016548135e-07, "loss": 4.7544, "step": 111800 }, { "epoch": 0.92, "grad_norm": 1.0581636428833008, "learning_rate": 8.420561589013741e-07, "loss": 4.7532, "step": 111900 }, { "epoch": 0.92, "grad_norm": 1.3187443017959595, "learning_rate": 8.338721161479348e-07, "loss": 4.7551, "step": 112000 }, { "epoch": 0.92, "grad_norm": 1.2842453718185425, "learning_rate": 8.256880733944956e-07, "loss": 4.7482, "step": 112100 }, { "epoch": 0.92, "grad_norm": 1.264115571975708, "learning_rate": 8.175040306410561e-07, "loss": 4.7475, "step": 112200 }, { "epoch": 0.92, "grad_norm": 1.338619589805603, "learning_rate": 8.093199878876168e-07, "loss": 4.7465, "step": 112300 }, { "epoch": 0.92, "grad_norm": 1.2081668376922607, "learning_rate": 8.011359451341774e-07, "loss": 4.75, "step": 112400 }, { "epoch": 0.92, "grad_norm": 1.475716471672058, "learning_rate": 7.92951902380738e-07, "loss": 4.7475, "step": 112500 }, { "epoch": 0.92, "grad_norm": 1.1391123533248901, "learning_rate": 7.847678596272988e-07, "loss": 4.7529, "step": 112600 }, { "epoch": 0.92, "grad_norm": 1.5139949321746826, "learning_rate": 7.765838168738595e-07, "loss": 4.7457, "step": 112700 }, { "epoch": 0.92, "grad_norm": 1.250877857208252, "learning_rate": 7.683997741204201e-07, "loss": 4.7512, "step": 112800 }, { "epoch": 0.92, "grad_norm": 1.3660330772399902, "learning_rate": 7.602157313669807e-07, "loss": 4.7519, "step": 112900 }, { "epoch": 0.92, "grad_norm": 1.3007818460464478, "learning_rate": 7.520316886135414e-07, "loss": 4.7539, "step": 113000 }, { "epoch": 0.93, "grad_norm": 1.1533290147781372, "learning_rate": 7.438476458601019e-07, "loss": 4.7522, "step": 113100 }, { "epoch": 0.93, "grad_norm": 1.1087945699691772, "learning_rate": 7.356636031066627e-07, "loss": 4.7549, "step": 113200 }, { "epoch": 0.93, "grad_norm": 1.3991641998291016, "learning_rate": 7.274795603532233e-07, "loss": 4.7538, "step": 113300 }, { "epoch": 0.93, "grad_norm": 1.820162296295166, "learning_rate": 7.19295517599784e-07, "loss": 4.7521, "step": 113400 }, { "epoch": 0.93, "grad_norm": 1.1187483072280884, "learning_rate": 7.111114748463446e-07, "loss": 4.7577, "step": 113500 }, { "epoch": 0.93, "grad_norm": 1.0411303043365479, "learning_rate": 7.029274320929053e-07, "loss": 4.7514, "step": 113600 }, { "epoch": 0.93, "grad_norm": 1.2369052171707153, "learning_rate": 6.947433893394661e-07, "loss": 4.7497, "step": 113700 }, { "epoch": 0.93, "grad_norm": 1.505008578300476, "learning_rate": 6.865593465860267e-07, "loss": 4.7505, "step": 113800 }, { "epoch": 0.93, "grad_norm": 1.2643870115280151, "learning_rate": 6.783753038325872e-07, "loss": 4.7499, "step": 113900 }, { "epoch": 0.93, "grad_norm": 1.095914363861084, "learning_rate": 6.701912610791479e-07, "loss": 4.7478, "step": 114000 }, { "epoch": 0.93, "grad_norm": 1.4800920486450195, "learning_rate": 6.620072183257085e-07, "loss": 4.748, "step": 114100 }, { "epoch": 0.93, "grad_norm": 1.6267393827438354, "learning_rate": 6.538231755722692e-07, "loss": 4.7523, "step": 114200 }, { "epoch": 0.94, "grad_norm": 1.7788121700286865, "learning_rate": 6.456391328188299e-07, "loss": 4.7548, "step": 114300 }, { "epoch": 0.94, "grad_norm": 1.2876346111297607, "learning_rate": 6.374550900653906e-07, "loss": 4.7555, "step": 114400 }, { "epoch": 0.94, "grad_norm": 1.0011918544769287, "learning_rate": 6.292710473119512e-07, "loss": 4.7487, "step": 114500 }, { "epoch": 0.94, "grad_norm": 2.096606731414795, "learning_rate": 6.210870045585119e-07, "loss": 4.7527, "step": 114600 }, { "epoch": 0.94, "grad_norm": 1.2112175226211548, "learning_rate": 6.129029618050726e-07, "loss": 4.7482, "step": 114700 }, { "epoch": 0.94, "grad_norm": 1.4160529375076294, "learning_rate": 6.047189190516331e-07, "loss": 4.7567, "step": 114800 }, { "epoch": 0.94, "grad_norm": 1.36955988407135, "learning_rate": 5.965348762981938e-07, "loss": 4.7562, "step": 114900 }, { "epoch": 0.94, "grad_norm": 1.577446460723877, "learning_rate": 5.883508335447545e-07, "loss": 4.7534, "step": 115000 }, { "epoch": 0.94, "grad_norm": 1.2918033599853516, "learning_rate": 5.801667907913151e-07, "loss": 4.7472, "step": 115100 }, { "epoch": 0.94, "grad_norm": 1.07747220993042, "learning_rate": 5.719827480378758e-07, "loss": 4.7495, "step": 115200 }, { "epoch": 0.94, "grad_norm": 1.4656786918640137, "learning_rate": 5.637987052844365e-07, "loss": 4.7501, "step": 115300 }, { "epoch": 0.94, "grad_norm": 1.2017009258270264, "learning_rate": 5.556146625309971e-07, "loss": 4.7482, "step": 115400 }, { "epoch": 0.95, "grad_norm": 1.3578641414642334, "learning_rate": 5.474306197775578e-07, "loss": 4.7467, "step": 115500 }, { "epoch": 0.95, "grad_norm": 1.246904969215393, "learning_rate": 5.392465770241185e-07, "loss": 4.7564, "step": 115600 }, { "epoch": 0.95, "grad_norm": 1.289212942123413, "learning_rate": 5.31062534270679e-07, "loss": 4.7516, "step": 115700 }, { "epoch": 0.95, "grad_norm": 2.1466667652130127, "learning_rate": 5.228784915172397e-07, "loss": 4.7485, "step": 115800 }, { "epoch": 0.95, "grad_norm": 1.6222789287567139, "learning_rate": 5.146944487638004e-07, "loss": 4.7535, "step": 115900 }, { "epoch": 0.95, "grad_norm": 1.2257126569747925, "learning_rate": 5.06510406010361e-07, "loss": 4.7537, "step": 116000 }, { "epoch": 0.95, "grad_norm": 1.0743142366409302, "learning_rate": 4.983263632569217e-07, "loss": 4.7468, "step": 116100 }, { "epoch": 0.95, "grad_norm": 1.4326200485229492, "learning_rate": 4.901423205034824e-07, "loss": 4.7537, "step": 116200 }, { "epoch": 0.95, "grad_norm": 1.513900876045227, "learning_rate": 4.81958277750043e-07, "loss": 4.7492, "step": 116300 }, { "epoch": 0.95, "grad_norm": 1.3525514602661133, "learning_rate": 4.7377423499660366e-07, "loss": 4.7511, "step": 116400 }, { "epoch": 0.95, "grad_norm": 1.0643374919891357, "learning_rate": 4.655901922431643e-07, "loss": 4.756, "step": 116500 }, { "epoch": 0.95, "grad_norm": 1.0973927974700928, "learning_rate": 4.57406149489725e-07, "loss": 4.7512, "step": 116600 }, { "epoch": 0.96, "grad_norm": 1.498604416847229, "learning_rate": 4.492221067362856e-07, "loss": 4.7534, "step": 116700 }, { "epoch": 0.96, "grad_norm": 2.6713321208953857, "learning_rate": 4.4103806398284626e-07, "loss": 4.7497, "step": 116800 }, { "epoch": 0.96, "grad_norm": 1.2505509853363037, "learning_rate": 4.3285402122940695e-07, "loss": 4.7551, "step": 116900 }, { "epoch": 0.96, "grad_norm": 1.1062592267990112, "learning_rate": 4.246699784759676e-07, "loss": 4.7536, "step": 117000 }, { "epoch": 0.96, "grad_norm": 1.4121407270431519, "learning_rate": 4.1648593572252823e-07, "loss": 4.7548, "step": 117100 }, { "epoch": 0.96, "grad_norm": 1.2598930597305298, "learning_rate": 4.083018929690889e-07, "loss": 4.7559, "step": 117200 }, { "epoch": 0.96, "grad_norm": 1.1028268337249756, "learning_rate": 4.0011785021564955e-07, "loss": 4.7495, "step": 117300 }, { "epoch": 0.96, "grad_norm": 1.5532176494598389, "learning_rate": 3.919338074622102e-07, "loss": 4.7543, "step": 117400 }, { "epoch": 0.96, "grad_norm": 1.2546781301498413, "learning_rate": 3.837497647087709e-07, "loss": 4.7502, "step": 117500 }, { "epoch": 0.96, "grad_norm": 1.0731525421142578, "learning_rate": 3.755657219553315e-07, "loss": 4.7519, "step": 117600 }, { "epoch": 0.96, "grad_norm": 1.3289110660552979, "learning_rate": 3.6738167920189216e-07, "loss": 4.7495, "step": 117700 }, { "epoch": 0.96, "grad_norm": 1.2810921669006348, "learning_rate": 3.5919763644845285e-07, "loss": 4.7522, "step": 117800 }, { "epoch": 0.96, "grad_norm": 1.0300296545028687, "learning_rate": 3.510135936950135e-07, "loss": 4.7491, "step": 117900 }, { "epoch": 0.97, "grad_norm": 1.9749176502227783, "learning_rate": 3.428295509415742e-07, "loss": 4.7558, "step": 118000 }, { "epoch": 0.97, "grad_norm": 1.6398601531982422, "learning_rate": 3.346455081881348e-07, "loss": 4.7489, "step": 118100 }, { "epoch": 0.97, "grad_norm": 1.154733657836914, "learning_rate": 3.2646146543469545e-07, "loss": 4.747, "step": 118200 }, { "epoch": 0.97, "grad_norm": 1.1068767309188843, "learning_rate": 3.1827742268125614e-07, "loss": 4.7532, "step": 118300 }, { "epoch": 0.97, "grad_norm": 1.5782713890075684, "learning_rate": 3.100933799278168e-07, "loss": 4.7525, "step": 118400 }, { "epoch": 0.97, "grad_norm": 1.2185344696044922, "learning_rate": 3.019093371743774e-07, "loss": 4.7507, "step": 118500 }, { "epoch": 0.97, "grad_norm": 1.168750286102295, "learning_rate": 2.9372529442093805e-07, "loss": 4.7503, "step": 118600 }, { "epoch": 0.97, "grad_norm": 1.3794721364974976, "learning_rate": 2.8554125166749874e-07, "loss": 4.7539, "step": 118700 }, { "epoch": 0.97, "grad_norm": 1.0481081008911133, "learning_rate": 2.773572089140594e-07, "loss": 4.7493, "step": 118800 }, { "epoch": 0.97, "grad_norm": 1.1850849390029907, "learning_rate": 2.6917316616062007e-07, "loss": 4.752, "step": 118900 }, { "epoch": 0.97, "grad_norm": 1.3937416076660156, "learning_rate": 2.609891234071807e-07, "loss": 4.7518, "step": 119000 }, { "epoch": 0.97, "grad_norm": 1.0362590551376343, "learning_rate": 2.5280508065374134e-07, "loss": 4.7494, "step": 119100 }, { "epoch": 0.98, "grad_norm": 1.226879358291626, "learning_rate": 2.4462103790030203e-07, "loss": 4.7541, "step": 119200 }, { "epoch": 0.98, "grad_norm": 1.289158582687378, "learning_rate": 2.3643699514686265e-07, "loss": 4.7541, "step": 119300 }, { "epoch": 0.98, "grad_norm": 1.2052675485610962, "learning_rate": 2.282529523934233e-07, "loss": 4.7494, "step": 119400 }, { "epoch": 0.98, "grad_norm": 1.3836339712142944, "learning_rate": 2.2006890963998397e-07, "loss": 4.7501, "step": 119500 }, { "epoch": 0.98, "grad_norm": 1.075812578201294, "learning_rate": 2.1188486688654464e-07, "loss": 4.7466, "step": 119600 }, { "epoch": 0.98, "grad_norm": 1.0450024604797363, "learning_rate": 2.0370082413310527e-07, "loss": 4.7513, "step": 119700 }, { "epoch": 0.98, "grad_norm": 1.6995435953140259, "learning_rate": 1.9551678137966594e-07, "loss": 4.7544, "step": 119800 }, { "epoch": 0.98, "grad_norm": 2.0586297512054443, "learning_rate": 1.873327386262266e-07, "loss": 4.7512, "step": 119900 }, { "epoch": 0.98, "grad_norm": 1.734843134880066, "learning_rate": 1.7914869587278727e-07, "loss": 4.752, "step": 120000 }, { "epoch": 0.98, "grad_norm": 1.8307639360427856, "learning_rate": 1.709646531193479e-07, "loss": 4.7486, "step": 120100 }, { "epoch": 0.98, "grad_norm": 1.0754849910736084, "learning_rate": 1.6278061036590857e-07, "loss": 4.7518, "step": 120200 }, { "epoch": 0.98, "grad_norm": 1.8558402061462402, "learning_rate": 1.545965676124692e-07, "loss": 4.7496, "step": 120300 }, { "epoch": 0.99, "grad_norm": 1.3178366422653198, "learning_rate": 1.4641252485902987e-07, "loss": 4.7474, "step": 120400 }, { "epoch": 0.99, "grad_norm": 1.176018238067627, "learning_rate": 1.3822848210559053e-07, "loss": 4.7517, "step": 120500 }, { "epoch": 0.99, "grad_norm": 1.0331361293792725, "learning_rate": 1.300444393521512e-07, "loss": 4.7445, "step": 120600 }, { "epoch": 0.99, "grad_norm": 1.2724260091781616, "learning_rate": 1.2186039659871183e-07, "loss": 4.7519, "step": 120700 }, { "epoch": 0.99, "grad_norm": 1.1402854919433594, "learning_rate": 1.1367635384527251e-07, "loss": 4.7547, "step": 120800 }, { "epoch": 0.99, "grad_norm": 1.6155171394348145, "learning_rate": 1.0549231109183315e-07, "loss": 4.7553, "step": 120900 }, { "epoch": 0.99, "grad_norm": 1.3488671779632568, "learning_rate": 9.73082683383938e-08, "loss": 4.7489, "step": 121000 }, { "epoch": 0.99, "grad_norm": 1.4476072788238525, "learning_rate": 8.912422558495446e-08, "loss": 4.7531, "step": 121100 }, { "epoch": 0.99, "grad_norm": 1.5881561040878296, "learning_rate": 8.094018283151511e-08, "loss": 4.7472, "step": 121200 }, { "epoch": 0.99, "grad_norm": 1.3526599407196045, "learning_rate": 7.275614007807578e-08, "loss": 4.7507, "step": 121300 }, { "epoch": 0.99, "grad_norm": 1.1709797382354736, "learning_rate": 6.457209732463643e-08, "loss": 4.7495, "step": 121400 }, { "epoch": 0.99, "grad_norm": 1.1517142057418823, "learning_rate": 5.6388054571197084e-08, "loss": 4.7534, "step": 121500 }, { "epoch": 1.0, "grad_norm": 1.6636133193969727, "learning_rate": 4.820401181775774e-08, "loss": 4.7524, "step": 121600 }, { "epoch": 1.0, "grad_norm": 1.1265342235565186, "learning_rate": 4.001996906431839e-08, "loss": 4.749, "step": 121700 }, { "epoch": 1.0, "grad_norm": 1.4785326719284058, "learning_rate": 3.183592631087905e-08, "loss": 4.7507, "step": 121800 }, { "epoch": 1.0, "grad_norm": 3.0491678714752197, "learning_rate": 2.3651883557439706e-08, "loss": 4.7502, "step": 121900 }, { "epoch": 1.0, "grad_norm": 1.162163257598877, "learning_rate": 1.5467840804000363e-08, "loss": 4.7493, "step": 122000 } ], "logging_steps": 100, "max_steps": 122189, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.4891090143551898e+18, "train_batch_size": 96, "trial_name": null, "trial_params": null }