{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6124837451235372, "eval_steps": 500, "global_step": 217000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000743080066877206, "grad_norm": 53.60346603393555, "learning_rate": 3.715262297518205e-07, "loss": 0.8923, "step": 100 }, { "epoch": 0.001486160133754412, "grad_norm": 4.683310508728027, "learning_rate": 7.43052459503641e-07, "loss": 0.2479, "step": 200 }, { "epoch": 0.002229240200631618, "grad_norm": 2.3936846256256104, "learning_rate": 1.1145786892554615e-06, "loss": 0.1825, "step": 300 }, { "epoch": 0.002972320267508824, "grad_norm": 2.798039436340332, "learning_rate": 1.486104919007282e-06, "loss": 0.1833, "step": 400 }, { "epoch": 0.00371540033438603, "grad_norm": 2.2894914150238037, "learning_rate": 1.8576311487591026e-06, "loss": 0.1668, "step": 500 }, { "epoch": 0.004458480401263236, "grad_norm": 6.374472141265869, "learning_rate": 2.229157378510923e-06, "loss": 0.1465, "step": 600 }, { "epoch": 0.005201560468140442, "grad_norm": 4.546179294586182, "learning_rate": 2.6006836082627435e-06, "loss": 0.1476, "step": 700 }, { "epoch": 0.005944640535017648, "grad_norm": 13.52912712097168, "learning_rate": 2.972209838014564e-06, "loss": 0.1603, "step": 800 }, { "epoch": 0.0066877206018948546, "grad_norm": 2.7029688358306885, "learning_rate": 3.3437360677663843e-06, "loss": 0.1444, "step": 900 }, { "epoch": 0.00743080066877206, "grad_norm": 3.5417072772979736, "learning_rate": 3.715262297518205e-06, "loss": 0.1311, "step": 1000 }, { "epoch": 0.008173880735649267, "grad_norm": 8.042838096618652, "learning_rate": 4.086788527270026e-06, "loss": 0.1546, "step": 1100 }, { "epoch": 0.008916960802526472, "grad_norm": 2.0573182106018066, "learning_rate": 4.458314757021846e-06, "loss": 0.1166, "step": 1200 }, { "epoch": 0.009660040869403677, "grad_norm": 3.3977742195129395, "learning_rate": 4.8298409867736665e-06, "loss": 0.1363, "step": 1300 }, { "epoch": 0.010403120936280884, "grad_norm": 2.2985846996307373, "learning_rate": 5.201367216525487e-06, "loss": 0.1261, "step": 1400 }, { "epoch": 0.01114620100315809, "grad_norm": 1.9680628776550293, "learning_rate": 5.572893446277307e-06, "loss": 0.118, "step": 1500 }, { "epoch": 0.011889281070035297, "grad_norm": 1.7824922800064087, "learning_rate": 5.944419676029128e-06, "loss": 0.123, "step": 1600 }, { "epoch": 0.012632361136912502, "grad_norm": 2.1391842365264893, "learning_rate": 6.315945905780948e-06, "loss": 0.1247, "step": 1700 }, { "epoch": 0.013375441203789709, "grad_norm": 1.4350066184997559, "learning_rate": 6.687472135532769e-06, "loss": 0.1153, "step": 1800 }, { "epoch": 0.014118521270666914, "grad_norm": 9.50867748260498, "learning_rate": 7.05899836528459e-06, "loss": 0.1315, "step": 1900 }, { "epoch": 0.01486160133754412, "grad_norm": 4.24827241897583, "learning_rate": 7.43052459503641e-06, "loss": 0.1064, "step": 2000 }, { "epoch": 0.015604681404421327, "grad_norm": 9.737875938415527, "learning_rate": 7.80205082478823e-06, "loss": 0.1121, "step": 2100 }, { "epoch": 0.016347761471298534, "grad_norm": 1.2200586795806885, "learning_rate": 8.173577054540051e-06, "loss": 0.1135, "step": 2200 }, { "epoch": 0.01709084153817574, "grad_norm": 1.4241139888763428, "learning_rate": 8.54510328429187e-06, "loss": 0.1179, "step": 2300 }, { "epoch": 0.017833921605052944, "grad_norm": 7.41565465927124, "learning_rate": 8.916629514043692e-06, "loss": 0.1022, "step": 2400 }, { "epoch": 0.01857700167193015, "grad_norm": 2.8896067142486572, "learning_rate": 9.288155743795512e-06, "loss": 0.0938, "step": 2500 }, { "epoch": 0.019320081738807355, "grad_norm": 1.3710119724273682, "learning_rate": 9.659681973547333e-06, "loss": 0.0969, "step": 2600 }, { "epoch": 0.020063161805684564, "grad_norm": 1.5391430854797363, "learning_rate": 1.0031208203299153e-05, "loss": 0.1133, "step": 2700 }, { "epoch": 0.02080624187256177, "grad_norm": 1.1579798460006714, "learning_rate": 1.0402734433050974e-05, "loss": 0.1121, "step": 2800 }, { "epoch": 0.021549321939438974, "grad_norm": 0.845294713973999, "learning_rate": 1.0774260662802795e-05, "loss": 0.1036, "step": 2900 }, { "epoch": 0.02229240200631618, "grad_norm": 1.0687625408172607, "learning_rate": 1.1145786892554615e-05, "loss": 0.0978, "step": 3000 }, { "epoch": 0.023035482073193385, "grad_norm": 1.5421385765075684, "learning_rate": 1.1517313122306434e-05, "loss": 0.102, "step": 3100 }, { "epoch": 0.023778562140070594, "grad_norm": 1.586124300956726, "learning_rate": 1.1888839352058256e-05, "loss": 0.1065, "step": 3200 }, { "epoch": 0.0245216422069478, "grad_norm": 6.6496381759643555, "learning_rate": 1.2260365581810077e-05, "loss": 0.0957, "step": 3300 }, { "epoch": 0.025264722273825004, "grad_norm": 1.0494637489318848, "learning_rate": 1.2631891811561896e-05, "loss": 0.0956, "step": 3400 }, { "epoch": 0.02600780234070221, "grad_norm": 23.01812744140625, "learning_rate": 1.3003418041313718e-05, "loss": 0.0967, "step": 3500 }, { "epoch": 0.026750882407579418, "grad_norm": 1.8499131202697754, "learning_rate": 1.3374944271065537e-05, "loss": 0.092, "step": 3600 }, { "epoch": 0.027493962474456624, "grad_norm": 1.666256308555603, "learning_rate": 1.3746470500817357e-05, "loss": 0.0922, "step": 3700 }, { "epoch": 0.02823704254133383, "grad_norm": 1.540336012840271, "learning_rate": 1.411799673056918e-05, "loss": 0.1136, "step": 3800 }, { "epoch": 0.028980122608211034, "grad_norm": 1.0387918949127197, "learning_rate": 1.4489522960321001e-05, "loss": 0.0891, "step": 3900 }, { "epoch": 0.02972320267508824, "grad_norm": 3.97080659866333, "learning_rate": 1.486104919007282e-05, "loss": 0.113, "step": 4000 }, { "epoch": 0.030466282741965448, "grad_norm": 1.724788784980774, "learning_rate": 1.523257541982464e-05, "loss": 0.0968, "step": 4100 }, { "epoch": 0.031209362808842653, "grad_norm": 1.169791340827942, "learning_rate": 1.560410164957646e-05, "loss": 0.1018, "step": 4200 }, { "epoch": 0.03195244287571986, "grad_norm": 1.2633169889450073, "learning_rate": 1.597562787932828e-05, "loss": 0.1015, "step": 4300 }, { "epoch": 0.03269552294259707, "grad_norm": 1.881282925605774, "learning_rate": 1.6347154109080102e-05, "loss": 0.094, "step": 4400 }, { "epoch": 0.03343860300947427, "grad_norm": 2.1061973571777344, "learning_rate": 1.671868033883192e-05, "loss": 0.0917, "step": 4500 }, { "epoch": 0.03418168307635148, "grad_norm": 0.998253583908081, "learning_rate": 1.709020656858374e-05, "loss": 0.0815, "step": 4600 }, { "epoch": 0.03492476314322868, "grad_norm": 1.806101679801941, "learning_rate": 1.7461732798335563e-05, "loss": 0.0926, "step": 4700 }, { "epoch": 0.03566784321010589, "grad_norm": 3.480004072189331, "learning_rate": 1.7833259028087384e-05, "loss": 0.0936, "step": 4800 }, { "epoch": 0.036410923276983094, "grad_norm": 2.259936809539795, "learning_rate": 1.8204785257839205e-05, "loss": 0.0931, "step": 4900 }, { "epoch": 0.0371540033438603, "grad_norm": 0.9388754963874817, "learning_rate": 1.8576311487591023e-05, "loss": 0.0738, "step": 5000 }, { "epoch": 0.037897083410737505, "grad_norm": 1.2603083848953247, "learning_rate": 1.8947837717342845e-05, "loss": 0.08, "step": 5100 }, { "epoch": 0.03864016347761471, "grad_norm": 1.3742884397506714, "learning_rate": 1.9319363947094666e-05, "loss": 0.0849, "step": 5200 }, { "epoch": 0.03938324354449192, "grad_norm": 1.1790800094604492, "learning_rate": 1.9690890176846484e-05, "loss": 0.0854, "step": 5300 }, { "epoch": 0.04012632361136913, "grad_norm": 0.9031810164451599, "learning_rate": 2.0062416406598305e-05, "loss": 0.0823, "step": 5400 }, { "epoch": 0.04086940367824633, "grad_norm": 1.4861712455749512, "learning_rate": 2.043394263635013e-05, "loss": 0.0711, "step": 5500 }, { "epoch": 0.04161248374512354, "grad_norm": 1.0741400718688965, "learning_rate": 2.0805468866101948e-05, "loss": 0.0922, "step": 5600 }, { "epoch": 0.04235556381200074, "grad_norm": 0.9590131044387817, "learning_rate": 2.117699509585377e-05, "loss": 0.0837, "step": 5700 }, { "epoch": 0.04309864387887795, "grad_norm": 2.766433000564575, "learning_rate": 2.154852132560559e-05, "loss": 0.0758, "step": 5800 }, { "epoch": 0.043841723945755154, "grad_norm": 7.11095666885376, "learning_rate": 2.1920047555357408e-05, "loss": 0.0765, "step": 5900 }, { "epoch": 0.04458480401263236, "grad_norm": 0.8902910947799683, "learning_rate": 2.229157378510923e-05, "loss": 0.071, "step": 6000 }, { "epoch": 0.045327884079509564, "grad_norm": 1.372057318687439, "learning_rate": 2.266310001486105e-05, "loss": 0.0821, "step": 6100 }, { "epoch": 0.04607096414638677, "grad_norm": 1.666020393371582, "learning_rate": 2.303462624461287e-05, "loss": 0.0795, "step": 6200 }, { "epoch": 0.04681404421326398, "grad_norm": 0.8905176520347595, "learning_rate": 2.3406152474364693e-05, "loss": 0.0949, "step": 6300 }, { "epoch": 0.04755712428014119, "grad_norm": 0.8247348070144653, "learning_rate": 2.377767870411651e-05, "loss": 0.081, "step": 6400 }, { "epoch": 0.04830020434701839, "grad_norm": 1.195694923400879, "learning_rate": 2.4149204933868332e-05, "loss": 0.08, "step": 6500 }, { "epoch": 0.0490432844138956, "grad_norm": 1.4031062126159668, "learning_rate": 2.4520731163620154e-05, "loss": 0.0874, "step": 6600 }, { "epoch": 0.0497863644807728, "grad_norm": 2.289123058319092, "learning_rate": 2.4892257393371975e-05, "loss": 0.0728, "step": 6700 }, { "epoch": 0.05052944454765001, "grad_norm": 1.2186951637268066, "learning_rate": 2.5263783623123793e-05, "loss": 0.0763, "step": 6800 }, { "epoch": 0.051272524614527214, "grad_norm": 0.9299426078796387, "learning_rate": 2.5635309852875618e-05, "loss": 0.0885, "step": 6900 }, { "epoch": 0.05201560468140442, "grad_norm": 1.0141850709915161, "learning_rate": 2.6006836082627435e-05, "loss": 0.0711, "step": 7000 }, { "epoch": 0.052758684748281624, "grad_norm": 1.7413781881332397, "learning_rate": 2.6378362312379257e-05, "loss": 0.0796, "step": 7100 }, { "epoch": 0.053501764815158837, "grad_norm": 1.567700743675232, "learning_rate": 2.6749888542131075e-05, "loss": 0.078, "step": 7200 }, { "epoch": 0.05424484488203604, "grad_norm": 1.0199575424194336, "learning_rate": 2.7121414771882896e-05, "loss": 0.075, "step": 7300 }, { "epoch": 0.05498792494891325, "grad_norm": 1.1288998126983643, "learning_rate": 2.7492941001634714e-05, "loss": 0.0822, "step": 7400 }, { "epoch": 0.05573100501579045, "grad_norm": 0.901070237159729, "learning_rate": 2.786446723138654e-05, "loss": 0.0763, "step": 7500 }, { "epoch": 0.05647408508266766, "grad_norm": 1.6706836223602295, "learning_rate": 2.823599346113836e-05, "loss": 0.0639, "step": 7600 }, { "epoch": 0.05721716514954486, "grad_norm": 0.7994457483291626, "learning_rate": 2.8607519690890178e-05, "loss": 0.0707, "step": 7700 }, { "epoch": 0.05796024521642207, "grad_norm": 0.67986661195755, "learning_rate": 2.8979045920642002e-05, "loss": 0.0736, "step": 7800 }, { "epoch": 0.058703325283299274, "grad_norm": 3.7049405574798584, "learning_rate": 2.9350572150393817e-05, "loss": 0.0652, "step": 7900 }, { "epoch": 0.05944640535017648, "grad_norm": 0.9632551670074463, "learning_rate": 2.972209838014564e-05, "loss": 0.06, "step": 8000 }, { "epoch": 0.060189485417053684, "grad_norm": 0.7603564262390137, "learning_rate": 3.009362460989746e-05, "loss": 0.0658, "step": 8100 }, { "epoch": 0.060932565483930896, "grad_norm": 0.5974241495132446, "learning_rate": 3.046515083964928e-05, "loss": 0.08, "step": 8200 }, { "epoch": 0.0616756455508081, "grad_norm": 1.1488542556762695, "learning_rate": 3.08366770694011e-05, "loss": 0.0758, "step": 8300 }, { "epoch": 0.06241872561768531, "grad_norm": 0.4799422025680542, "learning_rate": 3.120820329915292e-05, "loss": 0.077, "step": 8400 }, { "epoch": 0.0631618056845625, "grad_norm": 1.223254919052124, "learning_rate": 3.1579729528904744e-05, "loss": 0.0765, "step": 8500 }, { "epoch": 0.06390488575143972, "grad_norm": 3.7319703102111816, "learning_rate": 3.195125575865656e-05, "loss": 0.0647, "step": 8600 }, { "epoch": 0.06464796581831693, "grad_norm": 1.3452308177947998, "learning_rate": 3.232278198840839e-05, "loss": 0.0719, "step": 8700 }, { "epoch": 0.06539104588519414, "grad_norm": 2.8234987258911133, "learning_rate": 3.2694308218160205e-05, "loss": 0.0795, "step": 8800 }, { "epoch": 0.06613412595207134, "grad_norm": 1.100813865661621, "learning_rate": 3.306583444791202e-05, "loss": 0.0827, "step": 8900 }, { "epoch": 0.06687720601894855, "grad_norm": 0.7778908610343933, "learning_rate": 3.343736067766384e-05, "loss": 0.0614, "step": 9000 }, { "epoch": 0.06762028608582575, "grad_norm": 0.7844123244285583, "learning_rate": 3.3808886907415665e-05, "loss": 0.0682, "step": 9100 }, { "epoch": 0.06836336615270296, "grad_norm": 1.4667831659317017, "learning_rate": 3.418041313716748e-05, "loss": 0.0664, "step": 9200 }, { "epoch": 0.06910644621958016, "grad_norm": 1.0010597705841064, "learning_rate": 3.455193936691931e-05, "loss": 0.0792, "step": 9300 }, { "epoch": 0.06984952628645737, "grad_norm": 2.788736343383789, "learning_rate": 3.4923465596671126e-05, "loss": 0.0697, "step": 9400 }, { "epoch": 0.07059260635333457, "grad_norm": 0.8231617212295532, "learning_rate": 3.5294991826422944e-05, "loss": 0.0614, "step": 9500 }, { "epoch": 0.07133568642021178, "grad_norm": 1.3828425407409668, "learning_rate": 3.566651805617477e-05, "loss": 0.0877, "step": 9600 }, { "epoch": 0.07207876648708898, "grad_norm": 1.5393320322036743, "learning_rate": 3.6038044285926586e-05, "loss": 0.0647, "step": 9700 }, { "epoch": 0.07282184655396619, "grad_norm": 4.535276412963867, "learning_rate": 3.640957051567841e-05, "loss": 0.0776, "step": 9800 }, { "epoch": 0.0735649266208434, "grad_norm": 1.729735016822815, "learning_rate": 3.678109674543023e-05, "loss": 0.0631, "step": 9900 }, { "epoch": 0.0743080066877206, "grad_norm": 1.2225292921066284, "learning_rate": 3.715262297518205e-05, "loss": 0.0639, "step": 10000 }, { "epoch": 0.0750510867545978, "grad_norm": 0.6971098780632019, "learning_rate": 3.752414920493387e-05, "loss": 0.0835, "step": 10100 }, { "epoch": 0.07579416682147501, "grad_norm": 1.1242376565933228, "learning_rate": 3.789567543468569e-05, "loss": 0.0738, "step": 10200 }, { "epoch": 0.07653724688835221, "grad_norm": 0.8073209524154663, "learning_rate": 3.8267201664437514e-05, "loss": 0.0758, "step": 10300 }, { "epoch": 0.07728032695522942, "grad_norm": 0.8637209534645081, "learning_rate": 3.863872789418933e-05, "loss": 0.0787, "step": 10400 }, { "epoch": 0.07802340702210664, "grad_norm": 0.5651852488517761, "learning_rate": 3.9010254123941157e-05, "loss": 0.0678, "step": 10500 }, { "epoch": 0.07876648708898384, "grad_norm": 0.5803704857826233, "learning_rate": 3.938178035369297e-05, "loss": 0.0642, "step": 10600 }, { "epoch": 0.07950956715586105, "grad_norm": 0.4971671998500824, "learning_rate": 3.975330658344479e-05, "loss": 0.0701, "step": 10700 }, { "epoch": 0.08025264722273825, "grad_norm": 0.8406235575675964, "learning_rate": 4.012483281319661e-05, "loss": 0.0571, "step": 10800 }, { "epoch": 0.08099572728961546, "grad_norm": 0.6480376124382019, "learning_rate": 4.0496359042948435e-05, "loss": 0.0622, "step": 10900 }, { "epoch": 0.08173880735649267, "grad_norm": 2.608079671859741, "learning_rate": 4.086788527270026e-05, "loss": 0.0703, "step": 11000 }, { "epoch": 0.08248188742336987, "grad_norm": 3.149582624435425, "learning_rate": 4.123941150245208e-05, "loss": 0.0654, "step": 11100 }, { "epoch": 0.08322496749024708, "grad_norm": 1.2726750373840332, "learning_rate": 4.1610937732203895e-05, "loss": 0.0657, "step": 11200 }, { "epoch": 0.08396804755712428, "grad_norm": 1.2025091648101807, "learning_rate": 4.198246396195571e-05, "loss": 0.0696, "step": 11300 }, { "epoch": 0.08471112762400149, "grad_norm": 1.3108296394348145, "learning_rate": 4.235399019170754e-05, "loss": 0.0669, "step": 11400 }, { "epoch": 0.08545420769087869, "grad_norm": 0.7787677049636841, "learning_rate": 4.2725516421459356e-05, "loss": 0.0613, "step": 11500 }, { "epoch": 0.0861972877577559, "grad_norm": 0.4056709110736847, "learning_rate": 4.309704265121118e-05, "loss": 0.0594, "step": 11600 }, { "epoch": 0.0869403678246331, "grad_norm": 1.2153915166854858, "learning_rate": 4.3468568880963e-05, "loss": 0.068, "step": 11700 }, { "epoch": 0.08768344789151031, "grad_norm": 0.4020076096057892, "learning_rate": 4.3840095110714816e-05, "loss": 0.0687, "step": 11800 }, { "epoch": 0.08842652795838751, "grad_norm": 1.3127304315567017, "learning_rate": 4.421162134046664e-05, "loss": 0.0785, "step": 11900 }, { "epoch": 0.08916960802526472, "grad_norm": 0.9813829660415649, "learning_rate": 4.458314757021846e-05, "loss": 0.0713, "step": 12000 }, { "epoch": 0.08991268809214192, "grad_norm": 0.4510834515094757, "learning_rate": 4.4954673799970283e-05, "loss": 0.0686, "step": 12100 }, { "epoch": 0.09065576815901913, "grad_norm": 0.8281304240226746, "learning_rate": 4.53262000297221e-05, "loss": 0.0626, "step": 12200 }, { "epoch": 0.09139884822589633, "grad_norm": 2.757025957107544, "learning_rate": 4.569772625947392e-05, "loss": 0.0705, "step": 12300 }, { "epoch": 0.09214192829277354, "grad_norm": 2.689703941345215, "learning_rate": 4.606925248922574e-05, "loss": 0.0722, "step": 12400 }, { "epoch": 0.09288500835965076, "grad_norm": 3.0333175659179688, "learning_rate": 4.644077871897756e-05, "loss": 0.0777, "step": 12500 }, { "epoch": 0.09362808842652796, "grad_norm": 0.5244377255439758, "learning_rate": 4.6812304948729386e-05, "loss": 0.0655, "step": 12600 }, { "epoch": 0.09437116849340517, "grad_norm": 1.2108030319213867, "learning_rate": 4.7183831178481204e-05, "loss": 0.064, "step": 12700 }, { "epoch": 0.09511424856028237, "grad_norm": 0.687308669090271, "learning_rate": 4.755535740823302e-05, "loss": 0.0653, "step": 12800 }, { "epoch": 0.09585732862715958, "grad_norm": 0.7371180653572083, "learning_rate": 4.792688363798484e-05, "loss": 0.0623, "step": 12900 }, { "epoch": 0.09660040869403679, "grad_norm": 1.1190991401672363, "learning_rate": 4.8298409867736665e-05, "loss": 0.0647, "step": 13000 }, { "epoch": 0.09734348876091399, "grad_norm": 0.7727264165878296, "learning_rate": 4.866993609748848e-05, "loss": 0.0728, "step": 13100 }, { "epoch": 0.0980865688277912, "grad_norm": 1.4672969579696655, "learning_rate": 4.904146232724031e-05, "loss": 0.0675, "step": 13200 }, { "epoch": 0.0988296488946684, "grad_norm": 0.6341565251350403, "learning_rate": 4.9412988556992125e-05, "loss": 0.0586, "step": 13300 }, { "epoch": 0.0995727289615456, "grad_norm": 1.1076687574386597, "learning_rate": 4.978451478674395e-05, "loss": 0.0667, "step": 13400 }, { "epoch": 0.10031580902842281, "grad_norm": 0.4673527777194977, "learning_rate": 4.999178699372683e-05, "loss": 0.0596, "step": 13500 }, { "epoch": 0.10105888909530002, "grad_norm": 1.0587416887283325, "learning_rate": 4.997223221688595e-05, "loss": 0.0582, "step": 13600 }, { "epoch": 0.10180196916217722, "grad_norm": 0.5402936935424805, "learning_rate": 4.995267744004506e-05, "loss": 0.061, "step": 13700 }, { "epoch": 0.10254504922905443, "grad_norm": 0.7856093049049377, "learning_rate": 4.993312266320417e-05, "loss": 0.0587, "step": 13800 }, { "epoch": 0.10328812929593163, "grad_norm": 3.009688377380371, "learning_rate": 4.991356788636328e-05, "loss": 0.07, "step": 13900 }, { "epoch": 0.10403120936280884, "grad_norm": 0.8565527200698853, "learning_rate": 4.98940131095224e-05, "loss": 0.0512, "step": 14000 }, { "epoch": 0.10477428942968604, "grad_norm": 1.0530641078948975, "learning_rate": 4.987445833268151e-05, "loss": 0.0623, "step": 14100 }, { "epoch": 0.10551736949656325, "grad_norm": 0.5506693124771118, "learning_rate": 4.985490355584062e-05, "loss": 0.0603, "step": 14200 }, { "epoch": 0.10626044956344045, "grad_norm": 0.54316645860672, "learning_rate": 4.9835348778999734e-05, "loss": 0.0571, "step": 14300 }, { "epoch": 0.10700352963031767, "grad_norm": 0.25276821851730347, "learning_rate": 4.981579400215885e-05, "loss": 0.0631, "step": 14400 }, { "epoch": 0.10774660969719488, "grad_norm": 0.8100239634513855, "learning_rate": 4.979623922531796e-05, "loss": 0.059, "step": 14500 }, { "epoch": 0.10848968976407208, "grad_norm": 0.3892728388309479, "learning_rate": 4.977668444847708e-05, "loss": 0.0697, "step": 14600 }, { "epoch": 0.10923276983094929, "grad_norm": 0.4793049693107605, "learning_rate": 4.9757129671636186e-05, "loss": 0.0666, "step": 14700 }, { "epoch": 0.1099758498978265, "grad_norm": 0.4594714343547821, "learning_rate": 4.9737574894795304e-05, "loss": 0.0697, "step": 14800 }, { "epoch": 0.1107189299647037, "grad_norm": 0.5363215804100037, "learning_rate": 4.9718020117954415e-05, "loss": 0.0552, "step": 14900 }, { "epoch": 0.1114620100315809, "grad_norm": 0.6913743019104004, "learning_rate": 4.9698465341113534e-05, "loss": 0.0728, "step": 15000 }, { "epoch": 0.11220509009845811, "grad_norm": 0.50197434425354, "learning_rate": 4.967891056427264e-05, "loss": 0.0637, "step": 15100 }, { "epoch": 0.11294817016533532, "grad_norm": 1.240340232849121, "learning_rate": 4.9659355787431756e-05, "loss": 0.0534, "step": 15200 }, { "epoch": 0.11369125023221252, "grad_norm": 1.007525086402893, "learning_rate": 4.963980101059087e-05, "loss": 0.0608, "step": 15300 }, { "epoch": 0.11443433029908973, "grad_norm": 0.8173413276672363, "learning_rate": 4.9620246233749986e-05, "loss": 0.0679, "step": 15400 }, { "epoch": 0.11517741036596693, "grad_norm": 0.7330752015113831, "learning_rate": 4.96006914569091e-05, "loss": 0.073, "step": 15500 }, { "epoch": 0.11592049043284414, "grad_norm": 0.7711820006370544, "learning_rate": 4.958113668006821e-05, "loss": 0.051, "step": 15600 }, { "epoch": 0.11666357049972134, "grad_norm": 0.9198204874992371, "learning_rate": 4.956158190322732e-05, "loss": 0.063, "step": 15700 }, { "epoch": 0.11740665056659855, "grad_norm": 0.5483224391937256, "learning_rate": 4.954202712638644e-05, "loss": 0.0606, "step": 15800 }, { "epoch": 0.11814973063347575, "grad_norm": 0.5079103708267212, "learning_rate": 4.952247234954555e-05, "loss": 0.0593, "step": 15900 }, { "epoch": 0.11889281070035296, "grad_norm": 0.23663076758384705, "learning_rate": 4.950291757270466e-05, "loss": 0.0603, "step": 16000 }, { "epoch": 0.11963589076723016, "grad_norm": 0.6162376999855042, "learning_rate": 4.948336279586377e-05, "loss": 0.0521, "step": 16100 }, { "epoch": 0.12037897083410737, "grad_norm": 0.44027620553970337, "learning_rate": 4.946380801902289e-05, "loss": 0.0534, "step": 16200 }, { "epoch": 0.12112205090098459, "grad_norm": 0.7208848595619202, "learning_rate": 4.9444253242182e-05, "loss": 0.057, "step": 16300 }, { "epoch": 0.12186513096786179, "grad_norm": 0.6339391469955444, "learning_rate": 4.942469846534112e-05, "loss": 0.0643, "step": 16400 }, { "epoch": 0.122608211034739, "grad_norm": 0.5541401505470276, "learning_rate": 4.9405143688500225e-05, "loss": 0.0609, "step": 16500 }, { "epoch": 0.1233512911016162, "grad_norm": 0.759369432926178, "learning_rate": 4.938558891165934e-05, "loss": 0.0583, "step": 16600 }, { "epoch": 0.12409437116849341, "grad_norm": 0.3653734028339386, "learning_rate": 4.9366034134818454e-05, "loss": 0.0655, "step": 16700 }, { "epoch": 0.12483745123537061, "grad_norm": 0.49060988426208496, "learning_rate": 4.934647935797757e-05, "loss": 0.0635, "step": 16800 }, { "epoch": 0.1255805313022478, "grad_norm": 0.6726740598678589, "learning_rate": 4.932692458113668e-05, "loss": 0.0618, "step": 16900 }, { "epoch": 0.126323611369125, "grad_norm": 1.4769412279129028, "learning_rate": 4.9307369804295795e-05, "loss": 0.0601, "step": 17000 }, { "epoch": 0.12706669143600222, "grad_norm": 0.2961408197879791, "learning_rate": 4.9287815027454906e-05, "loss": 0.0679, "step": 17100 }, { "epoch": 0.12780977150287945, "grad_norm": 0.30870574712753296, "learning_rate": 4.9268260250614025e-05, "loss": 0.0684, "step": 17200 }, { "epoch": 0.12855285156975665, "grad_norm": 0.43332019448280334, "learning_rate": 4.9248705473773136e-05, "loss": 0.0712, "step": 17300 }, { "epoch": 0.12929593163663386, "grad_norm": 1.5273380279541016, "learning_rate": 4.922915069693225e-05, "loss": 0.0653, "step": 17400 }, { "epoch": 0.13003901170351106, "grad_norm": 0.2486886978149414, "learning_rate": 4.920959592009136e-05, "loss": 0.0637, "step": 17500 }, { "epoch": 0.13078209177038827, "grad_norm": 0.6933874487876892, "learning_rate": 4.919004114325048e-05, "loss": 0.0623, "step": 17600 }, { "epoch": 0.13152517183726548, "grad_norm": 0.24979476630687714, "learning_rate": 4.917048636640959e-05, "loss": 0.0667, "step": 17700 }, { "epoch": 0.13226825190414268, "grad_norm": 0.4043433368206024, "learning_rate": 4.91509315895687e-05, "loss": 0.0632, "step": 17800 }, { "epoch": 0.13301133197101989, "grad_norm": 1.597890019416809, "learning_rate": 4.913137681272781e-05, "loss": 0.0847, "step": 17900 }, { "epoch": 0.1337544120378971, "grad_norm": 0.5368407368659973, "learning_rate": 4.911182203588693e-05, "loss": 0.061, "step": 18000 }, { "epoch": 0.1344974921047743, "grad_norm": 0.5569105744361877, "learning_rate": 4.909226725904604e-05, "loss": 0.0637, "step": 18100 }, { "epoch": 0.1352405721716515, "grad_norm": 0.753337562084198, "learning_rate": 4.907271248220516e-05, "loss": 0.0626, "step": 18200 }, { "epoch": 0.1359836522385287, "grad_norm": 0.22668150067329407, "learning_rate": 4.905315770536426e-05, "loss": 0.0553, "step": 18300 }, { "epoch": 0.1367267323054059, "grad_norm": 0.6265455484390259, "learning_rate": 4.903360292852338e-05, "loss": 0.0635, "step": 18400 }, { "epoch": 0.13746981237228312, "grad_norm": 0.5316629409790039, "learning_rate": 4.901404815168249e-05, "loss": 0.0579, "step": 18500 }, { "epoch": 0.13821289243916032, "grad_norm": 0.722754716873169, "learning_rate": 4.899449337484161e-05, "loss": 0.0675, "step": 18600 }, { "epoch": 0.13895597250603753, "grad_norm": 2.0471303462982178, "learning_rate": 4.897493859800072e-05, "loss": 0.0598, "step": 18700 }, { "epoch": 0.13969905257291473, "grad_norm": 1.298561453819275, "learning_rate": 4.8955383821159834e-05, "loss": 0.0608, "step": 18800 }, { "epoch": 0.14044213263979194, "grad_norm": 0.6990522146224976, "learning_rate": 4.8935829044318945e-05, "loss": 0.0721, "step": 18900 }, { "epoch": 0.14118521270666914, "grad_norm": 0.6482419967651367, "learning_rate": 4.891627426747806e-05, "loss": 0.0557, "step": 19000 }, { "epoch": 0.14192829277354635, "grad_norm": 0.48900648951530457, "learning_rate": 4.8896719490637175e-05, "loss": 0.0609, "step": 19100 }, { "epoch": 0.14267137284042355, "grad_norm": 0.32317060232162476, "learning_rate": 4.8877164713796286e-05, "loss": 0.059, "step": 19200 }, { "epoch": 0.14341445290730076, "grad_norm": 0.22735153138637543, "learning_rate": 4.8857609936955404e-05, "loss": 0.059, "step": 19300 }, { "epoch": 0.14415753297417797, "grad_norm": 0.36908093094825745, "learning_rate": 4.8838055160114515e-05, "loss": 0.0549, "step": 19400 }, { "epoch": 0.14490061304105517, "grad_norm": 0.7472820281982422, "learning_rate": 4.881850038327363e-05, "loss": 0.0518, "step": 19500 }, { "epoch": 0.14564369310793238, "grad_norm": 0.41608092188835144, "learning_rate": 4.8798945606432745e-05, "loss": 0.06, "step": 19600 }, { "epoch": 0.14638677317480958, "grad_norm": 0.5696953535079956, "learning_rate": 4.8779390829591856e-05, "loss": 0.0748, "step": 19700 }, { "epoch": 0.1471298532416868, "grad_norm": 1.9534960985183716, "learning_rate": 4.875983605275097e-05, "loss": 0.0562, "step": 19800 }, { "epoch": 0.147872933308564, "grad_norm": 0.7562423348426819, "learning_rate": 4.874028127591008e-05, "loss": 0.0643, "step": 19900 }, { "epoch": 0.1486160133754412, "grad_norm": 0.7428703904151917, "learning_rate": 4.87207264990692e-05, "loss": 0.0728, "step": 20000 }, { "epoch": 0.1493590934423184, "grad_norm": 2.623988151550293, "learning_rate": 4.870117172222831e-05, "loss": 0.0649, "step": 20100 }, { "epoch": 0.1501021735091956, "grad_norm": 1.0980720520019531, "learning_rate": 4.868161694538742e-05, "loss": 0.057, "step": 20200 }, { "epoch": 0.1508452535760728, "grad_norm": 0.7622421979904175, "learning_rate": 4.866206216854654e-05, "loss": 0.074, "step": 20300 }, { "epoch": 0.15158833364295002, "grad_norm": 0.9466246962547302, "learning_rate": 4.864250739170565e-05, "loss": 0.0494, "step": 20400 }, { "epoch": 0.15233141370982722, "grad_norm": 0.47995156049728394, "learning_rate": 4.862295261486476e-05, "loss": 0.0506, "step": 20500 }, { "epoch": 0.15307449377670443, "grad_norm": 0.8018309473991394, "learning_rate": 4.860339783802387e-05, "loss": 0.0638, "step": 20600 }, { "epoch": 0.15381757384358163, "grad_norm": 0.4469168782234192, "learning_rate": 4.858384306118299e-05, "loss": 0.0565, "step": 20700 }, { "epoch": 0.15456065391045884, "grad_norm": 0.5291076898574829, "learning_rate": 4.85642882843421e-05, "loss": 0.0531, "step": 20800 }, { "epoch": 0.15530373397733604, "grad_norm": 0.4876657724380493, "learning_rate": 4.854473350750122e-05, "loss": 0.0614, "step": 20900 }, { "epoch": 0.15604681404421328, "grad_norm": 0.35491397976875305, "learning_rate": 4.8525178730660325e-05, "loss": 0.06, "step": 21000 }, { "epoch": 0.15678989411109048, "grad_norm": 0.3132985830307007, "learning_rate": 4.850562395381944e-05, "loss": 0.0587, "step": 21100 }, { "epoch": 0.1575329741779677, "grad_norm": 0.41143614053726196, "learning_rate": 4.8486069176978554e-05, "loss": 0.0595, "step": 21200 }, { "epoch": 0.1582760542448449, "grad_norm": 0.27149155735969543, "learning_rate": 4.846651440013767e-05, "loss": 0.05, "step": 21300 }, { "epoch": 0.1590191343117221, "grad_norm": 0.2574271261692047, "learning_rate": 4.8446959623296784e-05, "loss": 0.0485, "step": 21400 }, { "epoch": 0.1597622143785993, "grad_norm": 0.6292992234230042, "learning_rate": 4.8427404846455895e-05, "loss": 0.0636, "step": 21500 }, { "epoch": 0.1605052944454765, "grad_norm": 0.6022348403930664, "learning_rate": 4.8407850069615006e-05, "loss": 0.0603, "step": 21600 }, { "epoch": 0.16124837451235371, "grad_norm": 0.49796241521835327, "learning_rate": 4.8388295292774125e-05, "loss": 0.0519, "step": 21700 }, { "epoch": 0.16199145457923092, "grad_norm": 0.5306269526481628, "learning_rate": 4.8368740515933236e-05, "loss": 0.0587, "step": 21800 }, { "epoch": 0.16273453464610813, "grad_norm": 0.36153480410575867, "learning_rate": 4.834918573909235e-05, "loss": 0.0614, "step": 21900 }, { "epoch": 0.16347761471298533, "grad_norm": 0.41434040665626526, "learning_rate": 4.832963096225146e-05, "loss": 0.053, "step": 22000 }, { "epoch": 0.16422069477986254, "grad_norm": 0.3059290647506714, "learning_rate": 4.831007618541058e-05, "loss": 0.0493, "step": 22100 }, { "epoch": 0.16496377484673974, "grad_norm": 0.2532903552055359, "learning_rate": 4.829052140856969e-05, "loss": 0.0616, "step": 22200 }, { "epoch": 0.16570685491361695, "grad_norm": 0.45050281286239624, "learning_rate": 4.8270966631728806e-05, "loss": 0.0539, "step": 22300 }, { "epoch": 0.16644993498049415, "grad_norm": 0.5246642231941223, "learning_rate": 4.825141185488791e-05, "loss": 0.0656, "step": 22400 }, { "epoch": 0.16719301504737136, "grad_norm": 0.5318479537963867, "learning_rate": 4.823185707804703e-05, "loss": 0.0584, "step": 22500 }, { "epoch": 0.16793609511424856, "grad_norm": 0.4292713701725006, "learning_rate": 4.821230230120614e-05, "loss": 0.0635, "step": 22600 }, { "epoch": 0.16867917518112577, "grad_norm": 0.4730810225009918, "learning_rate": 4.819274752436526e-05, "loss": 0.0626, "step": 22700 }, { "epoch": 0.16942225524800297, "grad_norm": 0.5150263905525208, "learning_rate": 4.817319274752436e-05, "loss": 0.0678, "step": 22800 }, { "epoch": 0.17016533531488018, "grad_norm": 0.3297137916088104, "learning_rate": 4.815363797068348e-05, "loss": 0.0477, "step": 22900 }, { "epoch": 0.17090841538175738, "grad_norm": 0.46765702962875366, "learning_rate": 4.813408319384259e-05, "loss": 0.0651, "step": 23000 }, { "epoch": 0.1716514954486346, "grad_norm": 0.4213978350162506, "learning_rate": 4.811452841700171e-05, "loss": 0.0506, "step": 23100 }, { "epoch": 0.1723945755155118, "grad_norm": 0.841088056564331, "learning_rate": 4.809497364016082e-05, "loss": 0.0557, "step": 23200 }, { "epoch": 0.173137655582389, "grad_norm": 0.4980924427509308, "learning_rate": 4.8075418863319934e-05, "loss": 0.056, "step": 23300 }, { "epoch": 0.1738807356492662, "grad_norm": 0.5139170289039612, "learning_rate": 4.8055864086479045e-05, "loss": 0.0649, "step": 23400 }, { "epoch": 0.1746238157161434, "grad_norm": 0.20433948934078217, "learning_rate": 4.803630930963816e-05, "loss": 0.057, "step": 23500 }, { "epoch": 0.17536689578302062, "grad_norm": 0.21892039477825165, "learning_rate": 4.8016754532797275e-05, "loss": 0.0591, "step": 23600 }, { "epoch": 0.17610997584989782, "grad_norm": 0.534686803817749, "learning_rate": 4.7997199755956386e-05, "loss": 0.0742, "step": 23700 }, { "epoch": 0.17685305591677503, "grad_norm": 0.2837038040161133, "learning_rate": 4.79776449791155e-05, "loss": 0.0573, "step": 23800 }, { "epoch": 0.17759613598365223, "grad_norm": 0.18008054792881012, "learning_rate": 4.7958090202274616e-05, "loss": 0.0517, "step": 23900 }, { "epoch": 0.17833921605052944, "grad_norm": 2.39695405960083, "learning_rate": 4.793853542543373e-05, "loss": 0.0703, "step": 24000 }, { "epoch": 0.17908229611740664, "grad_norm": 2.7672462463378906, "learning_rate": 4.7918980648592845e-05, "loss": 0.054, "step": 24100 }, { "epoch": 0.17982537618428385, "grad_norm": 1.2251644134521484, "learning_rate": 4.789942587175195e-05, "loss": 0.0607, "step": 24200 }, { "epoch": 0.18056845625116105, "grad_norm": 0.1749626100063324, "learning_rate": 4.787987109491107e-05, "loss": 0.0726, "step": 24300 }, { "epoch": 0.18131153631803826, "grad_norm": 0.48285311460494995, "learning_rate": 4.786031631807018e-05, "loss": 0.0502, "step": 24400 }, { "epoch": 0.18205461638491546, "grad_norm": 0.8955739736557007, "learning_rate": 4.78407615412293e-05, "loss": 0.0735, "step": 24500 }, { "epoch": 0.18279769645179267, "grad_norm": 0.41905316710472107, "learning_rate": 4.782120676438841e-05, "loss": 0.0511, "step": 24600 }, { "epoch": 0.18354077651866987, "grad_norm": 0.32530397176742554, "learning_rate": 4.780165198754752e-05, "loss": 0.0628, "step": 24700 }, { "epoch": 0.18428385658554708, "grad_norm": 1.1762590408325195, "learning_rate": 4.778209721070663e-05, "loss": 0.0554, "step": 24800 }, { "epoch": 0.1850269366524243, "grad_norm": 0.714205265045166, "learning_rate": 4.776254243386575e-05, "loss": 0.0526, "step": 24900 }, { "epoch": 0.18577001671930152, "grad_norm": 0.4641837477684021, "learning_rate": 4.774298765702486e-05, "loss": 0.0603, "step": 25000 }, { "epoch": 0.18651309678617872, "grad_norm": 0.6658568382263184, "learning_rate": 4.772343288018397e-05, "loss": 0.0468, "step": 25100 }, { "epoch": 0.18725617685305593, "grad_norm": 0.4403233826160431, "learning_rate": 4.7703878103343084e-05, "loss": 0.0484, "step": 25200 }, { "epoch": 0.18799925691993313, "grad_norm": 0.7554200291633606, "learning_rate": 4.76843233265022e-05, "loss": 0.0542, "step": 25300 }, { "epoch": 0.18874233698681034, "grad_norm": 0.4334091246128082, "learning_rate": 4.766476854966131e-05, "loss": 0.0518, "step": 25400 }, { "epoch": 0.18948541705368754, "grad_norm": 1.033246397972107, "learning_rate": 4.764521377282043e-05, "loss": 0.0622, "step": 25500 }, { "epoch": 0.19022849712056475, "grad_norm": 2.522212028503418, "learning_rate": 4.7625658995979536e-05, "loss": 0.0664, "step": 25600 }, { "epoch": 0.19097157718744195, "grad_norm": 0.5005839467048645, "learning_rate": 4.7606104219138654e-05, "loss": 0.052, "step": 25700 }, { "epoch": 0.19171465725431916, "grad_norm": 0.5608080625534058, "learning_rate": 4.7586549442297766e-05, "loss": 0.0506, "step": 25800 }, { "epoch": 0.19245773732119636, "grad_norm": 0.7228055000305176, "learning_rate": 4.7566994665456884e-05, "loss": 0.059, "step": 25900 }, { "epoch": 0.19320081738807357, "grad_norm": 0.7754433751106262, "learning_rate": 4.754743988861599e-05, "loss": 0.0554, "step": 26000 }, { "epoch": 0.19394389745495078, "grad_norm": 0.4492543041706085, "learning_rate": 4.7527885111775106e-05, "loss": 0.057, "step": 26100 }, { "epoch": 0.19468697752182798, "grad_norm": 0.4262939989566803, "learning_rate": 4.750833033493422e-05, "loss": 0.0622, "step": 26200 }, { "epoch": 0.19543005758870519, "grad_norm": 0.5197632312774658, "learning_rate": 4.7488775558093336e-05, "loss": 0.0491, "step": 26300 }, { "epoch": 0.1961731376555824, "grad_norm": 0.36253371834754944, "learning_rate": 4.746922078125245e-05, "loss": 0.054, "step": 26400 }, { "epoch": 0.1969162177224596, "grad_norm": 0.5164226293563843, "learning_rate": 4.744966600441156e-05, "loss": 0.0617, "step": 26500 }, { "epoch": 0.1976592977893368, "grad_norm": 0.37217646837234497, "learning_rate": 4.743011122757067e-05, "loss": 0.0507, "step": 26600 }, { "epoch": 0.198402377856214, "grad_norm": 0.3745191991329193, "learning_rate": 4.741055645072979e-05, "loss": 0.0642, "step": 26700 }, { "epoch": 0.1991454579230912, "grad_norm": 0.3051743507385254, "learning_rate": 4.73910016738889e-05, "loss": 0.0588, "step": 26800 }, { "epoch": 0.19988853798996842, "grad_norm": 0.9582915902137756, "learning_rate": 4.737144689704801e-05, "loss": 0.067, "step": 26900 }, { "epoch": 0.20063161805684562, "grad_norm": 0.44074833393096924, "learning_rate": 4.735189212020712e-05, "loss": 0.058, "step": 27000 }, { "epoch": 0.20137469812372283, "grad_norm": 0.504858672618866, "learning_rate": 4.733233734336624e-05, "loss": 0.0494, "step": 27100 }, { "epoch": 0.20211777819060003, "grad_norm": 0.5550320148468018, "learning_rate": 4.731278256652535e-05, "loss": 0.052, "step": 27200 }, { "epoch": 0.20286085825747724, "grad_norm": 0.5049013495445251, "learning_rate": 4.729322778968447e-05, "loss": 0.0626, "step": 27300 }, { "epoch": 0.20360393832435444, "grad_norm": 0.6147429347038269, "learning_rate": 4.7273673012843575e-05, "loss": 0.0522, "step": 27400 }, { "epoch": 0.20434701839123165, "grad_norm": 0.33005601167678833, "learning_rate": 4.725411823600269e-05, "loss": 0.052, "step": 27500 }, { "epoch": 0.20509009845810885, "grad_norm": 5.219080448150635, "learning_rate": 4.7234563459161804e-05, "loss": 0.053, "step": 27600 }, { "epoch": 0.20583317852498606, "grad_norm": 0.24771401286125183, "learning_rate": 4.721500868232092e-05, "loss": 0.0486, "step": 27700 }, { "epoch": 0.20657625859186327, "grad_norm": 0.38321298360824585, "learning_rate": 4.7195453905480034e-05, "loss": 0.0538, "step": 27800 }, { "epoch": 0.20731933865874047, "grad_norm": 0.5224072933197021, "learning_rate": 4.7175899128639145e-05, "loss": 0.0528, "step": 27900 }, { "epoch": 0.20806241872561768, "grad_norm": 1.855690360069275, "learning_rate": 4.7156344351798256e-05, "loss": 0.0544, "step": 28000 }, { "epoch": 0.20880549879249488, "grad_norm": 0.40254470705986023, "learning_rate": 4.7136789574957375e-05, "loss": 0.0529, "step": 28100 }, { "epoch": 0.2095485788593721, "grad_norm": 0.2925325632095337, "learning_rate": 4.7117234798116486e-05, "loss": 0.0657, "step": 28200 }, { "epoch": 0.2102916589262493, "grad_norm": 0.3594666123390198, "learning_rate": 4.70976800212756e-05, "loss": 0.0629, "step": 28300 }, { "epoch": 0.2110347389931265, "grad_norm": 0.5638003945350647, "learning_rate": 4.707812524443471e-05, "loss": 0.0571, "step": 28400 }, { "epoch": 0.2117778190600037, "grad_norm": 0.42436695098876953, "learning_rate": 4.705857046759383e-05, "loss": 0.0558, "step": 28500 }, { "epoch": 0.2125208991268809, "grad_norm": 0.3580825924873352, "learning_rate": 4.703901569075294e-05, "loss": 0.0607, "step": 28600 }, { "epoch": 0.21326397919375814, "grad_norm": 0.6465857028961182, "learning_rate": 4.701946091391205e-05, "loss": 0.0516, "step": 28700 }, { "epoch": 0.21400705926063535, "grad_norm": 0.3862816095352173, "learning_rate": 4.699990613707117e-05, "loss": 0.0508, "step": 28800 }, { "epoch": 0.21475013932751255, "grad_norm": 0.40670090913772583, "learning_rate": 4.698035136023028e-05, "loss": 0.0727, "step": 28900 }, { "epoch": 0.21549321939438976, "grad_norm": 0.4922596216201782, "learning_rate": 4.696079658338939e-05, "loss": 0.0492, "step": 29000 }, { "epoch": 0.21623629946126696, "grad_norm": 0.6401015520095825, "learning_rate": 4.694124180654851e-05, "loss": 0.0571, "step": 29100 }, { "epoch": 0.21697937952814417, "grad_norm": 1.4843968152999878, "learning_rate": 4.692168702970762e-05, "loss": 0.0521, "step": 29200 }, { "epoch": 0.21772245959502137, "grad_norm": 0.3753911554813385, "learning_rate": 4.690213225286673e-05, "loss": 0.0494, "step": 29300 }, { "epoch": 0.21846553966189858, "grad_norm": 0.4754498302936554, "learning_rate": 4.688257747602585e-05, "loss": 0.0618, "step": 29400 }, { "epoch": 0.21920861972877578, "grad_norm": 0.4274585545063019, "learning_rate": 4.686302269918496e-05, "loss": 0.0633, "step": 29500 }, { "epoch": 0.219951699795653, "grad_norm": 0.278058797121048, "learning_rate": 4.684346792234407e-05, "loss": 0.0488, "step": 29600 }, { "epoch": 0.2206947798625302, "grad_norm": 0.9572972655296326, "learning_rate": 4.6823913145503184e-05, "loss": 0.0538, "step": 29700 }, { "epoch": 0.2214378599294074, "grad_norm": 0.5258646011352539, "learning_rate": 4.68043583686623e-05, "loss": 0.0529, "step": 29800 }, { "epoch": 0.2221809399962846, "grad_norm": 0.29945823550224304, "learning_rate": 4.678480359182141e-05, "loss": 0.0549, "step": 29900 }, { "epoch": 0.2229240200631618, "grad_norm": 0.31282907724380493, "learning_rate": 4.6765248814980525e-05, "loss": 0.0531, "step": 30000 }, { "epoch": 0.22366710013003901, "grad_norm": 0.23627343773841858, "learning_rate": 4.6745694038139636e-05, "loss": 0.056, "step": 30100 }, { "epoch": 0.22441018019691622, "grad_norm": 0.5076658725738525, "learning_rate": 4.6726139261298754e-05, "loss": 0.0582, "step": 30200 }, { "epoch": 0.22515326026379343, "grad_norm": 0.4498322010040283, "learning_rate": 4.6706584484457866e-05, "loss": 0.0656, "step": 30300 }, { "epoch": 0.22589634033067063, "grad_norm": 2.0586252212524414, "learning_rate": 4.6687029707616984e-05, "loss": 0.061, "step": 30400 }, { "epoch": 0.22663942039754784, "grad_norm": 0.8353273272514343, "learning_rate": 4.666747493077609e-05, "loss": 0.0503, "step": 30500 }, { "epoch": 0.22738250046442504, "grad_norm": 0.5048010349273682, "learning_rate": 4.6647920153935206e-05, "loss": 0.0452, "step": 30600 }, { "epoch": 0.22812558053130225, "grad_norm": 2.223921537399292, "learning_rate": 4.662836537709432e-05, "loss": 0.0576, "step": 30700 }, { "epoch": 0.22886866059817945, "grad_norm": 0.130301833152771, "learning_rate": 4.6608810600253436e-05, "loss": 0.0698, "step": 30800 }, { "epoch": 0.22961174066505666, "grad_norm": 0.4333307445049286, "learning_rate": 4.658925582341255e-05, "loss": 0.0562, "step": 30900 }, { "epoch": 0.23035482073193386, "grad_norm": 0.3135068118572235, "learning_rate": 4.656970104657166e-05, "loss": 0.0576, "step": 31000 }, { "epoch": 0.23109790079881107, "grad_norm": 0.25818172097206116, "learning_rate": 4.655014626973077e-05, "loss": 0.0461, "step": 31100 }, { "epoch": 0.23184098086568827, "grad_norm": 0.9816430807113647, "learning_rate": 4.653059149288989e-05, "loss": 0.0668, "step": 31200 }, { "epoch": 0.23258406093256548, "grad_norm": 1.7160446643829346, "learning_rate": 4.6511036716049e-05, "loss": 0.0555, "step": 31300 }, { "epoch": 0.23332714099944268, "grad_norm": 0.5461785197257996, "learning_rate": 4.649148193920812e-05, "loss": 0.0625, "step": 31400 }, { "epoch": 0.2340702210663199, "grad_norm": 0.25582554936408997, "learning_rate": 4.647192716236722e-05, "loss": 0.0512, "step": 31500 }, { "epoch": 0.2348133011331971, "grad_norm": 0.777850329875946, "learning_rate": 4.645237238552634e-05, "loss": 0.0591, "step": 31600 }, { "epoch": 0.2355563812000743, "grad_norm": 0.17021729052066803, "learning_rate": 4.643281760868545e-05, "loss": 0.066, "step": 31700 }, { "epoch": 0.2362994612669515, "grad_norm": 0.5151445865631104, "learning_rate": 4.641326283184457e-05, "loss": 0.0462, "step": 31800 }, { "epoch": 0.2370425413338287, "grad_norm": 0.5324143767356873, "learning_rate": 4.6393708055003675e-05, "loss": 0.0539, "step": 31900 }, { "epoch": 0.23778562140070592, "grad_norm": 0.29288551211357117, "learning_rate": 4.637415327816279e-05, "loss": 0.0611, "step": 32000 }, { "epoch": 0.23852870146758312, "grad_norm": 0.5757864117622375, "learning_rate": 4.6354598501321904e-05, "loss": 0.0538, "step": 32100 }, { "epoch": 0.23927178153446033, "grad_norm": 0.33358100056648254, "learning_rate": 4.633504372448102e-05, "loss": 0.0461, "step": 32200 }, { "epoch": 0.24001486160133753, "grad_norm": 0.5818902850151062, "learning_rate": 4.6315488947640134e-05, "loss": 0.0508, "step": 32300 }, { "epoch": 0.24075794166821474, "grad_norm": 0.6068462133407593, "learning_rate": 4.6295934170799245e-05, "loss": 0.0451, "step": 32400 }, { "epoch": 0.24150102173509197, "grad_norm": 0.28094497323036194, "learning_rate": 4.6276379393958357e-05, "loss": 0.0557, "step": 32500 }, { "epoch": 0.24224410180196917, "grad_norm": 0.2638779282569885, "learning_rate": 4.6256824617117475e-05, "loss": 0.0567, "step": 32600 }, { "epoch": 0.24298718186884638, "grad_norm": 1.550369143486023, "learning_rate": 4.6237269840276586e-05, "loss": 0.0535, "step": 32700 }, { "epoch": 0.24373026193572359, "grad_norm": 1.7364603281021118, "learning_rate": 4.62177150634357e-05, "loss": 0.0545, "step": 32800 }, { "epoch": 0.2444733420026008, "grad_norm": 0.9635714888572693, "learning_rate": 4.619816028659481e-05, "loss": 0.0551, "step": 32900 }, { "epoch": 0.245216422069478, "grad_norm": 0.25181087851524353, "learning_rate": 4.617860550975393e-05, "loss": 0.0584, "step": 33000 }, { "epoch": 0.2459595021363552, "grad_norm": 0.26599863171577454, "learning_rate": 4.615905073291304e-05, "loss": 0.0608, "step": 33100 }, { "epoch": 0.2467025822032324, "grad_norm": 0.68401038646698, "learning_rate": 4.6139495956072156e-05, "loss": 0.0509, "step": 33200 }, { "epoch": 0.2474456622701096, "grad_norm": 0.46863794326782227, "learning_rate": 4.611994117923126e-05, "loss": 0.0503, "step": 33300 }, { "epoch": 0.24818874233698682, "grad_norm": 0.7865766882896423, "learning_rate": 4.610038640239038e-05, "loss": 0.0542, "step": 33400 }, { "epoch": 0.24893182240386402, "grad_norm": 1.8174749612808228, "learning_rate": 4.608083162554949e-05, "loss": 0.0483, "step": 33500 }, { "epoch": 0.24967490247074123, "grad_norm": 0.3382914364337921, "learning_rate": 4.606127684870861e-05, "loss": 0.0582, "step": 33600 }, { "epoch": 0.25041798253761843, "grad_norm": 0.3148025870323181, "learning_rate": 4.604172207186771e-05, "loss": 0.0498, "step": 33700 }, { "epoch": 0.2511610626044956, "grad_norm": 1.3851782083511353, "learning_rate": 4.602216729502683e-05, "loss": 0.0554, "step": 33800 }, { "epoch": 0.25190414267137284, "grad_norm": 0.550360918045044, "learning_rate": 4.600261251818594e-05, "loss": 0.0538, "step": 33900 }, { "epoch": 0.25264722273825, "grad_norm": 0.28863242268562317, "learning_rate": 4.598305774134506e-05, "loss": 0.0525, "step": 34000 }, { "epoch": 0.25339030280512725, "grad_norm": 0.3711775541305542, "learning_rate": 4.596350296450417e-05, "loss": 0.0614, "step": 34100 }, { "epoch": 0.25413338287200443, "grad_norm": 0.3662073612213135, "learning_rate": 4.5943948187663284e-05, "loss": 0.0637, "step": 34200 }, { "epoch": 0.25487646293888166, "grad_norm": 0.2129790484905243, "learning_rate": 4.5924393410822395e-05, "loss": 0.0477, "step": 34300 }, { "epoch": 0.2556195430057589, "grad_norm": 0.14818350970745087, "learning_rate": 4.590483863398151e-05, "loss": 0.0567, "step": 34400 }, { "epoch": 0.2563626230726361, "grad_norm": 0.41121742129325867, "learning_rate": 4.5885283857140625e-05, "loss": 0.0744, "step": 34500 }, { "epoch": 0.2571057031395133, "grad_norm": 0.21580670773983002, "learning_rate": 4.5865729080299736e-05, "loss": 0.0578, "step": 34600 }, { "epoch": 0.2578487832063905, "grad_norm": 0.3314516544342041, "learning_rate": 4.584617430345885e-05, "loss": 0.0579, "step": 34700 }, { "epoch": 0.2585918632732677, "grad_norm": 0.5411693453788757, "learning_rate": 4.5826619526617966e-05, "loss": 0.0444, "step": 34800 }, { "epoch": 0.2593349433401449, "grad_norm": 0.4351077973842621, "learning_rate": 4.580706474977708e-05, "loss": 0.0518, "step": 34900 }, { "epoch": 0.26007802340702213, "grad_norm": 0.2806299924850464, "learning_rate": 4.5787509972936195e-05, "loss": 0.0451, "step": 35000 }, { "epoch": 0.2608211034738993, "grad_norm": 0.3335905373096466, "learning_rate": 4.57679551960953e-05, "loss": 0.0608, "step": 35100 }, { "epoch": 0.26156418354077654, "grad_norm": 0.44310262799263, "learning_rate": 4.574840041925442e-05, "loss": 0.0563, "step": 35200 }, { "epoch": 0.2623072636076537, "grad_norm": 0.3662008047103882, "learning_rate": 4.572884564241353e-05, "loss": 0.0621, "step": 35300 }, { "epoch": 0.26305034367453095, "grad_norm": 0.18301662802696228, "learning_rate": 4.570929086557265e-05, "loss": 0.0591, "step": 35400 }, { "epoch": 0.26379342374140813, "grad_norm": 0.4242526590824127, "learning_rate": 4.568973608873175e-05, "loss": 0.0516, "step": 35500 }, { "epoch": 0.26453650380828536, "grad_norm": 0.519944965839386, "learning_rate": 4.567018131189087e-05, "loss": 0.0554, "step": 35600 }, { "epoch": 0.26527958387516254, "grad_norm": 2.540412187576294, "learning_rate": 4.565062653504998e-05, "loss": 0.0539, "step": 35700 }, { "epoch": 0.26602266394203977, "grad_norm": 0.5997206568717957, "learning_rate": 4.56310717582091e-05, "loss": 0.0524, "step": 35800 }, { "epoch": 0.26676574400891695, "grad_norm": 0.389510840177536, "learning_rate": 4.561151698136821e-05, "loss": 0.0683, "step": 35900 }, { "epoch": 0.2675088240757942, "grad_norm": 0.3453822433948517, "learning_rate": 4.559196220452732e-05, "loss": 0.0535, "step": 36000 }, { "epoch": 0.26825190414267136, "grad_norm": 0.5080516338348389, "learning_rate": 4.5572407427686434e-05, "loss": 0.0476, "step": 36100 }, { "epoch": 0.2689949842095486, "grad_norm": 0.286623477935791, "learning_rate": 4.555285265084555e-05, "loss": 0.0489, "step": 36200 }, { "epoch": 0.26973806427642577, "grad_norm": 0.4629890024662018, "learning_rate": 4.553329787400466e-05, "loss": 0.0499, "step": 36300 }, { "epoch": 0.270481144343303, "grad_norm": 0.713910698890686, "learning_rate": 4.5513743097163775e-05, "loss": 0.0586, "step": 36400 }, { "epoch": 0.2712242244101802, "grad_norm": 1.3540496826171875, "learning_rate": 4.5494188320322886e-05, "loss": 0.0481, "step": 36500 }, { "epoch": 0.2719673044770574, "grad_norm": 0.8264902234077454, "learning_rate": 4.5474633543482004e-05, "loss": 0.0455, "step": 36600 }, { "epoch": 0.2727103845439346, "grad_norm": 0.4084279239177704, "learning_rate": 4.5455078766641116e-05, "loss": 0.0483, "step": 36700 }, { "epoch": 0.2734534646108118, "grad_norm": 0.2832131087779999, "learning_rate": 4.5435523989800234e-05, "loss": 0.054, "step": 36800 }, { "epoch": 0.274196544677689, "grad_norm": 1.2446478605270386, "learning_rate": 4.541596921295934e-05, "loss": 0.0468, "step": 36900 }, { "epoch": 0.27493962474456624, "grad_norm": 0.34506335854530334, "learning_rate": 4.5396414436118457e-05, "loss": 0.0521, "step": 37000 }, { "epoch": 0.2756827048114434, "grad_norm": 0.2624497413635254, "learning_rate": 4.537685965927757e-05, "loss": 0.0521, "step": 37100 }, { "epoch": 0.27642578487832065, "grad_norm": 0.5324000716209412, "learning_rate": 4.5357304882436686e-05, "loss": 0.0545, "step": 37200 }, { "epoch": 0.2771688649451978, "grad_norm": 0.15775395929813385, "learning_rate": 4.53377501055958e-05, "loss": 0.0451, "step": 37300 }, { "epoch": 0.27791194501207506, "grad_norm": 0.3161396086215973, "learning_rate": 4.531819532875491e-05, "loss": 0.0519, "step": 37400 }, { "epoch": 0.27865502507895223, "grad_norm": 0.420009970664978, "learning_rate": 4.529864055191402e-05, "loss": 0.0565, "step": 37500 }, { "epoch": 0.27939810514582947, "grad_norm": 0.38775837421417236, "learning_rate": 4.527908577507314e-05, "loss": 0.056, "step": 37600 }, { "epoch": 0.28014118521270664, "grad_norm": 0.43952253460884094, "learning_rate": 4.525953099823225e-05, "loss": 0.0524, "step": 37700 }, { "epoch": 0.2808842652795839, "grad_norm": 0.2567485272884369, "learning_rate": 4.523997622139136e-05, "loss": 0.0614, "step": 37800 }, { "epoch": 0.28162734534646106, "grad_norm": 0.3646445870399475, "learning_rate": 4.522042144455048e-05, "loss": 0.0579, "step": 37900 }, { "epoch": 0.2823704254133383, "grad_norm": 1.7531551122665405, "learning_rate": 4.520086666770959e-05, "loss": 0.0604, "step": 38000 }, { "epoch": 0.28311350548021547, "grad_norm": 2.1964550018310547, "learning_rate": 4.51813118908687e-05, "loss": 0.0519, "step": 38100 }, { "epoch": 0.2838565855470927, "grad_norm": 0.6277278661727905, "learning_rate": 4.516175711402782e-05, "loss": 0.0614, "step": 38200 }, { "epoch": 0.28459966561396993, "grad_norm": 0.48903828859329224, "learning_rate": 4.514220233718693e-05, "loss": 0.0489, "step": 38300 }, { "epoch": 0.2853427456808471, "grad_norm": 0.516082227230072, "learning_rate": 4.512264756034604e-05, "loss": 0.0497, "step": 38400 }, { "epoch": 0.28608582574772434, "grad_norm": 0.1778443157672882, "learning_rate": 4.5103092783505154e-05, "loss": 0.0446, "step": 38500 }, { "epoch": 0.2868289058146015, "grad_norm": 0.34492433071136475, "learning_rate": 4.508353800666427e-05, "loss": 0.0579, "step": 38600 }, { "epoch": 0.28757198588147875, "grad_norm": 0.6422481536865234, "learning_rate": 4.5063983229823384e-05, "loss": 0.0629, "step": 38700 }, { "epoch": 0.28831506594835593, "grad_norm": 2.4476168155670166, "learning_rate": 4.5044428452982495e-05, "loss": 0.0564, "step": 38800 }, { "epoch": 0.28905814601523316, "grad_norm": 0.264011949300766, "learning_rate": 4.502487367614161e-05, "loss": 0.0566, "step": 38900 }, { "epoch": 0.28980122608211034, "grad_norm": 0.6271048188209534, "learning_rate": 4.5005318899300725e-05, "loss": 0.0531, "step": 39000 }, { "epoch": 0.2905443061489876, "grad_norm": 0.22206905484199524, "learning_rate": 4.4985764122459836e-05, "loss": 0.0505, "step": 39100 }, { "epoch": 0.29128738621586475, "grad_norm": 0.5152437686920166, "learning_rate": 4.496620934561895e-05, "loss": 0.0476, "step": 39200 }, { "epoch": 0.292030466282742, "grad_norm": 0.19508977234363556, "learning_rate": 4.4946654568778066e-05, "loss": 0.0453, "step": 39300 }, { "epoch": 0.29277354634961916, "grad_norm": 0.37318655848503113, "learning_rate": 4.492709979193718e-05, "loss": 0.0551, "step": 39400 }, { "epoch": 0.2935166264164964, "grad_norm": 0.3601136803627014, "learning_rate": 4.490754501509629e-05, "loss": 0.0536, "step": 39500 }, { "epoch": 0.2942597064833736, "grad_norm": 0.3391737937927246, "learning_rate": 4.48879902382554e-05, "loss": 0.0505, "step": 39600 }, { "epoch": 0.2950027865502508, "grad_norm": 0.659539520740509, "learning_rate": 4.486843546141452e-05, "loss": 0.0563, "step": 39700 }, { "epoch": 0.295745866617128, "grad_norm": 0.7031291127204895, "learning_rate": 4.484888068457363e-05, "loss": 0.0551, "step": 39800 }, { "epoch": 0.2964889466840052, "grad_norm": 0.531932532787323, "learning_rate": 4.482932590773275e-05, "loss": 0.0536, "step": 39900 }, { "epoch": 0.2972320267508824, "grad_norm": 1.2159888744354248, "learning_rate": 4.480977113089186e-05, "loss": 0.0469, "step": 40000 }, { "epoch": 0.2979751068177596, "grad_norm": 0.20073872804641724, "learning_rate": 4.479021635405097e-05, "loss": 0.0483, "step": 40100 }, { "epoch": 0.2987181868846368, "grad_norm": 0.4481081962585449, "learning_rate": 4.477066157721008e-05, "loss": 0.0482, "step": 40200 }, { "epoch": 0.29946126695151404, "grad_norm": 2.20951509475708, "learning_rate": 4.47511068003692e-05, "loss": 0.054, "step": 40300 }, { "epoch": 0.3002043470183912, "grad_norm": 0.5131899118423462, "learning_rate": 4.473155202352831e-05, "loss": 0.0481, "step": 40400 }, { "epoch": 0.30094742708526845, "grad_norm": 0.4056077301502228, "learning_rate": 4.471199724668742e-05, "loss": 0.0557, "step": 40500 }, { "epoch": 0.3016905071521456, "grad_norm": 0.5825193524360657, "learning_rate": 4.4692442469846534e-05, "loss": 0.047, "step": 40600 }, { "epoch": 0.30243358721902286, "grad_norm": 0.5021544098854065, "learning_rate": 4.467288769300565e-05, "loss": 0.0548, "step": 40700 }, { "epoch": 0.30317666728590004, "grad_norm": 0.37732255458831787, "learning_rate": 4.4653332916164763e-05, "loss": 0.0591, "step": 40800 }, { "epoch": 0.30391974735277727, "grad_norm": 0.8069312572479248, "learning_rate": 4.463377813932388e-05, "loss": 0.0479, "step": 40900 }, { "epoch": 0.30466282741965445, "grad_norm": 0.689922571182251, "learning_rate": 4.4614223362482986e-05, "loss": 0.0467, "step": 41000 }, { "epoch": 0.3054059074865317, "grad_norm": 0.3999854028224945, "learning_rate": 4.4594668585642104e-05, "loss": 0.0556, "step": 41100 }, { "epoch": 0.30614898755340886, "grad_norm": 0.40871724486351013, "learning_rate": 4.4575113808801216e-05, "loss": 0.0473, "step": 41200 }, { "epoch": 0.3068920676202861, "grad_norm": 0.37278008460998535, "learning_rate": 4.4555559031960334e-05, "loss": 0.059, "step": 41300 }, { "epoch": 0.30763514768716327, "grad_norm": 2.71817684173584, "learning_rate": 4.453600425511944e-05, "loss": 0.0592, "step": 41400 }, { "epoch": 0.3083782277540405, "grad_norm": 0.44062116742134094, "learning_rate": 4.4516449478278557e-05, "loss": 0.0501, "step": 41500 }, { "epoch": 0.3091213078209177, "grad_norm": 0.2359079122543335, "learning_rate": 4.449689470143767e-05, "loss": 0.0597, "step": 41600 }, { "epoch": 0.3098643878877949, "grad_norm": 0.4020438492298126, "learning_rate": 4.4477339924596786e-05, "loss": 0.0568, "step": 41700 }, { "epoch": 0.3106074679546721, "grad_norm": 1.669670581817627, "learning_rate": 4.44577851477559e-05, "loss": 0.0603, "step": 41800 }, { "epoch": 0.3113505480215493, "grad_norm": 0.15590086579322815, "learning_rate": 4.443823037091501e-05, "loss": 0.0504, "step": 41900 }, { "epoch": 0.31209362808842656, "grad_norm": 0.6613614559173584, "learning_rate": 4.441867559407412e-05, "loss": 0.0564, "step": 42000 }, { "epoch": 0.31283670815530373, "grad_norm": 0.5579034686088562, "learning_rate": 4.439912081723324e-05, "loss": 0.0613, "step": 42100 }, { "epoch": 0.31357978822218097, "grad_norm": 0.29809021949768066, "learning_rate": 4.437956604039235e-05, "loss": 0.0492, "step": 42200 }, { "epoch": 0.31432286828905814, "grad_norm": 0.37852251529693604, "learning_rate": 4.436001126355146e-05, "loss": 0.0486, "step": 42300 }, { "epoch": 0.3150659483559354, "grad_norm": 1.0866259336471558, "learning_rate": 4.434045648671057e-05, "loss": 0.0445, "step": 42400 }, { "epoch": 0.31580902842281255, "grad_norm": 1.1650820970535278, "learning_rate": 4.432090170986969e-05, "loss": 0.0568, "step": 42500 }, { "epoch": 0.3165521084896898, "grad_norm": 0.7487064003944397, "learning_rate": 4.43013469330288e-05, "loss": 0.048, "step": 42600 }, { "epoch": 0.31729518855656696, "grad_norm": 1.9535871744155884, "learning_rate": 4.428179215618792e-05, "loss": 0.0495, "step": 42700 }, { "epoch": 0.3180382686234442, "grad_norm": 1.5060514211654663, "learning_rate": 4.4262237379347025e-05, "loss": 0.0545, "step": 42800 }, { "epoch": 0.3187813486903214, "grad_norm": 0.24299561977386475, "learning_rate": 4.424268260250614e-05, "loss": 0.0589, "step": 42900 }, { "epoch": 0.3195244287571986, "grad_norm": 0.4933392107486725, "learning_rate": 4.4223127825665254e-05, "loss": 0.0472, "step": 43000 }, { "epoch": 0.3202675088240758, "grad_norm": 2.2685630321502686, "learning_rate": 4.420357304882437e-05, "loss": 0.0509, "step": 43100 }, { "epoch": 0.321010588890953, "grad_norm": 0.22440963983535767, "learning_rate": 4.4184018271983484e-05, "loss": 0.0505, "step": 43200 }, { "epoch": 0.3217536689578302, "grad_norm": 0.4035387933254242, "learning_rate": 4.4164463495142595e-05, "loss": 0.0577, "step": 43300 }, { "epoch": 0.32249674902470743, "grad_norm": 0.6078920364379883, "learning_rate": 4.414490871830171e-05, "loss": 0.0496, "step": 43400 }, { "epoch": 0.3232398290915846, "grad_norm": 0.5077763199806213, "learning_rate": 4.4125353941460825e-05, "loss": 0.0567, "step": 43500 }, { "epoch": 0.32398290915846184, "grad_norm": 0.33060282468795776, "learning_rate": 4.4105799164619936e-05, "loss": 0.0482, "step": 43600 }, { "epoch": 0.324725989225339, "grad_norm": 0.4227696657180786, "learning_rate": 4.408624438777905e-05, "loss": 0.0571, "step": 43700 }, { "epoch": 0.32546906929221625, "grad_norm": 0.41266268491744995, "learning_rate": 4.406668961093816e-05, "loss": 0.0681, "step": 43800 }, { "epoch": 0.32621214935909343, "grad_norm": 0.4292449653148651, "learning_rate": 4.404713483409728e-05, "loss": 0.0486, "step": 43900 }, { "epoch": 0.32695522942597066, "grad_norm": 0.2639881670475006, "learning_rate": 4.402758005725639e-05, "loss": 0.0524, "step": 44000 }, { "epoch": 0.32769830949284784, "grad_norm": 0.3627398908138275, "learning_rate": 4.4008025280415507e-05, "loss": 0.0535, "step": 44100 }, { "epoch": 0.32844138955972507, "grad_norm": 0.4508587121963501, "learning_rate": 4.398847050357461e-05, "loss": 0.0555, "step": 44200 }, { "epoch": 0.32918446962660225, "grad_norm": 0.5939691662788391, "learning_rate": 4.396891572673373e-05, "loss": 0.0455, "step": 44300 }, { "epoch": 0.3299275496934795, "grad_norm": 0.31450170278549194, "learning_rate": 4.394936094989284e-05, "loss": 0.0509, "step": 44400 }, { "epoch": 0.33067062976035666, "grad_norm": 0.5345723628997803, "learning_rate": 4.392980617305196e-05, "loss": 0.0494, "step": 44500 }, { "epoch": 0.3314137098272339, "grad_norm": 0.5802448391914368, "learning_rate": 4.3910251396211063e-05, "loss": 0.0592, "step": 44600 }, { "epoch": 0.33215678989411107, "grad_norm": 0.2524789273738861, "learning_rate": 4.389069661937018e-05, "loss": 0.0583, "step": 44700 }, { "epoch": 0.3328998699609883, "grad_norm": 0.5715010166168213, "learning_rate": 4.387114184252929e-05, "loss": 0.0511, "step": 44800 }, { "epoch": 0.3336429500278655, "grad_norm": 1.1681708097457886, "learning_rate": 4.385158706568841e-05, "loss": 0.0533, "step": 44900 }, { "epoch": 0.3343860300947427, "grad_norm": 0.1855054348707199, "learning_rate": 4.383203228884752e-05, "loss": 0.0564, "step": 45000 }, { "epoch": 0.3351291101616199, "grad_norm": 0.6226951479911804, "learning_rate": 4.3812477512006634e-05, "loss": 0.0422, "step": 45100 }, { "epoch": 0.3358721902284971, "grad_norm": 0.5121153593063354, "learning_rate": 4.3792922735165745e-05, "loss": 0.0527, "step": 45200 }, { "epoch": 0.3366152702953743, "grad_norm": 0.22734688222408295, "learning_rate": 4.3773367958324863e-05, "loss": 0.0483, "step": 45300 }, { "epoch": 0.33735835036225154, "grad_norm": 0.49463608860969543, "learning_rate": 4.3753813181483975e-05, "loss": 0.0474, "step": 45400 }, { "epoch": 0.3381014304291287, "grad_norm": 0.6774492263793945, "learning_rate": 4.3734258404643086e-05, "loss": 0.0671, "step": 45500 }, { "epoch": 0.33884451049600595, "grad_norm": 0.7915254235267639, "learning_rate": 4.37147036278022e-05, "loss": 0.0502, "step": 45600 }, { "epoch": 0.3395875905628831, "grad_norm": 0.8716245889663696, "learning_rate": 4.3695148850961316e-05, "loss": 0.0495, "step": 45700 }, { "epoch": 0.34033067062976036, "grad_norm": 0.5447113513946533, "learning_rate": 4.367559407412043e-05, "loss": 0.0506, "step": 45800 }, { "epoch": 0.3410737506966376, "grad_norm": 0.43864545226097107, "learning_rate": 4.3656039297279545e-05, "loss": 0.054, "step": 45900 }, { "epoch": 0.34181683076351477, "grad_norm": 0.45178645849227905, "learning_rate": 4.363648452043865e-05, "loss": 0.053, "step": 46000 }, { "epoch": 0.342559910830392, "grad_norm": 0.3202279508113861, "learning_rate": 4.361692974359777e-05, "loss": 0.0442, "step": 46100 }, { "epoch": 0.3433029908972692, "grad_norm": 0.40651416778564453, "learning_rate": 4.359737496675688e-05, "loss": 0.0482, "step": 46200 }, { "epoch": 0.3440460709641464, "grad_norm": 0.35877129435539246, "learning_rate": 4.3577820189916e-05, "loss": 0.0507, "step": 46300 }, { "epoch": 0.3447891510310236, "grad_norm": 0.36411964893341064, "learning_rate": 4.355826541307511e-05, "loss": 0.0487, "step": 46400 }, { "epoch": 0.3455322310979008, "grad_norm": 0.5476612448692322, "learning_rate": 4.353871063623422e-05, "loss": 0.0643, "step": 46500 }, { "epoch": 0.346275311164778, "grad_norm": 2.352745532989502, "learning_rate": 4.351915585939333e-05, "loss": 0.0544, "step": 46600 }, { "epoch": 0.34701839123165523, "grad_norm": 1.0919727087020874, "learning_rate": 4.349960108255245e-05, "loss": 0.0576, "step": 46700 }, { "epoch": 0.3477614712985324, "grad_norm": 0.11351816356182098, "learning_rate": 4.348004630571156e-05, "loss": 0.0514, "step": 46800 }, { "epoch": 0.34850455136540964, "grad_norm": 0.537338376045227, "learning_rate": 4.346049152887067e-05, "loss": 0.0491, "step": 46900 }, { "epoch": 0.3492476314322868, "grad_norm": 0.8616994023323059, "learning_rate": 4.3440936752029784e-05, "loss": 0.049, "step": 47000 }, { "epoch": 0.34999071149916405, "grad_norm": 1.9518449306488037, "learning_rate": 4.34213819751889e-05, "loss": 0.0556, "step": 47100 }, { "epoch": 0.35073379156604123, "grad_norm": 0.5119336843490601, "learning_rate": 4.3401827198348013e-05, "loss": 0.0528, "step": 47200 }, { "epoch": 0.35147687163291846, "grad_norm": 0.2751792371273041, "learning_rate": 4.3382272421507125e-05, "loss": 0.0572, "step": 47300 }, { "epoch": 0.35221995169979564, "grad_norm": 0.33787059783935547, "learning_rate": 4.336271764466624e-05, "loss": 0.059, "step": 47400 }, { "epoch": 0.3529630317666729, "grad_norm": 0.5348026752471924, "learning_rate": 4.3343162867825354e-05, "loss": 0.0465, "step": 47500 }, { "epoch": 0.35370611183355005, "grad_norm": 0.5484043955802917, "learning_rate": 4.3323608090984466e-05, "loss": 0.0541, "step": 47600 }, { "epoch": 0.3544491919004273, "grad_norm": 0.23299308121204376, "learning_rate": 4.3304053314143584e-05, "loss": 0.0538, "step": 47700 }, { "epoch": 0.35519227196730446, "grad_norm": 0.3517209589481354, "learning_rate": 4.3284498537302695e-05, "loss": 0.0495, "step": 47800 }, { "epoch": 0.3559353520341817, "grad_norm": 0.2922353446483612, "learning_rate": 4.326494376046181e-05, "loss": 0.0556, "step": 47900 }, { "epoch": 0.3566784321010589, "grad_norm": 0.48551109433174133, "learning_rate": 4.324538898362092e-05, "loss": 0.0464, "step": 48000 }, { "epoch": 0.3574215121679361, "grad_norm": 0.4104091227054596, "learning_rate": 4.3225834206780036e-05, "loss": 0.0574, "step": 48100 }, { "epoch": 0.3581645922348133, "grad_norm": 0.6586582660675049, "learning_rate": 4.320627942993915e-05, "loss": 0.0498, "step": 48200 }, { "epoch": 0.3589076723016905, "grad_norm": 0.2507067322731018, "learning_rate": 4.318672465309826e-05, "loss": 0.0477, "step": 48300 }, { "epoch": 0.3596507523685677, "grad_norm": 0.15842106938362122, "learning_rate": 4.316716987625738e-05, "loss": 0.0514, "step": 48400 }, { "epoch": 0.3603938324354449, "grad_norm": 0.3669281005859375, "learning_rate": 4.314761509941649e-05, "loss": 0.0532, "step": 48500 }, { "epoch": 0.3611369125023221, "grad_norm": 0.5353506803512573, "learning_rate": 4.31280603225756e-05, "loss": 0.0555, "step": 48600 }, { "epoch": 0.36187999256919934, "grad_norm": 0.6007232069969177, "learning_rate": 4.310850554573471e-05, "loss": 0.0463, "step": 48700 }, { "epoch": 0.3626230726360765, "grad_norm": 0.3174857795238495, "learning_rate": 4.308895076889383e-05, "loss": 0.044, "step": 48800 }, { "epoch": 0.36336615270295375, "grad_norm": 0.6919599771499634, "learning_rate": 4.306939599205294e-05, "loss": 0.0509, "step": 48900 }, { "epoch": 0.3641092327698309, "grad_norm": 0.537041962146759, "learning_rate": 4.304984121521206e-05, "loss": 0.0512, "step": 49000 }, { "epoch": 0.36485231283670816, "grad_norm": 0.2799455523490906, "learning_rate": 4.303028643837117e-05, "loss": 0.0462, "step": 49100 }, { "epoch": 0.36559539290358534, "grad_norm": 0.5123608708381653, "learning_rate": 4.301073166153028e-05, "loss": 0.0492, "step": 49200 }, { "epoch": 0.36633847297046257, "grad_norm": 0.5680269002914429, "learning_rate": 4.299117688468939e-05, "loss": 0.052, "step": 49300 }, { "epoch": 0.36708155303733975, "grad_norm": 0.33773285150527954, "learning_rate": 4.297162210784851e-05, "loss": 0.045, "step": 49400 }, { "epoch": 0.367824633104217, "grad_norm": 0.5373088717460632, "learning_rate": 4.295206733100762e-05, "loss": 0.0608, "step": 49500 }, { "epoch": 0.36856771317109416, "grad_norm": 0.3244791328907013, "learning_rate": 4.2932512554166734e-05, "loss": 0.0429, "step": 49600 }, { "epoch": 0.3693107932379714, "grad_norm": 0.20065180957317352, "learning_rate": 4.2912957777325845e-05, "loss": 0.0462, "step": 49700 }, { "epoch": 0.3700538733048486, "grad_norm": 0.20999634265899658, "learning_rate": 4.2893403000484964e-05, "loss": 0.0517, "step": 49800 }, { "epoch": 0.3707969533717258, "grad_norm": 0.21268391609191895, "learning_rate": 4.2873848223644075e-05, "loss": 0.043, "step": 49900 }, { "epoch": 0.37154003343860303, "grad_norm": 0.3692857623100281, "learning_rate": 4.285429344680319e-05, "loss": 0.0458, "step": 50000 }, { "epoch": 0.3722831135054802, "grad_norm": 0.47103145718574524, "learning_rate": 4.28347386699623e-05, "loss": 0.0489, "step": 50100 }, { "epoch": 0.37302619357235745, "grad_norm": 0.8009338974952698, "learning_rate": 4.2815183893121416e-05, "loss": 0.0556, "step": 50200 }, { "epoch": 0.3737692736392346, "grad_norm": 0.3066909611225128, "learning_rate": 4.279562911628053e-05, "loss": 0.045, "step": 50300 }, { "epoch": 0.37451235370611186, "grad_norm": 2.738007068634033, "learning_rate": 4.2776074339439645e-05, "loss": 0.0619, "step": 50400 }, { "epoch": 0.37525543377298903, "grad_norm": 0.5047169327735901, "learning_rate": 4.275651956259875e-05, "loss": 0.053, "step": 50500 }, { "epoch": 0.37599851383986627, "grad_norm": 0.3745966851711273, "learning_rate": 4.273696478575787e-05, "loss": 0.0553, "step": 50600 }, { "epoch": 0.37674159390674344, "grad_norm": 0.6611438393592834, "learning_rate": 4.271741000891698e-05, "loss": 0.0486, "step": 50700 }, { "epoch": 0.3774846739736207, "grad_norm": 0.29721882939338684, "learning_rate": 4.26978552320761e-05, "loss": 0.0477, "step": 50800 }, { "epoch": 0.37822775404049785, "grad_norm": 0.6091318130493164, "learning_rate": 4.267830045523521e-05, "loss": 0.054, "step": 50900 }, { "epoch": 0.3789708341073751, "grad_norm": 1.1901607513427734, "learning_rate": 4.265874567839432e-05, "loss": 0.054, "step": 51000 }, { "epoch": 0.37971391417425226, "grad_norm": 0.424010306596756, "learning_rate": 4.263919090155343e-05, "loss": 0.0471, "step": 51100 }, { "epoch": 0.3804569942411295, "grad_norm": 0.41334280371665955, "learning_rate": 4.261963612471255e-05, "loss": 0.0486, "step": 51200 }, { "epoch": 0.3812000743080067, "grad_norm": 0.9591194987297058, "learning_rate": 4.260008134787166e-05, "loss": 0.0483, "step": 51300 }, { "epoch": 0.3819431543748839, "grad_norm": 0.2007627636194229, "learning_rate": 4.258052657103077e-05, "loss": 0.055, "step": 51400 }, { "epoch": 0.3826862344417611, "grad_norm": 0.3082067370414734, "learning_rate": 4.2560971794189884e-05, "loss": 0.0488, "step": 51500 }, { "epoch": 0.3834293145086383, "grad_norm": 0.5362095832824707, "learning_rate": 4.2541417017349e-05, "loss": 0.0474, "step": 51600 }, { "epoch": 0.3841723945755155, "grad_norm": 0.7190250754356384, "learning_rate": 4.2521862240508114e-05, "loss": 0.0477, "step": 51700 }, { "epoch": 0.38491547464239273, "grad_norm": 0.2965118885040283, "learning_rate": 4.250230746366723e-05, "loss": 0.0474, "step": 51800 }, { "epoch": 0.3856585547092699, "grad_norm": 0.38370129466056824, "learning_rate": 4.2482752686826336e-05, "loss": 0.0489, "step": 51900 }, { "epoch": 0.38640163477614714, "grad_norm": 0.2531053125858307, "learning_rate": 4.2463197909985454e-05, "loss": 0.0501, "step": 52000 }, { "epoch": 0.3871447148430243, "grad_norm": 0.42529937624931335, "learning_rate": 4.2443643133144566e-05, "loss": 0.0634, "step": 52100 }, { "epoch": 0.38788779490990155, "grad_norm": 0.4972382187843323, "learning_rate": 4.2424088356303684e-05, "loss": 0.0516, "step": 52200 }, { "epoch": 0.38863087497677873, "grad_norm": 0.3964420258998871, "learning_rate": 4.240453357946279e-05, "loss": 0.0527, "step": 52300 }, { "epoch": 0.38937395504365596, "grad_norm": 0.5604862570762634, "learning_rate": 4.238497880262191e-05, "loss": 0.0523, "step": 52400 }, { "epoch": 0.39011703511053314, "grad_norm": 0.29236412048339844, "learning_rate": 4.236542402578102e-05, "loss": 0.0459, "step": 52500 }, { "epoch": 0.39086011517741037, "grad_norm": 0.5310792922973633, "learning_rate": 4.2345869248940136e-05, "loss": 0.0435, "step": 52600 }, { "epoch": 0.39160319524428755, "grad_norm": 0.21463729441165924, "learning_rate": 4.232631447209925e-05, "loss": 0.0604, "step": 52700 }, { "epoch": 0.3923462753111648, "grad_norm": 0.46253806352615356, "learning_rate": 4.230675969525836e-05, "loss": 0.0463, "step": 52800 }, { "epoch": 0.39308935537804196, "grad_norm": 0.40688246488571167, "learning_rate": 4.228720491841747e-05, "loss": 0.0614, "step": 52900 }, { "epoch": 0.3938324354449192, "grad_norm": 0.27436912059783936, "learning_rate": 4.226765014157659e-05, "loss": 0.0423, "step": 53000 }, { "epoch": 0.39457551551179637, "grad_norm": 0.44446733593940735, "learning_rate": 4.22480953647357e-05, "loss": 0.0496, "step": 53100 }, { "epoch": 0.3953185955786736, "grad_norm": 0.32725200057029724, "learning_rate": 4.222854058789481e-05, "loss": 0.0424, "step": 53200 }, { "epoch": 0.3960616756455508, "grad_norm": 0.2456185668706894, "learning_rate": 4.220898581105392e-05, "loss": 0.0502, "step": 53300 }, { "epoch": 0.396804755712428, "grad_norm": 0.6656317710876465, "learning_rate": 4.218943103421304e-05, "loss": 0.0518, "step": 53400 }, { "epoch": 0.39754783577930525, "grad_norm": 0.6988815069198608, "learning_rate": 4.216987625737215e-05, "loss": 0.0486, "step": 53500 }, { "epoch": 0.3982909158461824, "grad_norm": 0.5668619275093079, "learning_rate": 4.215032148053127e-05, "loss": 0.0465, "step": 53600 }, { "epoch": 0.39903399591305966, "grad_norm": 0.6128275394439697, "learning_rate": 4.2130766703690375e-05, "loss": 0.0562, "step": 53700 }, { "epoch": 0.39977707597993684, "grad_norm": 0.9399946928024292, "learning_rate": 4.211121192684949e-05, "loss": 0.0406, "step": 53800 }, { "epoch": 0.40052015604681407, "grad_norm": 2.378962516784668, "learning_rate": 4.2091657150008604e-05, "loss": 0.0444, "step": 53900 }, { "epoch": 0.40126323611369125, "grad_norm": 0.3574617803096771, "learning_rate": 4.207210237316772e-05, "loss": 0.0469, "step": 54000 }, { "epoch": 0.4020063161805685, "grad_norm": 1.5786116123199463, "learning_rate": 4.205254759632683e-05, "loss": 0.0616, "step": 54100 }, { "epoch": 0.40274939624744566, "grad_norm": 0.17924316227436066, "learning_rate": 4.2032992819485945e-05, "loss": 0.0413, "step": 54200 }, { "epoch": 0.4034924763143229, "grad_norm": 0.4728910028934479, "learning_rate": 4.201343804264506e-05, "loss": 0.0487, "step": 54300 }, { "epoch": 0.40423555638120007, "grad_norm": 0.4187438488006592, "learning_rate": 4.1993883265804175e-05, "loss": 0.0435, "step": 54400 }, { "epoch": 0.4049786364480773, "grad_norm": 2.4100944995880127, "learning_rate": 4.1974328488963286e-05, "loss": 0.0465, "step": 54500 }, { "epoch": 0.4057217165149545, "grad_norm": 0.7984603643417358, "learning_rate": 4.19547737121224e-05, "loss": 0.051, "step": 54600 }, { "epoch": 0.4064647965818317, "grad_norm": 0.13470029830932617, "learning_rate": 4.193521893528151e-05, "loss": 0.0575, "step": 54700 }, { "epoch": 0.4072078766487089, "grad_norm": 1.2021604776382446, "learning_rate": 4.191566415844063e-05, "loss": 0.0488, "step": 54800 }, { "epoch": 0.4079509567155861, "grad_norm": 2.0110795497894287, "learning_rate": 4.189610938159974e-05, "loss": 0.0457, "step": 54900 }, { "epoch": 0.4086940367824633, "grad_norm": 0.26087769865989685, "learning_rate": 4.187655460475885e-05, "loss": 0.0469, "step": 55000 }, { "epoch": 0.40943711684934053, "grad_norm": 0.3563740849494934, "learning_rate": 4.185699982791796e-05, "loss": 0.0505, "step": 55100 }, { "epoch": 0.4101801969162177, "grad_norm": 0.40667274594306946, "learning_rate": 4.183744505107708e-05, "loss": 0.057, "step": 55200 }, { "epoch": 0.41092327698309494, "grad_norm": 0.34398332238197327, "learning_rate": 4.181789027423619e-05, "loss": 0.0543, "step": 55300 }, { "epoch": 0.4116663570499721, "grad_norm": 0.12510478496551514, "learning_rate": 4.179833549739531e-05, "loss": 0.0501, "step": 55400 }, { "epoch": 0.41240943711684935, "grad_norm": 0.5249769687652588, "learning_rate": 4.1778780720554414e-05, "loss": 0.0398, "step": 55500 }, { "epoch": 0.41315251718372653, "grad_norm": 0.31901395320892334, "learning_rate": 4.175922594371353e-05, "loss": 0.054, "step": 55600 }, { "epoch": 0.41389559725060376, "grad_norm": 0.17675960063934326, "learning_rate": 4.173967116687264e-05, "loss": 0.0376, "step": 55700 }, { "epoch": 0.41463867731748094, "grad_norm": 2.6645877361297607, "learning_rate": 4.172011639003176e-05, "loss": 0.0523, "step": 55800 }, { "epoch": 0.4153817573843582, "grad_norm": 0.5366520285606384, "learning_rate": 4.170056161319087e-05, "loss": 0.0458, "step": 55900 }, { "epoch": 0.41612483745123535, "grad_norm": 0.4103834331035614, "learning_rate": 4.1681006836349984e-05, "loss": 0.0443, "step": 56000 }, { "epoch": 0.4168679175181126, "grad_norm": 0.5385734438896179, "learning_rate": 4.1661452059509095e-05, "loss": 0.0573, "step": 56100 }, { "epoch": 0.41761099758498976, "grad_norm": 0.22407326102256775, "learning_rate": 4.1641897282668214e-05, "loss": 0.0547, "step": 56200 }, { "epoch": 0.418354077651867, "grad_norm": 0.5456708669662476, "learning_rate": 4.1622342505827325e-05, "loss": 0.0491, "step": 56300 }, { "epoch": 0.4190971577187442, "grad_norm": 0.25281211733818054, "learning_rate": 4.1602787728986436e-05, "loss": 0.0428, "step": 56400 }, { "epoch": 0.4198402377856214, "grad_norm": 0.21558062732219696, "learning_rate": 4.158323295214555e-05, "loss": 0.0519, "step": 56500 }, { "epoch": 0.4205833178524986, "grad_norm": 0.3431067168712616, "learning_rate": 4.1563678175304666e-05, "loss": 0.0446, "step": 56600 }, { "epoch": 0.4213263979193758, "grad_norm": 0.49375441670417786, "learning_rate": 4.154412339846378e-05, "loss": 0.0644, "step": 56700 }, { "epoch": 0.422069477986253, "grad_norm": 0.31847721338272095, "learning_rate": 4.1524568621622895e-05, "loss": 0.0492, "step": 56800 }, { "epoch": 0.4228125580531302, "grad_norm": 0.3070487976074219, "learning_rate": 4.150501384478201e-05, "loss": 0.0553, "step": 56900 }, { "epoch": 0.4235556381200074, "grad_norm": 0.3735882043838501, "learning_rate": 4.148545906794112e-05, "loss": 0.0565, "step": 57000 }, { "epoch": 0.42429871818688464, "grad_norm": 0.3322221338748932, "learning_rate": 4.146590429110023e-05, "loss": 0.0523, "step": 57100 }, { "epoch": 0.4250417982537618, "grad_norm": 0.12384632229804993, "learning_rate": 4.144634951425935e-05, "loss": 0.0597, "step": 57200 }, { "epoch": 0.42578487832063905, "grad_norm": 0.400459885597229, "learning_rate": 4.142679473741846e-05, "loss": 0.0501, "step": 57300 }, { "epoch": 0.4265279583875163, "grad_norm": 0.19004705548286438, "learning_rate": 4.140723996057757e-05, "loss": 0.0515, "step": 57400 }, { "epoch": 0.42727103845439346, "grad_norm": 1.4170711040496826, "learning_rate": 4.138768518373669e-05, "loss": 0.0702, "step": 57500 }, { "epoch": 0.4280141185212707, "grad_norm": 0.4857993423938751, "learning_rate": 4.13681304068958e-05, "loss": 0.0555, "step": 57600 }, { "epoch": 0.42875719858814787, "grad_norm": 0.27585703134536743, "learning_rate": 4.134857563005491e-05, "loss": 0.0362, "step": 57700 }, { "epoch": 0.4295002786550251, "grad_norm": 0.5156170129776001, "learning_rate": 4.132902085321402e-05, "loss": 0.0387, "step": 57800 }, { "epoch": 0.4302433587219023, "grad_norm": 0.5553490519523621, "learning_rate": 4.130946607637314e-05, "loss": 0.0456, "step": 57900 }, { "epoch": 0.4309864387887795, "grad_norm": 0.8524369597434998, "learning_rate": 4.128991129953225e-05, "loss": 0.0417, "step": 58000 }, { "epoch": 0.4317295188556567, "grad_norm": 0.1602414846420288, "learning_rate": 4.1270356522691364e-05, "loss": 0.0414, "step": 58100 }, { "epoch": 0.4324725989225339, "grad_norm": 0.7190741896629333, "learning_rate": 4.1250801745850475e-05, "loss": 0.0464, "step": 58200 }, { "epoch": 0.4332156789894111, "grad_norm": 0.37583059072494507, "learning_rate": 4.123124696900959e-05, "loss": 0.0551, "step": 58300 }, { "epoch": 0.43395875905628833, "grad_norm": 1.294334888458252, "learning_rate": 4.1211692192168705e-05, "loss": 0.0619, "step": 58400 }, { "epoch": 0.4347018391231655, "grad_norm": 0.9259421229362488, "learning_rate": 4.119213741532782e-05, "loss": 0.0401, "step": 58500 }, { "epoch": 0.43544491919004275, "grad_norm": 0.1912134885787964, "learning_rate": 4.1172582638486934e-05, "loss": 0.0508, "step": 58600 }, { "epoch": 0.4361879992569199, "grad_norm": 0.44470953941345215, "learning_rate": 4.1153027861646045e-05, "loss": 0.0487, "step": 58700 }, { "epoch": 0.43693107932379716, "grad_norm": 0.4522186517715454, "learning_rate": 4.113347308480516e-05, "loss": 0.0413, "step": 58800 }, { "epoch": 0.43767415939067433, "grad_norm": 0.41382989287376404, "learning_rate": 4.1113918307964275e-05, "loss": 0.0593, "step": 58900 }, { "epoch": 0.43841723945755157, "grad_norm": 0.3119601011276245, "learning_rate": 4.1094363531123386e-05, "loss": 0.0507, "step": 59000 }, { "epoch": 0.43916031952442874, "grad_norm": 0.4759248197078705, "learning_rate": 4.10748087542825e-05, "loss": 0.0499, "step": 59100 }, { "epoch": 0.439903399591306, "grad_norm": 1.6466630697250366, "learning_rate": 4.105525397744161e-05, "loss": 0.0547, "step": 59200 }, { "epoch": 0.44064647965818315, "grad_norm": 0.10137461125850677, "learning_rate": 4.103569920060073e-05, "loss": 0.0389, "step": 59300 }, { "epoch": 0.4413895597250604, "grad_norm": 0.3006949722766876, "learning_rate": 4.101614442375984e-05, "loss": 0.0448, "step": 59400 }, { "epoch": 0.44213263979193757, "grad_norm": 0.16074690222740173, "learning_rate": 4.099658964691896e-05, "loss": 0.0572, "step": 59500 }, { "epoch": 0.4428757198588148, "grad_norm": 0.315758615732193, "learning_rate": 4.097703487007806e-05, "loss": 0.043, "step": 59600 }, { "epoch": 0.443618799925692, "grad_norm": 0.4650103747844696, "learning_rate": 4.095748009323718e-05, "loss": 0.0533, "step": 59700 }, { "epoch": 0.4443618799925692, "grad_norm": 0.40119385719299316, "learning_rate": 4.093792531639629e-05, "loss": 0.0531, "step": 59800 }, { "epoch": 0.4451049600594464, "grad_norm": 0.335913747549057, "learning_rate": 4.091837053955541e-05, "loss": 0.0494, "step": 59900 }, { "epoch": 0.4458480401263236, "grad_norm": 0.1338331401348114, "learning_rate": 4.0898815762714514e-05, "loss": 0.0517, "step": 60000 }, { "epoch": 0.4465911201932008, "grad_norm": 1.488953709602356, "learning_rate": 4.087926098587363e-05, "loss": 0.0483, "step": 60100 }, { "epoch": 0.44733420026007803, "grad_norm": 0.5532567501068115, "learning_rate": 4.085970620903274e-05, "loss": 0.0411, "step": 60200 }, { "epoch": 0.4480772803269552, "grad_norm": 2.11478328704834, "learning_rate": 4.084015143219186e-05, "loss": 0.0487, "step": 60300 }, { "epoch": 0.44882036039383244, "grad_norm": 0.3136877715587616, "learning_rate": 4.082059665535097e-05, "loss": 0.0452, "step": 60400 }, { "epoch": 0.4495634404607096, "grad_norm": 0.5356770753860474, "learning_rate": 4.0801041878510084e-05, "loss": 0.0523, "step": 60500 }, { "epoch": 0.45030652052758685, "grad_norm": 0.6519300937652588, "learning_rate": 4.0781487101669195e-05, "loss": 0.0439, "step": 60600 }, { "epoch": 0.45104960059446403, "grad_norm": 0.24728168547153473, "learning_rate": 4.0761932324828314e-05, "loss": 0.053, "step": 60700 }, { "epoch": 0.45179268066134126, "grad_norm": 0.40874984860420227, "learning_rate": 4.0742377547987425e-05, "loss": 0.0542, "step": 60800 }, { "epoch": 0.45253576072821844, "grad_norm": 0.2637019753456116, "learning_rate": 4.0722822771146536e-05, "loss": 0.057, "step": 60900 }, { "epoch": 0.45327884079509567, "grad_norm": 1.4464181661605835, "learning_rate": 4.070326799430565e-05, "loss": 0.0486, "step": 61000 }, { "epoch": 0.4540219208619729, "grad_norm": 0.346955806016922, "learning_rate": 4.0683713217464766e-05, "loss": 0.0505, "step": 61100 }, { "epoch": 0.4547650009288501, "grad_norm": 0.5490242838859558, "learning_rate": 4.066415844062388e-05, "loss": 0.0516, "step": 61200 }, { "epoch": 0.4555080809957273, "grad_norm": 0.6603181958198547, "learning_rate": 4.0644603663782995e-05, "loss": 0.0474, "step": 61300 }, { "epoch": 0.4562511610626045, "grad_norm": 0.5088388919830322, "learning_rate": 4.06250488869421e-05, "loss": 0.0563, "step": 61400 }, { "epoch": 0.4569942411294817, "grad_norm": 0.7016160488128662, "learning_rate": 4.060549411010122e-05, "loss": 0.0421, "step": 61500 }, { "epoch": 0.4577373211963589, "grad_norm": 0.2689754366874695, "learning_rate": 4.058593933326033e-05, "loss": 0.0501, "step": 61600 }, { "epoch": 0.45848040126323614, "grad_norm": 0.4317912459373474, "learning_rate": 4.056638455641945e-05, "loss": 0.0558, "step": 61700 }, { "epoch": 0.4592234813301133, "grad_norm": 0.5949551463127136, "learning_rate": 4.054682977957856e-05, "loss": 0.0549, "step": 61800 }, { "epoch": 0.45996656139699055, "grad_norm": 0.5264861583709717, "learning_rate": 4.052727500273767e-05, "loss": 0.0536, "step": 61900 }, { "epoch": 0.4607096414638677, "grad_norm": 0.6791458129882812, "learning_rate": 4.050772022589678e-05, "loss": 0.0468, "step": 62000 }, { "epoch": 0.46145272153074496, "grad_norm": 1.923580527305603, "learning_rate": 4.04881654490559e-05, "loss": 0.0513, "step": 62100 }, { "epoch": 0.46219580159762214, "grad_norm": 0.6379066109657288, "learning_rate": 4.046861067221501e-05, "loss": 0.0437, "step": 62200 }, { "epoch": 0.46293888166449937, "grad_norm": 0.27543535828590393, "learning_rate": 4.044905589537412e-05, "loss": 0.0492, "step": 62300 }, { "epoch": 0.46368196173137655, "grad_norm": 0.30594921112060547, "learning_rate": 4.0429501118533234e-05, "loss": 0.0437, "step": 62400 }, { "epoch": 0.4644250417982538, "grad_norm": 0.25598981976509094, "learning_rate": 4.040994634169235e-05, "loss": 0.0473, "step": 62500 }, { "epoch": 0.46516812186513096, "grad_norm": 0.26071321964263916, "learning_rate": 4.0390391564851464e-05, "loss": 0.0477, "step": 62600 }, { "epoch": 0.4659112019320082, "grad_norm": 0.16913869976997375, "learning_rate": 4.037083678801058e-05, "loss": 0.05, "step": 62700 }, { "epoch": 0.46665428199888537, "grad_norm": 0.3094201683998108, "learning_rate": 4.0351282011169686e-05, "loss": 0.0462, "step": 62800 }, { "epoch": 0.4673973620657626, "grad_norm": 0.5129991173744202, "learning_rate": 4.0331727234328805e-05, "loss": 0.0593, "step": 62900 }, { "epoch": 0.4681404421326398, "grad_norm": 2.5883710384368896, "learning_rate": 4.0312172457487916e-05, "loss": 0.0452, "step": 63000 }, { "epoch": 0.468883522199517, "grad_norm": 1.143601417541504, "learning_rate": 4.0292617680647034e-05, "loss": 0.0494, "step": 63100 }, { "epoch": 0.4696266022663942, "grad_norm": 0.4917498826980591, "learning_rate": 4.027306290380614e-05, "loss": 0.0449, "step": 63200 }, { "epoch": 0.4703696823332714, "grad_norm": 0.5166087746620178, "learning_rate": 4.025350812696526e-05, "loss": 0.0549, "step": 63300 }, { "epoch": 0.4711127624001486, "grad_norm": 0.2395673245191574, "learning_rate": 4.023395335012437e-05, "loss": 0.0414, "step": 63400 }, { "epoch": 0.47185584246702583, "grad_norm": 0.29241639375686646, "learning_rate": 4.0214398573283486e-05, "loss": 0.0437, "step": 63500 }, { "epoch": 0.472598922533903, "grad_norm": 0.7502799034118652, "learning_rate": 4.01948437964426e-05, "loss": 0.0417, "step": 63600 }, { "epoch": 0.47334200260078024, "grad_norm": 0.557222843170166, "learning_rate": 4.017528901960171e-05, "loss": 0.0497, "step": 63700 }, { "epoch": 0.4740850826676574, "grad_norm": 0.4230397939682007, "learning_rate": 4.015573424276082e-05, "loss": 0.0397, "step": 63800 }, { "epoch": 0.47482816273453465, "grad_norm": 0.11751323938369751, "learning_rate": 4.013617946591994e-05, "loss": 0.0429, "step": 63900 }, { "epoch": 0.47557124280141183, "grad_norm": 1.3228096961975098, "learning_rate": 4.011662468907905e-05, "loss": 0.0541, "step": 64000 }, { "epoch": 0.47631432286828906, "grad_norm": 0.5340604782104492, "learning_rate": 4.009706991223816e-05, "loss": 0.0531, "step": 64100 }, { "epoch": 0.47705740293516624, "grad_norm": 0.5906882286071777, "learning_rate": 4.007751513539727e-05, "loss": 0.0526, "step": 64200 }, { "epoch": 0.4778004830020435, "grad_norm": 0.3976479768753052, "learning_rate": 4.005796035855639e-05, "loss": 0.0485, "step": 64300 }, { "epoch": 0.47854356306892065, "grad_norm": 1.0794645547866821, "learning_rate": 4.00384055817155e-05, "loss": 0.0472, "step": 64400 }, { "epoch": 0.4792866431357979, "grad_norm": 0.5931153893470764, "learning_rate": 4.001885080487462e-05, "loss": 0.0503, "step": 64500 }, { "epoch": 0.48002972320267506, "grad_norm": 0.5998044610023499, "learning_rate": 3.9999296028033725e-05, "loss": 0.0486, "step": 64600 }, { "epoch": 0.4807728032695523, "grad_norm": 0.5339343547821045, "learning_rate": 3.997974125119284e-05, "loss": 0.0449, "step": 64700 }, { "epoch": 0.4815158833364295, "grad_norm": 1.3431754112243652, "learning_rate": 3.9960186474351955e-05, "loss": 0.0477, "step": 64800 }, { "epoch": 0.4822589634033067, "grad_norm": 0.37152108550071716, "learning_rate": 3.994063169751107e-05, "loss": 0.0457, "step": 64900 }, { "epoch": 0.48300204347018394, "grad_norm": 0.33666571974754333, "learning_rate": 3.992107692067018e-05, "loss": 0.0387, "step": 65000 }, { "epoch": 0.4837451235370611, "grad_norm": 0.3073904514312744, "learning_rate": 3.9901522143829296e-05, "loss": 0.0487, "step": 65100 }, { "epoch": 0.48448820360393835, "grad_norm": 1.1334384679794312, "learning_rate": 3.988196736698841e-05, "loss": 0.0438, "step": 65200 }, { "epoch": 0.4852312836708155, "grad_norm": 0.2836994230747223, "learning_rate": 3.9862412590147525e-05, "loss": 0.0567, "step": 65300 }, { "epoch": 0.48597436373769276, "grad_norm": 0.8312345743179321, "learning_rate": 3.9842857813306636e-05, "loss": 0.053, "step": 65400 }, { "epoch": 0.48671744380456994, "grad_norm": 0.4401642680168152, "learning_rate": 3.982330303646575e-05, "loss": 0.0555, "step": 65500 }, { "epoch": 0.48746052387144717, "grad_norm": 2.980755090713501, "learning_rate": 3.980374825962486e-05, "loss": 0.0503, "step": 65600 }, { "epoch": 0.48820360393832435, "grad_norm": 0.9461953639984131, "learning_rate": 3.978419348278398e-05, "loss": 0.0483, "step": 65700 }, { "epoch": 0.4889466840052016, "grad_norm": 0.4320688545703888, "learning_rate": 3.976463870594309e-05, "loss": 0.0487, "step": 65800 }, { "epoch": 0.48968976407207876, "grad_norm": 0.13726454973220825, "learning_rate": 3.97450839291022e-05, "loss": 0.0393, "step": 65900 }, { "epoch": 0.490432844138956, "grad_norm": 0.3220929801464081, "learning_rate": 3.972552915226132e-05, "loss": 0.0538, "step": 66000 }, { "epoch": 0.49117592420583317, "grad_norm": 0.5036290884017944, "learning_rate": 3.970597437542043e-05, "loss": 0.0489, "step": 66100 }, { "epoch": 0.4919190042727104, "grad_norm": 2.3884825706481934, "learning_rate": 3.968641959857954e-05, "loss": 0.0477, "step": 66200 }, { "epoch": 0.4926620843395876, "grad_norm": 0.39560332894325256, "learning_rate": 3.966686482173866e-05, "loss": 0.0395, "step": 66300 }, { "epoch": 0.4934051644064648, "grad_norm": 0.31281840801239014, "learning_rate": 3.964731004489777e-05, "loss": 0.0465, "step": 66400 }, { "epoch": 0.494148244473342, "grad_norm": 0.2070046216249466, "learning_rate": 3.962775526805688e-05, "loss": 0.0635, "step": 66500 }, { "epoch": 0.4948913245402192, "grad_norm": 1.634631633758545, "learning_rate": 3.960820049121599e-05, "loss": 0.0504, "step": 66600 }, { "epoch": 0.4956344046070964, "grad_norm": 0.8710480332374573, "learning_rate": 3.958864571437511e-05, "loss": 0.0406, "step": 66700 }, { "epoch": 0.49637748467397363, "grad_norm": 0.9337303042411804, "learning_rate": 3.956909093753422e-05, "loss": 0.0428, "step": 66800 }, { "epoch": 0.4971205647408508, "grad_norm": 0.7190823554992676, "learning_rate": 3.9549536160693334e-05, "loss": 0.0483, "step": 66900 }, { "epoch": 0.49786364480772805, "grad_norm": 0.1426297128200531, "learning_rate": 3.952998138385245e-05, "loss": 0.0449, "step": 67000 }, { "epoch": 0.4986067248746052, "grad_norm": 0.6703498363494873, "learning_rate": 3.9510426607011564e-05, "loss": 0.0419, "step": 67100 }, { "epoch": 0.49934980494148246, "grad_norm": 0.3443589210510254, "learning_rate": 3.9490871830170675e-05, "loss": 0.0472, "step": 67200 }, { "epoch": 0.5000928850083597, "grad_norm": 0.7301865816116333, "learning_rate": 3.9471317053329786e-05, "loss": 0.0551, "step": 67300 }, { "epoch": 0.5008359650752369, "grad_norm": 1.6140941381454468, "learning_rate": 3.9451762276488905e-05, "loss": 0.05, "step": 67400 }, { "epoch": 0.501579045142114, "grad_norm": 0.37013405561447144, "learning_rate": 3.9432207499648016e-05, "loss": 0.0433, "step": 67500 }, { "epoch": 0.5023221252089912, "grad_norm": 2.0560615062713623, "learning_rate": 3.941265272280713e-05, "loss": 0.0547, "step": 67600 }, { "epoch": 0.5030652052758685, "grad_norm": 0.2093905359506607, "learning_rate": 3.9393097945966246e-05, "loss": 0.0485, "step": 67700 }, { "epoch": 0.5038082853427457, "grad_norm": 0.8874582648277283, "learning_rate": 3.937354316912536e-05, "loss": 0.0518, "step": 67800 }, { "epoch": 0.5045513654096229, "grad_norm": 1.3039653301239014, "learning_rate": 3.935398839228447e-05, "loss": 0.049, "step": 67900 }, { "epoch": 0.5052944454765, "grad_norm": 0.5506293177604675, "learning_rate": 3.9334433615443586e-05, "loss": 0.0443, "step": 68000 }, { "epoch": 0.5060375255433773, "grad_norm": 0.650161862373352, "learning_rate": 3.93148788386027e-05, "loss": 0.0387, "step": 68100 }, { "epoch": 0.5067806056102545, "grad_norm": 0.5854061245918274, "learning_rate": 3.929532406176181e-05, "loss": 0.0549, "step": 68200 }, { "epoch": 0.5075236856771317, "grad_norm": 0.5551206469535828, "learning_rate": 3.927576928492092e-05, "loss": 0.0492, "step": 68300 }, { "epoch": 0.5082667657440089, "grad_norm": 0.2527898848056793, "learning_rate": 3.925621450808004e-05, "loss": 0.0491, "step": 68400 }, { "epoch": 0.5090098458108862, "grad_norm": 0.1446671336889267, "learning_rate": 3.923665973123915e-05, "loss": 0.0468, "step": 68500 }, { "epoch": 0.5097529258777633, "grad_norm": 0.1547258198261261, "learning_rate": 3.921710495439827e-05, "loss": 0.0526, "step": 68600 }, { "epoch": 0.5104960059446405, "grad_norm": 0.6983484029769897, "learning_rate": 3.919755017755737e-05, "loss": 0.0512, "step": 68700 }, { "epoch": 0.5112390860115178, "grad_norm": 0.48527413606643677, "learning_rate": 3.917799540071649e-05, "loss": 0.0491, "step": 68800 }, { "epoch": 0.511982166078395, "grad_norm": 4.668185710906982, "learning_rate": 3.91584406238756e-05, "loss": 0.0466, "step": 68900 }, { "epoch": 0.5127252461452722, "grad_norm": 0.5454621911048889, "learning_rate": 3.913888584703472e-05, "loss": 0.0457, "step": 69000 }, { "epoch": 0.5134683262121493, "grad_norm": 0.29545125365257263, "learning_rate": 3.9119331070193825e-05, "loss": 0.0443, "step": 69100 }, { "epoch": 0.5142114062790266, "grad_norm": 0.5727939009666443, "learning_rate": 3.909977629335294e-05, "loss": 0.0383, "step": 69200 }, { "epoch": 0.5149544863459038, "grad_norm": 0.5184394121170044, "learning_rate": 3.9080221516512055e-05, "loss": 0.0434, "step": 69300 }, { "epoch": 0.515697566412781, "grad_norm": 0.20744960010051727, "learning_rate": 3.906066673967117e-05, "loss": 0.0572, "step": 69400 }, { "epoch": 0.5164406464796581, "grad_norm": 1.5046900510787964, "learning_rate": 3.9041111962830284e-05, "loss": 0.0593, "step": 69500 }, { "epoch": 0.5171837265465354, "grad_norm": 1.5338259935379028, "learning_rate": 3.9021557185989396e-05, "loss": 0.0504, "step": 69600 }, { "epoch": 0.5179268066134126, "grad_norm": 0.6871222257614136, "learning_rate": 3.900200240914851e-05, "loss": 0.0427, "step": 69700 }, { "epoch": 0.5186698866802898, "grad_norm": 0.20519977807998657, "learning_rate": 3.8982447632307625e-05, "loss": 0.0422, "step": 69800 }, { "epoch": 0.519412966747167, "grad_norm": 0.31508350372314453, "learning_rate": 3.8962892855466736e-05, "loss": 0.0473, "step": 69900 }, { "epoch": 0.5201560468140443, "grad_norm": 0.8541378974914551, "learning_rate": 3.894333807862585e-05, "loss": 0.0459, "step": 70000 }, { "epoch": 0.5208991268809214, "grad_norm": 0.688602089881897, "learning_rate": 3.892378330178496e-05, "loss": 0.0556, "step": 70100 }, { "epoch": 0.5216422069477986, "grad_norm": 1.8429560661315918, "learning_rate": 3.890422852494408e-05, "loss": 0.0437, "step": 70200 }, { "epoch": 0.5223852870146758, "grad_norm": 0.9722315073013306, "learning_rate": 3.888467374810319e-05, "loss": 0.0493, "step": 70300 }, { "epoch": 0.5231283670815531, "grad_norm": 0.4588809907436371, "learning_rate": 3.886511897126231e-05, "loss": 0.0532, "step": 70400 }, { "epoch": 0.5238714471484303, "grad_norm": 0.5589662194252014, "learning_rate": 3.884556419442141e-05, "loss": 0.0499, "step": 70500 }, { "epoch": 0.5246145272153074, "grad_norm": 3.347090482711792, "learning_rate": 3.882600941758053e-05, "loss": 0.0475, "step": 70600 }, { "epoch": 0.5253576072821846, "grad_norm": 0.627650797367096, "learning_rate": 3.880645464073964e-05, "loss": 0.0379, "step": 70700 }, { "epoch": 0.5261006873490619, "grad_norm": 0.4859529137611389, "learning_rate": 3.878689986389876e-05, "loss": 0.0523, "step": 70800 }, { "epoch": 0.5268437674159391, "grad_norm": 0.8911608457565308, "learning_rate": 3.8767345087057864e-05, "loss": 0.0396, "step": 70900 }, { "epoch": 0.5275868474828163, "grad_norm": 0.7122796177864075, "learning_rate": 3.874779031021698e-05, "loss": 0.0496, "step": 71000 }, { "epoch": 0.5283299275496934, "grad_norm": 0.5159083008766174, "learning_rate": 3.872823553337609e-05, "loss": 0.046, "step": 71100 }, { "epoch": 0.5290730076165707, "grad_norm": 0.31705939769744873, "learning_rate": 3.870868075653521e-05, "loss": 0.0472, "step": 71200 }, { "epoch": 0.5298160876834479, "grad_norm": 0.266495943069458, "learning_rate": 3.868912597969432e-05, "loss": 0.0536, "step": 71300 }, { "epoch": 0.5305591677503251, "grad_norm": 0.5142518877983093, "learning_rate": 3.8669571202853434e-05, "loss": 0.0455, "step": 71400 }, { "epoch": 0.5313022478172023, "grad_norm": 0.571115255355835, "learning_rate": 3.8650016426012546e-05, "loss": 0.0443, "step": 71500 }, { "epoch": 0.5320453278840795, "grad_norm": 1.3116741180419922, "learning_rate": 3.8630461649171664e-05, "loss": 0.0414, "step": 71600 }, { "epoch": 0.5327884079509567, "grad_norm": 4.404228687286377, "learning_rate": 3.8610906872330775e-05, "loss": 0.0452, "step": 71700 }, { "epoch": 0.5335314880178339, "grad_norm": 0.4464043378829956, "learning_rate": 3.8591352095489886e-05, "loss": 0.0551, "step": 71800 }, { "epoch": 0.5342745680847111, "grad_norm": 0.44871872663497925, "learning_rate": 3.8571797318649e-05, "loss": 0.0446, "step": 71900 }, { "epoch": 0.5350176481515884, "grad_norm": 0.19444753229618073, "learning_rate": 3.8552242541808116e-05, "loss": 0.0447, "step": 72000 }, { "epoch": 0.5357607282184655, "grad_norm": 0.3987314999103546, "learning_rate": 3.853268776496723e-05, "loss": 0.0443, "step": 72100 }, { "epoch": 0.5365038082853427, "grad_norm": 3.8910162448883057, "learning_rate": 3.8513132988126346e-05, "loss": 0.0385, "step": 72200 }, { "epoch": 0.5372468883522199, "grad_norm": 0.44960686564445496, "learning_rate": 3.849357821128545e-05, "loss": 0.0414, "step": 72300 }, { "epoch": 0.5379899684190972, "grad_norm": 0.5040250420570374, "learning_rate": 3.847402343444457e-05, "loss": 0.0624, "step": 72400 }, { "epoch": 0.5387330484859744, "grad_norm": 0.41746217012405396, "learning_rate": 3.845446865760368e-05, "loss": 0.059, "step": 72500 }, { "epoch": 0.5394761285528515, "grad_norm": 0.4056994915008545, "learning_rate": 3.84349138807628e-05, "loss": 0.0517, "step": 72600 }, { "epoch": 0.5402192086197288, "grad_norm": 0.18131037056446075, "learning_rate": 3.84153591039219e-05, "loss": 0.0397, "step": 72700 }, { "epoch": 0.540962288686606, "grad_norm": 0.1910189837217331, "learning_rate": 3.839580432708102e-05, "loss": 0.0452, "step": 72800 }, { "epoch": 0.5417053687534832, "grad_norm": 0.5492351651191711, "learning_rate": 3.837624955024013e-05, "loss": 0.0504, "step": 72900 }, { "epoch": 0.5424484488203604, "grad_norm": 0.29312145709991455, "learning_rate": 3.835669477339925e-05, "loss": 0.0384, "step": 73000 }, { "epoch": 0.5431915288872377, "grad_norm": 0.37678098678588867, "learning_rate": 3.833713999655836e-05, "loss": 0.0537, "step": 73100 }, { "epoch": 0.5439346089541148, "grad_norm": 0.44250813126564026, "learning_rate": 3.831758521971747e-05, "loss": 0.0421, "step": 73200 }, { "epoch": 0.544677689020992, "grad_norm": 0.6379761099815369, "learning_rate": 3.8298030442876584e-05, "loss": 0.0503, "step": 73300 }, { "epoch": 0.5454207690878692, "grad_norm": 0.2446812093257904, "learning_rate": 3.82784756660357e-05, "loss": 0.0464, "step": 73400 }, { "epoch": 0.5461638491547465, "grad_norm": 0.6237298250198364, "learning_rate": 3.8258920889194814e-05, "loss": 0.052, "step": 73500 }, { "epoch": 0.5469069292216236, "grad_norm": 0.11565306782722473, "learning_rate": 3.823936611235393e-05, "loss": 0.0408, "step": 73600 }, { "epoch": 0.5476500092885008, "grad_norm": 0.40984010696411133, "learning_rate": 3.8219811335513037e-05, "loss": 0.0453, "step": 73700 }, { "epoch": 0.548393089355378, "grad_norm": 0.75331050157547, "learning_rate": 3.8200256558672155e-05, "loss": 0.0378, "step": 73800 }, { "epoch": 0.5491361694222553, "grad_norm": 0.668346643447876, "learning_rate": 3.8180701781831266e-05, "loss": 0.0472, "step": 73900 }, { "epoch": 0.5498792494891325, "grad_norm": 0.3825382888317108, "learning_rate": 3.8161147004990384e-05, "loss": 0.048, "step": 74000 }, { "epoch": 0.5506223295560096, "grad_norm": 0.3120979368686676, "learning_rate": 3.814159222814949e-05, "loss": 0.0456, "step": 74100 }, { "epoch": 0.5513654096228868, "grad_norm": 2.0016062259674072, "learning_rate": 3.812203745130861e-05, "loss": 0.0533, "step": 74200 }, { "epoch": 0.5521084896897641, "grad_norm": 0.6304114460945129, "learning_rate": 3.810248267446772e-05, "loss": 0.0499, "step": 74300 }, { "epoch": 0.5528515697566413, "grad_norm": 0.1702193021774292, "learning_rate": 3.8082927897626836e-05, "loss": 0.0508, "step": 74400 }, { "epoch": 0.5535946498235185, "grad_norm": 0.4090425670146942, "learning_rate": 3.806337312078595e-05, "loss": 0.0468, "step": 74500 }, { "epoch": 0.5543377298903956, "grad_norm": 0.440932959318161, "learning_rate": 3.804381834394506e-05, "loss": 0.0462, "step": 74600 }, { "epoch": 0.5550808099572729, "grad_norm": 0.17545293271541595, "learning_rate": 3.802426356710417e-05, "loss": 0.0532, "step": 74700 }, { "epoch": 0.5558238900241501, "grad_norm": 0.3638128340244293, "learning_rate": 3.800470879026329e-05, "loss": 0.048, "step": 74800 }, { "epoch": 0.5565669700910273, "grad_norm": 0.4932428002357483, "learning_rate": 3.79851540134224e-05, "loss": 0.0364, "step": 74900 }, { "epoch": 0.5573100501579045, "grad_norm": 0.8359973430633545, "learning_rate": 3.796559923658151e-05, "loss": 0.0451, "step": 75000 }, { "epoch": 0.5580531302247818, "grad_norm": 1.6820683479309082, "learning_rate": 3.794604445974062e-05, "loss": 0.0482, "step": 75100 }, { "epoch": 0.5587962102916589, "grad_norm": 0.5544984936714172, "learning_rate": 3.792648968289974e-05, "loss": 0.0569, "step": 75200 }, { "epoch": 0.5595392903585361, "grad_norm": 0.3740028142929077, "learning_rate": 3.790693490605885e-05, "loss": 0.0544, "step": 75300 }, { "epoch": 0.5602823704254133, "grad_norm": 0.2644384205341339, "learning_rate": 3.788738012921797e-05, "loss": 0.0546, "step": 75400 }, { "epoch": 0.5610254504922906, "grad_norm": 0.4032941460609436, "learning_rate": 3.786782535237708e-05, "loss": 0.0481, "step": 75500 }, { "epoch": 0.5617685305591678, "grad_norm": 1.3598161935806274, "learning_rate": 3.784827057553619e-05, "loss": 0.0528, "step": 75600 }, { "epoch": 0.5625116106260449, "grad_norm": 2.9403254985809326, "learning_rate": 3.7828715798695305e-05, "loss": 0.0508, "step": 75700 }, { "epoch": 0.5632546906929221, "grad_norm": 1.8048450946807861, "learning_rate": 3.780916102185442e-05, "loss": 0.048, "step": 75800 }, { "epoch": 0.5639977707597994, "grad_norm": 0.49516135454177856, "learning_rate": 3.7789606245013534e-05, "loss": 0.0443, "step": 75900 }, { "epoch": 0.5647408508266766, "grad_norm": 0.6021585464477539, "learning_rate": 3.7770051468172646e-05, "loss": 0.0487, "step": 76000 }, { "epoch": 0.5654839308935538, "grad_norm": 0.25116273760795593, "learning_rate": 3.775049669133176e-05, "loss": 0.0487, "step": 76100 }, { "epoch": 0.5662270109604309, "grad_norm": 0.6313378810882568, "learning_rate": 3.7730941914490875e-05, "loss": 0.0481, "step": 76200 }, { "epoch": 0.5669700910273082, "grad_norm": 0.373523473739624, "learning_rate": 3.7711387137649987e-05, "loss": 0.0456, "step": 76300 }, { "epoch": 0.5677131710941854, "grad_norm": 3.0069401264190674, "learning_rate": 3.76918323608091e-05, "loss": 0.0479, "step": 76400 }, { "epoch": 0.5684562511610626, "grad_norm": 1.0755634307861328, "learning_rate": 3.7672277583968216e-05, "loss": 0.0427, "step": 76500 }, { "epoch": 0.5691993312279399, "grad_norm": 0.6286352872848511, "learning_rate": 3.765272280712733e-05, "loss": 0.0533, "step": 76600 }, { "epoch": 0.569942411294817, "grad_norm": 0.6744710803031921, "learning_rate": 3.763316803028644e-05, "loss": 0.0532, "step": 76700 }, { "epoch": 0.5706854913616942, "grad_norm": 0.21620097756385803, "learning_rate": 3.761361325344555e-05, "loss": 0.0422, "step": 76800 }, { "epoch": 0.5714285714285714, "grad_norm": 1.0422821044921875, "learning_rate": 3.759405847660467e-05, "loss": 0.0539, "step": 76900 }, { "epoch": 0.5721716514954487, "grad_norm": 2.037611484527588, "learning_rate": 3.757450369976378e-05, "loss": 0.0412, "step": 77000 }, { "epoch": 0.5729147315623259, "grad_norm": 0.2640922963619232, "learning_rate": 3.75549489229229e-05, "loss": 0.0454, "step": 77100 }, { "epoch": 0.573657811629203, "grad_norm": 0.5805174708366394, "learning_rate": 3.753539414608201e-05, "loss": 0.0472, "step": 77200 }, { "epoch": 0.5744008916960802, "grad_norm": 0.7103205323219299, "learning_rate": 3.751583936924112e-05, "loss": 0.0387, "step": 77300 }, { "epoch": 0.5751439717629575, "grad_norm": 0.10632061213254929, "learning_rate": 3.749628459240023e-05, "loss": 0.0421, "step": 77400 }, { "epoch": 0.5758870518298347, "grad_norm": 0.32918354868888855, "learning_rate": 3.747672981555935e-05, "loss": 0.049, "step": 77500 }, { "epoch": 0.5766301318967119, "grad_norm": 0.158243790268898, "learning_rate": 3.745717503871846e-05, "loss": 0.0546, "step": 77600 }, { "epoch": 0.577373211963589, "grad_norm": 0.4716877341270447, "learning_rate": 3.743762026187757e-05, "loss": 0.044, "step": 77700 }, { "epoch": 0.5781162920304663, "grad_norm": 0.3039693236351013, "learning_rate": 3.7418065485036684e-05, "loss": 0.051, "step": 77800 }, { "epoch": 0.5788593720973435, "grad_norm": 0.5305482745170593, "learning_rate": 3.73985107081958e-05, "loss": 0.0511, "step": 77900 }, { "epoch": 0.5796024521642207, "grad_norm": 0.5830110907554626, "learning_rate": 3.7378955931354914e-05, "loss": 0.0489, "step": 78000 }, { "epoch": 0.5803455322310979, "grad_norm": 0.39950165152549744, "learning_rate": 3.735940115451403e-05, "loss": 0.0434, "step": 78100 }, { "epoch": 0.5810886122979751, "grad_norm": 1.9517604112625122, "learning_rate": 3.7339846377673137e-05, "loss": 0.0481, "step": 78200 }, { "epoch": 0.5818316923648523, "grad_norm": 0.12026433646678925, "learning_rate": 3.7320291600832255e-05, "loss": 0.0439, "step": 78300 }, { "epoch": 0.5825747724317295, "grad_norm": 0.49367329478263855, "learning_rate": 3.7300736823991366e-05, "loss": 0.0395, "step": 78400 }, { "epoch": 0.5833178524986067, "grad_norm": 0.2561337947845459, "learning_rate": 3.7281182047150484e-05, "loss": 0.0419, "step": 78500 }, { "epoch": 0.584060932565484, "grad_norm": 0.14756043255329132, "learning_rate": 3.726162727030959e-05, "loss": 0.0448, "step": 78600 }, { "epoch": 0.5848040126323611, "grad_norm": 0.33207398653030396, "learning_rate": 3.724207249346871e-05, "loss": 0.0476, "step": 78700 }, { "epoch": 0.5855470926992383, "grad_norm": 0.40394625067710876, "learning_rate": 3.722251771662782e-05, "loss": 0.0513, "step": 78800 }, { "epoch": 0.5862901727661155, "grad_norm": 0.7332561016082764, "learning_rate": 3.7202962939786937e-05, "loss": 0.0454, "step": 78900 }, { "epoch": 0.5870332528329928, "grad_norm": 0.20013241469860077, "learning_rate": 3.718340816294605e-05, "loss": 0.0422, "step": 79000 }, { "epoch": 0.58777633289987, "grad_norm": 0.5450400710105896, "learning_rate": 3.716385338610516e-05, "loss": 0.043, "step": 79100 }, { "epoch": 0.5885194129667471, "grad_norm": 0.31620481610298157, "learning_rate": 3.714429860926427e-05, "loss": 0.0438, "step": 79200 }, { "epoch": 0.5892624930336243, "grad_norm": 0.49958211183547974, "learning_rate": 3.712474383242339e-05, "loss": 0.0446, "step": 79300 }, { "epoch": 0.5900055731005016, "grad_norm": 0.3462698459625244, "learning_rate": 3.71051890555825e-05, "loss": 0.0345, "step": 79400 }, { "epoch": 0.5907486531673788, "grad_norm": 0.9915865659713745, "learning_rate": 3.708563427874161e-05, "loss": 0.0444, "step": 79500 }, { "epoch": 0.591491733234256, "grad_norm": 0.8359519243240356, "learning_rate": 3.706607950190072e-05, "loss": 0.0538, "step": 79600 }, { "epoch": 0.5922348133011331, "grad_norm": 0.42004525661468506, "learning_rate": 3.704652472505984e-05, "loss": 0.0471, "step": 79700 }, { "epoch": 0.5929778933680104, "grad_norm": 0.6156326532363892, "learning_rate": 3.702696994821895e-05, "loss": 0.0421, "step": 79800 }, { "epoch": 0.5937209734348876, "grad_norm": 0.49050307273864746, "learning_rate": 3.700741517137807e-05, "loss": 0.0447, "step": 79900 }, { "epoch": 0.5944640535017648, "grad_norm": 0.6744931936264038, "learning_rate": 3.6987860394537175e-05, "loss": 0.0463, "step": 80000 }, { "epoch": 0.595207133568642, "grad_norm": 1.9162582159042358, "learning_rate": 3.6968305617696293e-05, "loss": 0.0524, "step": 80100 }, { "epoch": 0.5959502136355193, "grad_norm": 1.3160343170166016, "learning_rate": 3.6948750840855405e-05, "loss": 0.0462, "step": 80200 }, { "epoch": 0.5966932937023964, "grad_norm": 0.2921721637248993, "learning_rate": 3.692919606401452e-05, "loss": 0.0435, "step": 80300 }, { "epoch": 0.5974363737692736, "grad_norm": 1.7545665502548218, "learning_rate": 3.6909641287173634e-05, "loss": 0.0413, "step": 80400 }, { "epoch": 0.5981794538361509, "grad_norm": 5.170703887939453, "learning_rate": 3.6890086510332746e-05, "loss": 0.0496, "step": 80500 }, { "epoch": 0.5989225339030281, "grad_norm": 0.14962519705295563, "learning_rate": 3.687053173349186e-05, "loss": 0.04, "step": 80600 }, { "epoch": 0.5996656139699053, "grad_norm": 1.7956011295318604, "learning_rate": 3.6850976956650975e-05, "loss": 0.0443, "step": 80700 }, { "epoch": 0.6004086940367824, "grad_norm": 0.34586215019226074, "learning_rate": 3.6831422179810087e-05, "loss": 0.0424, "step": 80800 }, { "epoch": 0.6011517741036597, "grad_norm": 0.7476115226745605, "learning_rate": 3.68118674029692e-05, "loss": 0.041, "step": 80900 }, { "epoch": 0.6018948541705369, "grad_norm": 0.3212285339832306, "learning_rate": 3.679231262612831e-05, "loss": 0.0489, "step": 81000 }, { "epoch": 0.6026379342374141, "grad_norm": 0.4903147518634796, "learning_rate": 3.677275784928743e-05, "loss": 0.0419, "step": 81100 }, { "epoch": 0.6033810143042913, "grad_norm": 0.3634468913078308, "learning_rate": 3.675320307244654e-05, "loss": 0.0445, "step": 81200 }, { "epoch": 0.6041240943711685, "grad_norm": 0.3146348297595978, "learning_rate": 3.673364829560566e-05, "loss": 0.0533, "step": 81300 }, { "epoch": 0.6048671744380457, "grad_norm": 0.28772807121276855, "learning_rate": 3.671409351876476e-05, "loss": 0.0471, "step": 81400 }, { "epoch": 0.6056102545049229, "grad_norm": 0.5339851379394531, "learning_rate": 3.669453874192388e-05, "loss": 0.0457, "step": 81500 }, { "epoch": 0.6063533345718001, "grad_norm": 0.39948976039886475, "learning_rate": 3.667498396508299e-05, "loss": 0.0525, "step": 81600 }, { "epoch": 0.6070964146386774, "grad_norm": 0.4601937234401703, "learning_rate": 3.665542918824211e-05, "loss": 0.047, "step": 81700 }, { "epoch": 0.6078394947055545, "grad_norm": 0.3977317810058594, "learning_rate": 3.6635874411401214e-05, "loss": 0.046, "step": 81800 }, { "epoch": 0.6085825747724317, "grad_norm": 0.3781704604625702, "learning_rate": 3.661631963456033e-05, "loss": 0.0432, "step": 81900 }, { "epoch": 0.6093256548393089, "grad_norm": 0.3988845646381378, "learning_rate": 3.6596764857719443e-05, "loss": 0.0537, "step": 82000 }, { "epoch": 0.6100687349061862, "grad_norm": 0.8191292881965637, "learning_rate": 3.657721008087856e-05, "loss": 0.0469, "step": 82100 }, { "epoch": 0.6108118149730634, "grad_norm": 0.7962155342102051, "learning_rate": 3.655765530403767e-05, "loss": 0.044, "step": 82200 }, { "epoch": 0.6115548950399405, "grad_norm": 0.3765295147895813, "learning_rate": 3.6538100527196784e-05, "loss": 0.0491, "step": 82300 }, { "epoch": 0.6122979751068177, "grad_norm": 0.7137246131896973, "learning_rate": 3.6518545750355896e-05, "loss": 0.04, "step": 82400 }, { "epoch": 0.613041055173695, "grad_norm": 0.30367806553840637, "learning_rate": 3.6498990973515014e-05, "loss": 0.0422, "step": 82500 }, { "epoch": 0.6137841352405722, "grad_norm": 1.504367709159851, "learning_rate": 3.6479436196674125e-05, "loss": 0.0458, "step": 82600 }, { "epoch": 0.6145272153074494, "grad_norm": 0.3797600269317627, "learning_rate": 3.645988141983324e-05, "loss": 0.0415, "step": 82700 }, { "epoch": 0.6152702953743265, "grad_norm": 1.0168997049331665, "learning_rate": 3.644032664299235e-05, "loss": 0.0451, "step": 82800 }, { "epoch": 0.6160133754412038, "grad_norm": 0.15634655952453613, "learning_rate": 3.6420771866151466e-05, "loss": 0.0424, "step": 82900 }, { "epoch": 0.616756455508081, "grad_norm": 0.7947319746017456, "learning_rate": 3.640121708931058e-05, "loss": 0.0407, "step": 83000 }, { "epoch": 0.6174995355749582, "grad_norm": 11.290752410888672, "learning_rate": 3.6381662312469696e-05, "loss": 0.0504, "step": 83100 }, { "epoch": 0.6182426156418354, "grad_norm": 0.5697255730628967, "learning_rate": 3.63621075356288e-05, "loss": 0.0444, "step": 83200 }, { "epoch": 0.6189856957087126, "grad_norm": 0.3050697147846222, "learning_rate": 3.634255275878792e-05, "loss": 0.0462, "step": 83300 }, { "epoch": 0.6197287757755898, "grad_norm": 0.33098092675209045, "learning_rate": 3.632299798194703e-05, "loss": 0.0426, "step": 83400 }, { "epoch": 0.620471855842467, "grad_norm": 0.4476211667060852, "learning_rate": 3.630344320510615e-05, "loss": 0.0514, "step": 83500 }, { "epoch": 0.6212149359093442, "grad_norm": 0.8756594657897949, "learning_rate": 3.628388842826525e-05, "loss": 0.0436, "step": 83600 }, { "epoch": 0.6219580159762215, "grad_norm": 6.3221516609191895, "learning_rate": 3.626433365142437e-05, "loss": 0.0382, "step": 83700 }, { "epoch": 0.6227010960430986, "grad_norm": 0.2640119194984436, "learning_rate": 3.624477887458348e-05, "loss": 0.0464, "step": 83800 }, { "epoch": 0.6234441761099758, "grad_norm": 0.4289034605026245, "learning_rate": 3.62252240977426e-05, "loss": 0.0403, "step": 83900 }, { "epoch": 0.6241872561768531, "grad_norm": 0.30275195837020874, "learning_rate": 3.620566932090171e-05, "loss": 0.0539, "step": 84000 }, { "epoch": 0.6249303362437303, "grad_norm": 1.0072731971740723, "learning_rate": 3.618611454406082e-05, "loss": 0.0456, "step": 84100 }, { "epoch": 0.6256734163106075, "grad_norm": 3.300447702407837, "learning_rate": 3.6166559767219934e-05, "loss": 0.0452, "step": 84200 }, { "epoch": 0.6264164963774846, "grad_norm": 0.6900713443756104, "learning_rate": 3.614700499037905e-05, "loss": 0.0417, "step": 84300 }, { "epoch": 0.6271595764443619, "grad_norm": 0.239619180560112, "learning_rate": 3.6127450213538164e-05, "loss": 0.0404, "step": 84400 }, { "epoch": 0.6279026565112391, "grad_norm": 0.24124091863632202, "learning_rate": 3.6107895436697275e-05, "loss": 0.0422, "step": 84500 }, { "epoch": 0.6286457365781163, "grad_norm": 2.03228759765625, "learning_rate": 3.608834065985639e-05, "loss": 0.0468, "step": 84600 }, { "epoch": 0.6293888166449935, "grad_norm": 2.745407819747925, "learning_rate": 3.6068785883015505e-05, "loss": 0.045, "step": 84700 }, { "epoch": 0.6301318967118708, "grad_norm": 0.43008753657341003, "learning_rate": 3.6049231106174616e-05, "loss": 0.0423, "step": 84800 }, { "epoch": 0.6308749767787479, "grad_norm": 0.5743459463119507, "learning_rate": 3.6029676329333734e-05, "loss": 0.0438, "step": 84900 }, { "epoch": 0.6316180568456251, "grad_norm": 0.40773773193359375, "learning_rate": 3.6010121552492846e-05, "loss": 0.0508, "step": 85000 }, { "epoch": 0.6323611369125023, "grad_norm": 0.5808705687522888, "learning_rate": 3.599056677565196e-05, "loss": 0.04, "step": 85100 }, { "epoch": 0.6331042169793796, "grad_norm": 0.45666179060935974, "learning_rate": 3.597101199881107e-05, "loss": 0.0504, "step": 85200 }, { "epoch": 0.6338472970462568, "grad_norm": 0.6470193862915039, "learning_rate": 3.595145722197019e-05, "loss": 0.0424, "step": 85300 }, { "epoch": 0.6345903771131339, "grad_norm": 0.5555102229118347, "learning_rate": 3.59319024451293e-05, "loss": 0.0405, "step": 85400 }, { "epoch": 0.6353334571800111, "grad_norm": 0.30563172698020935, "learning_rate": 3.591234766828841e-05, "loss": 0.0421, "step": 85500 }, { "epoch": 0.6360765372468884, "grad_norm": 0.2433183491230011, "learning_rate": 3.589279289144753e-05, "loss": 0.0564, "step": 85600 }, { "epoch": 0.6368196173137656, "grad_norm": 0.6994438767433167, "learning_rate": 3.587323811460664e-05, "loss": 0.0394, "step": 85700 }, { "epoch": 0.6375626973806428, "grad_norm": 0.33403280377388, "learning_rate": 3.585368333776575e-05, "loss": 0.0511, "step": 85800 }, { "epoch": 0.6383057774475199, "grad_norm": 0.3223717510700226, "learning_rate": 3.583412856092486e-05, "loss": 0.0478, "step": 85900 }, { "epoch": 0.6390488575143972, "grad_norm": 0.2600286602973938, "learning_rate": 3.581457378408398e-05, "loss": 0.0425, "step": 86000 }, { "epoch": 0.6397919375812744, "grad_norm": 0.5646936297416687, "learning_rate": 3.579501900724309e-05, "loss": 0.0413, "step": 86100 }, { "epoch": 0.6405350176481516, "grad_norm": 1.2295092344284058, "learning_rate": 3.57754642304022e-05, "loss": 0.0471, "step": 86200 }, { "epoch": 0.6412780977150287, "grad_norm": 0.35272443294525146, "learning_rate": 3.575590945356132e-05, "loss": 0.0424, "step": 86300 }, { "epoch": 0.642021177781906, "grad_norm": 0.2655154764652252, "learning_rate": 3.573635467672043e-05, "loss": 0.041, "step": 86400 }, { "epoch": 0.6427642578487832, "grad_norm": 0.6777417659759521, "learning_rate": 3.5716799899879543e-05, "loss": 0.051, "step": 86500 }, { "epoch": 0.6435073379156604, "grad_norm": 0.6530482172966003, "learning_rate": 3.569724512303866e-05, "loss": 0.051, "step": 86600 }, { "epoch": 0.6442504179825376, "grad_norm": 0.5525156855583191, "learning_rate": 3.567769034619777e-05, "loss": 0.052, "step": 86700 }, { "epoch": 0.6449934980494149, "grad_norm": 0.5975711345672607, "learning_rate": 3.5658135569356884e-05, "loss": 0.0474, "step": 86800 }, { "epoch": 0.645736578116292, "grad_norm": 0.20195943117141724, "learning_rate": 3.5638580792515996e-05, "loss": 0.0389, "step": 86900 }, { "epoch": 0.6464796581831692, "grad_norm": 1.16473388671875, "learning_rate": 3.5619026015675114e-05, "loss": 0.0443, "step": 87000 }, { "epoch": 0.6472227382500464, "grad_norm": 0.4573332667350769, "learning_rate": 3.5599471238834225e-05, "loss": 0.0483, "step": 87100 }, { "epoch": 0.6479658183169237, "grad_norm": 0.6161136031150818, "learning_rate": 3.5579916461993343e-05, "loss": 0.0315, "step": 87200 }, { "epoch": 0.6487088983838009, "grad_norm": 0.5839945077896118, "learning_rate": 3.556036168515245e-05, "loss": 0.0375, "step": 87300 }, { "epoch": 0.649451978450678, "grad_norm": 0.4524543583393097, "learning_rate": 3.5540806908311566e-05, "loss": 0.0422, "step": 87400 }, { "epoch": 0.6501950585175552, "grad_norm": 0.4384470582008362, "learning_rate": 3.552125213147068e-05, "loss": 0.0474, "step": 87500 }, { "epoch": 0.6509381385844325, "grad_norm": 0.11209399998188019, "learning_rate": 3.5501697354629796e-05, "loss": 0.0455, "step": 87600 }, { "epoch": 0.6516812186513097, "grad_norm": 0.6053728461265564, "learning_rate": 3.54821425777889e-05, "loss": 0.0412, "step": 87700 }, { "epoch": 0.6524242987181869, "grad_norm": 0.11012202501296997, "learning_rate": 3.546258780094802e-05, "loss": 0.0449, "step": 87800 }, { "epoch": 0.6531673787850641, "grad_norm": 0.4069393277168274, "learning_rate": 3.544303302410713e-05, "loss": 0.0491, "step": 87900 }, { "epoch": 0.6539104588519413, "grad_norm": 0.22591173648834229, "learning_rate": 3.542347824726625e-05, "loss": 0.0514, "step": 88000 }, { "epoch": 0.6546535389188185, "grad_norm": 0.44217488169670105, "learning_rate": 3.540392347042536e-05, "loss": 0.0526, "step": 88100 }, { "epoch": 0.6553966189856957, "grad_norm": 0.686578094959259, "learning_rate": 3.538436869358447e-05, "loss": 0.0432, "step": 88200 }, { "epoch": 0.656139699052573, "grad_norm": 0.15330742299556732, "learning_rate": 3.536481391674358e-05, "loss": 0.0449, "step": 88300 }, { "epoch": 0.6568827791194501, "grad_norm": 0.2532951235771179, "learning_rate": 3.53452591399027e-05, "loss": 0.0455, "step": 88400 }, { "epoch": 0.6576258591863273, "grad_norm": 0.22048042714595795, "learning_rate": 3.532570436306181e-05, "loss": 0.056, "step": 88500 }, { "epoch": 0.6583689392532045, "grad_norm": 0.8291521072387695, "learning_rate": 3.530614958622092e-05, "loss": 0.036, "step": 88600 }, { "epoch": 0.6591120193200818, "grad_norm": 0.36509400606155396, "learning_rate": 3.5286594809380034e-05, "loss": 0.0429, "step": 88700 }, { "epoch": 0.659855099386959, "grad_norm": 0.5093159079551697, "learning_rate": 3.526704003253915e-05, "loss": 0.0414, "step": 88800 }, { "epoch": 0.6605981794538361, "grad_norm": 0.879650354385376, "learning_rate": 3.5247485255698264e-05, "loss": 0.043, "step": 88900 }, { "epoch": 0.6613412595207133, "grad_norm": 0.2649843990802765, "learning_rate": 3.522793047885738e-05, "loss": 0.0408, "step": 89000 }, { "epoch": 0.6620843395875906, "grad_norm": 0.5701449513435364, "learning_rate": 3.520837570201649e-05, "loss": 0.0444, "step": 89100 }, { "epoch": 0.6628274196544678, "grad_norm": 0.34187614917755127, "learning_rate": 3.5188820925175605e-05, "loss": 0.0385, "step": 89200 }, { "epoch": 0.663570499721345, "grad_norm": 1.5775477886199951, "learning_rate": 3.5169266148334716e-05, "loss": 0.0527, "step": 89300 }, { "epoch": 0.6643135797882221, "grad_norm": 0.30522620677948, "learning_rate": 3.5149711371493834e-05, "loss": 0.043, "step": 89400 }, { "epoch": 0.6650566598550994, "grad_norm": 0.1581784188747406, "learning_rate": 3.513015659465294e-05, "loss": 0.0412, "step": 89500 }, { "epoch": 0.6657997399219766, "grad_norm": 0.1614101231098175, "learning_rate": 3.511060181781206e-05, "loss": 0.0431, "step": 89600 }, { "epoch": 0.6665428199888538, "grad_norm": 0.3544206917285919, "learning_rate": 3.509104704097117e-05, "loss": 0.0444, "step": 89700 }, { "epoch": 0.667285900055731, "grad_norm": 2.772639274597168, "learning_rate": 3.507149226413029e-05, "loss": 0.0476, "step": 89800 }, { "epoch": 0.6680289801226083, "grad_norm": 1.4811934232711792, "learning_rate": 3.50519374872894e-05, "loss": 0.0436, "step": 89900 }, { "epoch": 0.6687720601894854, "grad_norm": 0.3753783106803894, "learning_rate": 3.503238271044851e-05, "loss": 0.0474, "step": 90000 }, { "epoch": 0.6695151402563626, "grad_norm": 0.580274760723114, "learning_rate": 3.501282793360762e-05, "loss": 0.0489, "step": 90100 }, { "epoch": 0.6702582203232398, "grad_norm": 0.4221446216106415, "learning_rate": 3.499327315676674e-05, "loss": 0.0464, "step": 90200 }, { "epoch": 0.6710013003901171, "grad_norm": 0.2914683222770691, "learning_rate": 3.497371837992585e-05, "loss": 0.0506, "step": 90300 }, { "epoch": 0.6717443804569943, "grad_norm": 0.2871812582015991, "learning_rate": 3.495416360308496e-05, "loss": 0.0374, "step": 90400 }, { "epoch": 0.6724874605238714, "grad_norm": 0.21389980614185333, "learning_rate": 3.493460882624407e-05, "loss": 0.0473, "step": 90500 }, { "epoch": 0.6732305405907486, "grad_norm": 0.44926896691322327, "learning_rate": 3.491505404940319e-05, "loss": 0.0508, "step": 90600 }, { "epoch": 0.6739736206576259, "grad_norm": 0.23165352642536163, "learning_rate": 3.48954992725623e-05, "loss": 0.0494, "step": 90700 }, { "epoch": 0.6747167007245031, "grad_norm": 0.4013068974018097, "learning_rate": 3.487594449572142e-05, "loss": 0.0441, "step": 90800 }, { "epoch": 0.6754597807913802, "grad_norm": 0.3344004452228546, "learning_rate": 3.4856389718880525e-05, "loss": 0.0416, "step": 90900 }, { "epoch": 0.6762028608582574, "grad_norm": 1.4763915538787842, "learning_rate": 3.4836834942039644e-05, "loss": 0.0472, "step": 91000 }, { "epoch": 0.6769459409251347, "grad_norm": 0.08701636642217636, "learning_rate": 3.4817280165198755e-05, "loss": 0.0387, "step": 91100 }, { "epoch": 0.6776890209920119, "grad_norm": 0.5263895988464355, "learning_rate": 3.479772538835787e-05, "loss": 0.04, "step": 91200 }, { "epoch": 0.6784321010588891, "grad_norm": 0.3336668312549591, "learning_rate": 3.477817061151698e-05, "loss": 0.0456, "step": 91300 }, { "epoch": 0.6791751811257662, "grad_norm": 3.3314902782440186, "learning_rate": 3.4758615834676096e-05, "loss": 0.0402, "step": 91400 }, { "epoch": 0.6799182611926435, "grad_norm": 0.40646541118621826, "learning_rate": 3.473906105783521e-05, "loss": 0.0453, "step": 91500 }, { "epoch": 0.6806613412595207, "grad_norm": 0.8153696060180664, "learning_rate": 3.4719506280994325e-05, "loss": 0.0481, "step": 91600 }, { "epoch": 0.6814044213263979, "grad_norm": 0.4665226936340332, "learning_rate": 3.469995150415344e-05, "loss": 0.0483, "step": 91700 }, { "epoch": 0.6821475013932752, "grad_norm": 0.10660091787576675, "learning_rate": 3.468039672731255e-05, "loss": 0.0441, "step": 91800 }, { "epoch": 0.6828905814601524, "grad_norm": 4.398245811462402, "learning_rate": 3.466084195047166e-05, "loss": 0.0591, "step": 91900 }, { "epoch": 0.6836336615270295, "grad_norm": 0.28610602021217346, "learning_rate": 3.464128717363078e-05, "loss": 0.0469, "step": 92000 }, { "epoch": 0.6843767415939067, "grad_norm": 0.279502272605896, "learning_rate": 3.462173239678989e-05, "loss": 0.0575, "step": 92100 }, { "epoch": 0.685119821660784, "grad_norm": 2.008476734161377, "learning_rate": 3.460217761994901e-05, "loss": 0.046, "step": 92200 }, { "epoch": 0.6858629017276612, "grad_norm": 0.7492668628692627, "learning_rate": 3.458262284310811e-05, "loss": 0.0476, "step": 92300 }, { "epoch": 0.6866059817945384, "grad_norm": 0.3135048449039459, "learning_rate": 3.456306806626723e-05, "loss": 0.0506, "step": 92400 }, { "epoch": 0.6873490618614155, "grad_norm": 2.0013582706451416, "learning_rate": 3.454351328942634e-05, "loss": 0.0453, "step": 92500 }, { "epoch": 0.6880921419282928, "grad_norm": 3.9885740280151367, "learning_rate": 3.452395851258546e-05, "loss": 0.0433, "step": 92600 }, { "epoch": 0.68883522199517, "grad_norm": 0.223055899143219, "learning_rate": 3.4504403735744564e-05, "loss": 0.0496, "step": 92700 }, { "epoch": 0.6895783020620472, "grad_norm": 0.35352155566215515, "learning_rate": 3.448484895890368e-05, "loss": 0.051, "step": 92800 }, { "epoch": 0.6903213821289244, "grad_norm": 0.2943922281265259, "learning_rate": 3.4465294182062794e-05, "loss": 0.045, "step": 92900 }, { "epoch": 0.6910644621958016, "grad_norm": 0.32264962792396545, "learning_rate": 3.444573940522191e-05, "loss": 0.0398, "step": 93000 }, { "epoch": 0.6918075422626788, "grad_norm": 0.48716050386428833, "learning_rate": 3.442618462838102e-05, "loss": 0.0446, "step": 93100 }, { "epoch": 0.692550622329556, "grad_norm": 0.6209288835525513, "learning_rate": 3.4406629851540134e-05, "loss": 0.0427, "step": 93200 }, { "epoch": 0.6932937023964332, "grad_norm": 0.14264117181301117, "learning_rate": 3.4387075074699246e-05, "loss": 0.0395, "step": 93300 }, { "epoch": 0.6940367824633105, "grad_norm": 0.44311922788619995, "learning_rate": 3.4367520297858364e-05, "loss": 0.0403, "step": 93400 }, { "epoch": 0.6947798625301876, "grad_norm": 0.42145398259162903, "learning_rate": 3.4347965521017475e-05, "loss": 0.0565, "step": 93500 }, { "epoch": 0.6955229425970648, "grad_norm": 0.37768375873565674, "learning_rate": 3.432841074417659e-05, "loss": 0.0403, "step": 93600 }, { "epoch": 0.696266022663942, "grad_norm": 0.40025150775909424, "learning_rate": 3.43088559673357e-05, "loss": 0.0399, "step": 93700 }, { "epoch": 0.6970091027308193, "grad_norm": 0.4069999158382416, "learning_rate": 3.4289301190494816e-05, "loss": 0.0456, "step": 93800 }, { "epoch": 0.6977521827976965, "grad_norm": 0.4744893014431, "learning_rate": 3.426974641365393e-05, "loss": 0.0463, "step": 93900 }, { "epoch": 0.6984952628645736, "grad_norm": 3.510014295578003, "learning_rate": 3.4250191636813046e-05, "loss": 0.043, "step": 94000 }, { "epoch": 0.6992383429314508, "grad_norm": 0.4999483823776245, "learning_rate": 3.423063685997216e-05, "loss": 0.0498, "step": 94100 }, { "epoch": 0.6999814229983281, "grad_norm": 0.6015092134475708, "learning_rate": 3.421108208313127e-05, "loss": 0.0604, "step": 94200 }, { "epoch": 0.7007245030652053, "grad_norm": 0.6383658051490784, "learning_rate": 3.419152730629038e-05, "loss": 0.0482, "step": 94300 }, { "epoch": 0.7014675831320825, "grad_norm": 3.2382450103759766, "learning_rate": 3.41719725294495e-05, "loss": 0.0412, "step": 94400 }, { "epoch": 0.7022106631989596, "grad_norm": 1.6235198974609375, "learning_rate": 3.415241775260861e-05, "loss": 0.046, "step": 94500 }, { "epoch": 0.7029537432658369, "grad_norm": 0.6692045331001282, "learning_rate": 3.413286297576772e-05, "loss": 0.0443, "step": 94600 }, { "epoch": 0.7036968233327141, "grad_norm": 0.14579874277114868, "learning_rate": 3.411330819892683e-05, "loss": 0.0446, "step": 94700 }, { "epoch": 0.7044399033995913, "grad_norm": 0.3242962658405304, "learning_rate": 3.409375342208595e-05, "loss": 0.0365, "step": 94800 }, { "epoch": 0.7051829834664685, "grad_norm": 0.5690775513648987, "learning_rate": 3.407419864524506e-05, "loss": 0.0432, "step": 94900 }, { "epoch": 0.7059260635333457, "grad_norm": 0.7420532703399658, "learning_rate": 3.405464386840417e-05, "loss": 0.0499, "step": 95000 }, { "epoch": 0.7066691436002229, "grad_norm": 0.2678348422050476, "learning_rate": 3.403508909156329e-05, "loss": 0.0408, "step": 95100 }, { "epoch": 0.7074122236671001, "grad_norm": 0.3802770972251892, "learning_rate": 3.40155343147224e-05, "loss": 0.0489, "step": 95200 }, { "epoch": 0.7081553037339773, "grad_norm": 1.927528738975525, "learning_rate": 3.3995979537881514e-05, "loss": 0.0475, "step": 95300 }, { "epoch": 0.7088983838008546, "grad_norm": 0.2352389693260193, "learning_rate": 3.3976424761040625e-05, "loss": 0.0412, "step": 95400 }, { "epoch": 0.7096414638677317, "grad_norm": 0.14121490716934204, "learning_rate": 3.3956869984199744e-05, "loss": 0.0526, "step": 95500 }, { "epoch": 0.7103845439346089, "grad_norm": 0.47256824374198914, "learning_rate": 3.3937315207358855e-05, "loss": 0.0511, "step": 95600 }, { "epoch": 0.7111276240014862, "grad_norm": 0.3019408881664276, "learning_rate": 3.391776043051797e-05, "loss": 0.0411, "step": 95700 }, { "epoch": 0.7118707040683634, "grad_norm": 0.3463144898414612, "learning_rate": 3.3898205653677084e-05, "loss": 0.0428, "step": 95800 }, { "epoch": 0.7126137841352406, "grad_norm": 0.38097718358039856, "learning_rate": 3.3878650876836196e-05, "loss": 0.0455, "step": 95900 }, { "epoch": 0.7133568642021177, "grad_norm": 0.42194485664367676, "learning_rate": 3.385909609999531e-05, "loss": 0.0425, "step": 96000 }, { "epoch": 0.714099944268995, "grad_norm": 2.412665605545044, "learning_rate": 3.3839541323154425e-05, "loss": 0.041, "step": 96100 }, { "epoch": 0.7148430243358722, "grad_norm": 0.7553698420524597, "learning_rate": 3.381998654631354e-05, "loss": 0.0441, "step": 96200 }, { "epoch": 0.7155861044027494, "grad_norm": 0.44968587160110474, "learning_rate": 3.380043176947265e-05, "loss": 0.0466, "step": 96300 }, { "epoch": 0.7163291844696266, "grad_norm": 0.3241223990917206, "learning_rate": 3.378087699263176e-05, "loss": 0.0364, "step": 96400 }, { "epoch": 0.7170722645365039, "grad_norm": 5.413496017456055, "learning_rate": 3.376132221579088e-05, "loss": 0.0391, "step": 96500 }, { "epoch": 0.717815344603381, "grad_norm": 0.7810749411582947, "learning_rate": 3.374176743894999e-05, "loss": 0.0528, "step": 96600 }, { "epoch": 0.7185584246702582, "grad_norm": 0.23911458253860474, "learning_rate": 3.372221266210911e-05, "loss": 0.0402, "step": 96700 }, { "epoch": 0.7193015047371354, "grad_norm": 0.5730475783348083, "learning_rate": 3.370265788526821e-05, "loss": 0.0431, "step": 96800 }, { "epoch": 0.7200445848040127, "grad_norm": 0.989937961101532, "learning_rate": 3.368310310842733e-05, "loss": 0.0406, "step": 96900 }, { "epoch": 0.7207876648708899, "grad_norm": 0.7794008851051331, "learning_rate": 3.366354833158644e-05, "loss": 0.0463, "step": 97000 }, { "epoch": 0.721530744937767, "grad_norm": 0.676996648311615, "learning_rate": 3.364399355474556e-05, "loss": 0.0469, "step": 97100 }, { "epoch": 0.7222738250046442, "grad_norm": 0.5104778409004211, "learning_rate": 3.3624438777904664e-05, "loss": 0.051, "step": 97200 }, { "epoch": 0.7230169050715215, "grad_norm": 0.4581410586833954, "learning_rate": 3.360488400106378e-05, "loss": 0.0472, "step": 97300 }, { "epoch": 0.7237599851383987, "grad_norm": 2.252575159072876, "learning_rate": 3.3585329224222894e-05, "loss": 0.052, "step": 97400 }, { "epoch": 0.7245030652052759, "grad_norm": 1.6395001411437988, "learning_rate": 3.356577444738201e-05, "loss": 0.0433, "step": 97500 }, { "epoch": 0.725246145272153, "grad_norm": 2.734894037246704, "learning_rate": 3.354621967054112e-05, "loss": 0.0482, "step": 97600 }, { "epoch": 0.7259892253390303, "grad_norm": 0.685563325881958, "learning_rate": 3.3526664893700234e-05, "loss": 0.0484, "step": 97700 }, { "epoch": 0.7267323054059075, "grad_norm": 0.24773761630058289, "learning_rate": 3.3507110116859346e-05, "loss": 0.041, "step": 97800 }, { "epoch": 0.7274753854727847, "grad_norm": 0.23973163962364197, "learning_rate": 3.3487555340018464e-05, "loss": 0.051, "step": 97900 }, { "epoch": 0.7282184655396619, "grad_norm": 0.4687383472919464, "learning_rate": 3.3468000563177575e-05, "loss": 0.0443, "step": 98000 }, { "epoch": 0.7289615456065391, "grad_norm": 1.3939294815063477, "learning_rate": 3.3448445786336694e-05, "loss": 0.0474, "step": 98100 }, { "epoch": 0.7297046256734163, "grad_norm": 0.26695799827575684, "learning_rate": 3.34288910094958e-05, "loss": 0.0417, "step": 98200 }, { "epoch": 0.7304477057402935, "grad_norm": 0.25455090403556824, "learning_rate": 3.3409336232654916e-05, "loss": 0.0476, "step": 98300 }, { "epoch": 0.7311907858071707, "grad_norm": 0.44291216135025024, "learning_rate": 3.338978145581403e-05, "loss": 0.0414, "step": 98400 }, { "epoch": 0.731933865874048, "grad_norm": 2.7482566833496094, "learning_rate": 3.3370226678973146e-05, "loss": 0.054, "step": 98500 }, { "epoch": 0.7326769459409251, "grad_norm": 0.4472244083881378, "learning_rate": 3.335067190213225e-05, "loss": 0.0382, "step": 98600 }, { "epoch": 0.7334200260078023, "grad_norm": 0.2790171802043915, "learning_rate": 3.333111712529137e-05, "loss": 0.0423, "step": 98700 }, { "epoch": 0.7341631060746795, "grad_norm": 0.8662721514701843, "learning_rate": 3.331156234845048e-05, "loss": 0.0436, "step": 98800 }, { "epoch": 0.7349061861415568, "grad_norm": 2.2780773639678955, "learning_rate": 3.32920075716096e-05, "loss": 0.0529, "step": 98900 }, { "epoch": 0.735649266208434, "grad_norm": 0.17346450686454773, "learning_rate": 3.327245279476871e-05, "loss": 0.054, "step": 99000 }, { "epoch": 0.7363923462753111, "grad_norm": 0.32980892062187195, "learning_rate": 3.325289801792782e-05, "loss": 0.0409, "step": 99100 }, { "epoch": 0.7371354263421883, "grad_norm": 0.6823229789733887, "learning_rate": 3.323334324108693e-05, "loss": 0.0532, "step": 99200 }, { "epoch": 0.7378785064090656, "grad_norm": 1.2808688879013062, "learning_rate": 3.321378846424605e-05, "loss": 0.042, "step": 99300 }, { "epoch": 0.7386215864759428, "grad_norm": 0.2812482416629791, "learning_rate": 3.319423368740516e-05, "loss": 0.0448, "step": 99400 }, { "epoch": 0.73936466654282, "grad_norm": 0.5779196619987488, "learning_rate": 3.317467891056427e-05, "loss": 0.0507, "step": 99500 }, { "epoch": 0.7401077466096972, "grad_norm": 0.24663344025611877, "learning_rate": 3.3155124133723385e-05, "loss": 0.0413, "step": 99600 }, { "epoch": 0.7408508266765744, "grad_norm": 0.9244890809059143, "learning_rate": 3.31355693568825e-05, "loss": 0.042, "step": 99700 }, { "epoch": 0.7415939067434516, "grad_norm": 2.709878444671631, "learning_rate": 3.3116014580041614e-05, "loss": 0.0421, "step": 99800 }, { "epoch": 0.7423369868103288, "grad_norm": 0.2552315592765808, "learning_rate": 3.309645980320073e-05, "loss": 0.0429, "step": 99900 }, { "epoch": 0.7430800668772061, "grad_norm": 0.3115057051181793, "learning_rate": 3.307690502635984e-05, "loss": 0.0378, "step": 100000 }, { "epoch": 0.7438231469440832, "grad_norm": 0.5570692420005798, "learning_rate": 3.3057350249518955e-05, "loss": 0.0385, "step": 100100 }, { "epoch": 0.7445662270109604, "grad_norm": 0.5305907130241394, "learning_rate": 3.3037795472678066e-05, "loss": 0.0516, "step": 100200 }, { "epoch": 0.7453093070778376, "grad_norm": 0.3750559687614441, "learning_rate": 3.3018240695837185e-05, "loss": 0.0437, "step": 100300 }, { "epoch": 0.7460523871447149, "grad_norm": 0.21851783990859985, "learning_rate": 3.299868591899629e-05, "loss": 0.0375, "step": 100400 }, { "epoch": 0.7467954672115921, "grad_norm": 0.9948326349258423, "learning_rate": 3.297913114215541e-05, "loss": 0.0452, "step": 100500 }, { "epoch": 0.7475385472784692, "grad_norm": 0.3856034576892853, "learning_rate": 3.295957636531452e-05, "loss": 0.047, "step": 100600 }, { "epoch": 0.7482816273453464, "grad_norm": 0.6751347780227661, "learning_rate": 3.294002158847364e-05, "loss": 0.0411, "step": 100700 }, { "epoch": 0.7490247074122237, "grad_norm": 0.2636367678642273, "learning_rate": 3.292046681163275e-05, "loss": 0.042, "step": 100800 }, { "epoch": 0.7497677874791009, "grad_norm": 0.1783607304096222, "learning_rate": 3.290091203479186e-05, "loss": 0.0473, "step": 100900 }, { "epoch": 0.7505108675459781, "grad_norm": 0.652965784072876, "learning_rate": 3.288135725795097e-05, "loss": 0.051, "step": 101000 }, { "epoch": 0.7512539476128552, "grad_norm": 0.481058806180954, "learning_rate": 3.286180248111009e-05, "loss": 0.0428, "step": 101100 }, { "epoch": 0.7519970276797325, "grad_norm": 0.6089324355125427, "learning_rate": 3.28422477042692e-05, "loss": 0.0373, "step": 101200 }, { "epoch": 0.7527401077466097, "grad_norm": 0.5342264175415039, "learning_rate": 3.282269292742831e-05, "loss": 0.0406, "step": 101300 }, { "epoch": 0.7534831878134869, "grad_norm": 0.644764244556427, "learning_rate": 3.280313815058742e-05, "loss": 0.0443, "step": 101400 }, { "epoch": 0.7542262678803641, "grad_norm": 0.26991894841194153, "learning_rate": 3.278358337374654e-05, "loss": 0.0421, "step": 101500 }, { "epoch": 0.7549693479472414, "grad_norm": 0.3762785792350769, "learning_rate": 3.276402859690565e-05, "loss": 0.0463, "step": 101600 }, { "epoch": 0.7557124280141185, "grad_norm": 2.5329983234405518, "learning_rate": 3.274447382006477e-05, "loss": 0.0452, "step": 101700 }, { "epoch": 0.7564555080809957, "grad_norm": 0.5806246399879456, "learning_rate": 3.2724919043223875e-05, "loss": 0.0371, "step": 101800 }, { "epoch": 0.7571985881478729, "grad_norm": 0.2789587080478668, "learning_rate": 3.2705364266382994e-05, "loss": 0.0413, "step": 101900 }, { "epoch": 0.7579416682147502, "grad_norm": 0.3755458891391754, "learning_rate": 3.2685809489542105e-05, "loss": 0.0416, "step": 102000 }, { "epoch": 0.7586847482816274, "grad_norm": 0.3323933780193329, "learning_rate": 3.266625471270122e-05, "loss": 0.0427, "step": 102100 }, { "epoch": 0.7594278283485045, "grad_norm": 2.0292532444000244, "learning_rate": 3.264669993586033e-05, "loss": 0.0439, "step": 102200 }, { "epoch": 0.7601709084153817, "grad_norm": 0.25676774978637695, "learning_rate": 3.2627145159019446e-05, "loss": 0.0539, "step": 102300 }, { "epoch": 0.760913988482259, "grad_norm": 0.8279637694358826, "learning_rate": 3.260759038217856e-05, "loss": 0.04, "step": 102400 }, { "epoch": 0.7616570685491362, "grad_norm": 2.194382667541504, "learning_rate": 3.2588035605337675e-05, "loss": 0.04, "step": 102500 }, { "epoch": 0.7624001486160134, "grad_norm": 2.6649348735809326, "learning_rate": 3.256848082849679e-05, "loss": 0.0451, "step": 102600 }, { "epoch": 0.7631432286828905, "grad_norm": 0.48039016127586365, "learning_rate": 3.25489260516559e-05, "loss": 0.0509, "step": 102700 }, { "epoch": 0.7638863087497678, "grad_norm": 0.581303060054779, "learning_rate": 3.252937127481501e-05, "loss": 0.0419, "step": 102800 }, { "epoch": 0.764629388816645, "grad_norm": 0.1978641152381897, "learning_rate": 3.250981649797413e-05, "loss": 0.0387, "step": 102900 }, { "epoch": 0.7653724688835222, "grad_norm": 0.6493159532546997, "learning_rate": 3.249026172113324e-05, "loss": 0.0495, "step": 103000 }, { "epoch": 0.7661155489503995, "grad_norm": 2.189177989959717, "learning_rate": 3.247070694429235e-05, "loss": 0.0542, "step": 103100 }, { "epoch": 0.7668586290172766, "grad_norm": 0.45917460322380066, "learning_rate": 3.245115216745146e-05, "loss": 0.0445, "step": 103200 }, { "epoch": 0.7676017090841538, "grad_norm": 1.4128780364990234, "learning_rate": 3.243159739061058e-05, "loss": 0.0455, "step": 103300 }, { "epoch": 0.768344789151031, "grad_norm": 0.3462684750556946, "learning_rate": 3.241204261376969e-05, "loss": 0.038, "step": 103400 }, { "epoch": 0.7690878692179083, "grad_norm": 0.2707890570163727, "learning_rate": 3.239248783692881e-05, "loss": 0.0444, "step": 103500 }, { "epoch": 0.7698309492847855, "grad_norm": 0.22586743533611298, "learning_rate": 3.237293306008792e-05, "loss": 0.0385, "step": 103600 }, { "epoch": 0.7705740293516626, "grad_norm": 0.7855823040008545, "learning_rate": 3.235337828324703e-05, "loss": 0.0433, "step": 103700 }, { "epoch": 0.7713171094185398, "grad_norm": 0.703031599521637, "learning_rate": 3.2333823506406144e-05, "loss": 0.0387, "step": 103800 }, { "epoch": 0.7720601894854171, "grad_norm": 1.3763829469680786, "learning_rate": 3.231426872956526e-05, "loss": 0.0545, "step": 103900 }, { "epoch": 0.7728032695522943, "grad_norm": 0.47925326228141785, "learning_rate": 3.229471395272437e-05, "loss": 0.0479, "step": 104000 }, { "epoch": 0.7735463496191715, "grad_norm": 0.2822260856628418, "learning_rate": 3.2275159175883485e-05, "loss": 0.041, "step": 104100 }, { "epoch": 0.7742894296860486, "grad_norm": 0.41683799028396606, "learning_rate": 3.22556043990426e-05, "loss": 0.04, "step": 104200 }, { "epoch": 0.7750325097529259, "grad_norm": 0.6567792892456055, "learning_rate": 3.2236049622201714e-05, "loss": 0.041, "step": 104300 }, { "epoch": 0.7757755898198031, "grad_norm": 0.3067520260810852, "learning_rate": 3.2216494845360825e-05, "loss": 0.0448, "step": 104400 }, { "epoch": 0.7765186698866803, "grad_norm": 0.43431055545806885, "learning_rate": 3.219694006851994e-05, "loss": 0.0472, "step": 104500 }, { "epoch": 0.7772617499535575, "grad_norm": 0.6027314066886902, "learning_rate": 3.2177385291679055e-05, "loss": 0.0437, "step": 104600 }, { "epoch": 0.7780048300204347, "grad_norm": 0.2590998709201813, "learning_rate": 3.2157830514838166e-05, "loss": 0.0365, "step": 104700 }, { "epoch": 0.7787479100873119, "grad_norm": 0.30552467703819275, "learning_rate": 3.213827573799728e-05, "loss": 0.0408, "step": 104800 }, { "epoch": 0.7794909901541891, "grad_norm": 0.46247702836990356, "learning_rate": 3.2118720961156396e-05, "loss": 0.0437, "step": 104900 }, { "epoch": 0.7802340702210663, "grad_norm": 1.4189189672470093, "learning_rate": 3.209916618431551e-05, "loss": 0.0447, "step": 105000 }, { "epoch": 0.7809771502879436, "grad_norm": 0.16958165168762207, "learning_rate": 3.207961140747462e-05, "loss": 0.0388, "step": 105100 }, { "epoch": 0.7817202303548207, "grad_norm": 0.6507385969161987, "learning_rate": 3.206005663063374e-05, "loss": 0.0416, "step": 105200 }, { "epoch": 0.7824633104216979, "grad_norm": 0.0672924593091011, "learning_rate": 3.204050185379285e-05, "loss": 0.0449, "step": 105300 }, { "epoch": 0.7832063904885751, "grad_norm": 0.70489901304245, "learning_rate": 3.202094707695196e-05, "loss": 0.0382, "step": 105400 }, { "epoch": 0.7839494705554524, "grad_norm": 0.17336411774158478, "learning_rate": 3.200139230011107e-05, "loss": 0.0476, "step": 105500 }, { "epoch": 0.7846925506223296, "grad_norm": 0.6560261249542236, "learning_rate": 3.198183752327019e-05, "loss": 0.0361, "step": 105600 }, { "epoch": 0.7854356306892067, "grad_norm": 0.2691417634487152, "learning_rate": 3.19622827464293e-05, "loss": 0.0466, "step": 105700 }, { "epoch": 0.7861787107560839, "grad_norm": 0.18952178955078125, "learning_rate": 3.194272796958841e-05, "loss": 0.0388, "step": 105800 }, { "epoch": 0.7869217908229612, "grad_norm": 1.619096279144287, "learning_rate": 3.192317319274752e-05, "loss": 0.0465, "step": 105900 }, { "epoch": 0.7876648708898384, "grad_norm": 0.4931752383708954, "learning_rate": 3.190361841590664e-05, "loss": 0.0463, "step": 106000 }, { "epoch": 0.7884079509567156, "grad_norm": 0.3216817378997803, "learning_rate": 3.188406363906575e-05, "loss": 0.0498, "step": 106100 }, { "epoch": 0.7891510310235927, "grad_norm": 0.7513154745101929, "learning_rate": 3.186450886222487e-05, "loss": 0.0475, "step": 106200 }, { "epoch": 0.78989411109047, "grad_norm": 2.0174620151519775, "learning_rate": 3.1844954085383976e-05, "loss": 0.0458, "step": 106300 }, { "epoch": 0.7906371911573472, "grad_norm": 0.1989470273256302, "learning_rate": 3.1825399308543094e-05, "loss": 0.0521, "step": 106400 }, { "epoch": 0.7913802712242244, "grad_norm": 0.41141870617866516, "learning_rate": 3.1805844531702205e-05, "loss": 0.044, "step": 106500 }, { "epoch": 0.7921233512911016, "grad_norm": 0.16395708918571472, "learning_rate": 3.178628975486132e-05, "loss": 0.0446, "step": 106600 }, { "epoch": 0.7928664313579789, "grad_norm": 0.14501582086086273, "learning_rate": 3.1766734978020435e-05, "loss": 0.0439, "step": 106700 }, { "epoch": 0.793609511424856, "grad_norm": 0.19982104003429413, "learning_rate": 3.1747180201179546e-05, "loss": 0.0412, "step": 106800 }, { "epoch": 0.7943525914917332, "grad_norm": 0.1115763857960701, "learning_rate": 3.172762542433866e-05, "loss": 0.0342, "step": 106900 }, { "epoch": 0.7950956715586105, "grad_norm": 0.4395133852958679, "learning_rate": 3.1708070647497775e-05, "loss": 0.0444, "step": 107000 }, { "epoch": 0.7958387516254877, "grad_norm": 2.647773027420044, "learning_rate": 3.168851587065689e-05, "loss": 0.0533, "step": 107100 }, { "epoch": 0.7965818316923649, "grad_norm": 0.31254205107688904, "learning_rate": 3.1668961093816e-05, "loss": 0.0379, "step": 107200 }, { "epoch": 0.797324911759242, "grad_norm": 0.41884899139404297, "learning_rate": 3.164940631697511e-05, "loss": 0.038, "step": 107300 }, { "epoch": 0.7980679918261193, "grad_norm": 0.27936241030693054, "learning_rate": 3.162985154013423e-05, "loss": 0.0363, "step": 107400 }, { "epoch": 0.7988110718929965, "grad_norm": 0.6014065146446228, "learning_rate": 3.161029676329334e-05, "loss": 0.0428, "step": 107500 }, { "epoch": 0.7995541519598737, "grad_norm": 0.24908004701137543, "learning_rate": 3.159074198645246e-05, "loss": 0.0529, "step": 107600 }, { "epoch": 0.8002972320267508, "grad_norm": 0.2135940045118332, "learning_rate": 3.157118720961156e-05, "loss": 0.0468, "step": 107700 }, { "epoch": 0.8010403120936281, "grad_norm": 0.28829512000083923, "learning_rate": 3.155163243277068e-05, "loss": 0.0467, "step": 107800 }, { "epoch": 0.8017833921605053, "grad_norm": 0.42426687479019165, "learning_rate": 3.153207765592979e-05, "loss": 0.0523, "step": 107900 }, { "epoch": 0.8025264722273825, "grad_norm": 0.31698179244995117, "learning_rate": 3.151252287908891e-05, "loss": 0.0406, "step": 108000 }, { "epoch": 0.8032695522942597, "grad_norm": 0.3129708170890808, "learning_rate": 3.1492968102248014e-05, "loss": 0.0444, "step": 108100 }, { "epoch": 0.804012632361137, "grad_norm": 0.5504332780838013, "learning_rate": 3.147341332540713e-05, "loss": 0.0446, "step": 108200 }, { "epoch": 0.8047557124280141, "grad_norm": 0.3634461760520935, "learning_rate": 3.1453858548566244e-05, "loss": 0.0425, "step": 108300 }, { "epoch": 0.8054987924948913, "grad_norm": 0.622724711894989, "learning_rate": 3.143430377172536e-05, "loss": 0.0465, "step": 108400 }, { "epoch": 0.8062418725617685, "grad_norm": 0.2516930401325226, "learning_rate": 3.141474899488447e-05, "loss": 0.0467, "step": 108500 }, { "epoch": 0.8069849526286458, "grad_norm": 0.23470531404018402, "learning_rate": 3.1395194218043585e-05, "loss": 0.041, "step": 108600 }, { "epoch": 0.807728032695523, "grad_norm": 0.6466637849807739, "learning_rate": 3.1375639441202696e-05, "loss": 0.0529, "step": 108700 }, { "epoch": 0.8084711127624001, "grad_norm": 1.730777621269226, "learning_rate": 3.1356084664361814e-05, "loss": 0.042, "step": 108800 }, { "epoch": 0.8092141928292773, "grad_norm": 0.5445126891136169, "learning_rate": 3.1336529887520926e-05, "loss": 0.0349, "step": 108900 }, { "epoch": 0.8099572728961546, "grad_norm": 1.447895884513855, "learning_rate": 3.131697511068004e-05, "loss": 0.0473, "step": 109000 }, { "epoch": 0.8107003529630318, "grad_norm": 0.4344726502895355, "learning_rate": 3.129742033383915e-05, "loss": 0.051, "step": 109100 }, { "epoch": 0.811443433029909, "grad_norm": 0.847717821598053, "learning_rate": 3.1277865556998266e-05, "loss": 0.0426, "step": 109200 }, { "epoch": 0.8121865130967861, "grad_norm": 0.6739615201950073, "learning_rate": 3.125831078015738e-05, "loss": 0.0411, "step": 109300 }, { "epoch": 0.8129295931636634, "grad_norm": 0.7524177432060242, "learning_rate": 3.1238756003316496e-05, "loss": 0.0408, "step": 109400 }, { "epoch": 0.8136726732305406, "grad_norm": 0.38680708408355713, "learning_rate": 3.12192012264756e-05, "loss": 0.0372, "step": 109500 }, { "epoch": 0.8144157532974178, "grad_norm": 0.2958082854747772, "learning_rate": 3.119964644963472e-05, "loss": 0.0418, "step": 109600 }, { "epoch": 0.815158833364295, "grad_norm": 2.1549975872039795, "learning_rate": 3.118009167279383e-05, "loss": 0.042, "step": 109700 }, { "epoch": 0.8159019134311722, "grad_norm": 0.6249364018440247, "learning_rate": 3.116053689595295e-05, "loss": 0.0402, "step": 109800 }, { "epoch": 0.8166449934980494, "grad_norm": 0.5673858523368835, "learning_rate": 3.114098211911205e-05, "loss": 0.0436, "step": 109900 }, { "epoch": 0.8173880735649266, "grad_norm": 0.32838788628578186, "learning_rate": 3.112142734227117e-05, "loss": 0.0378, "step": 110000 }, { "epoch": 0.8181311536318038, "grad_norm": 0.4909050166606903, "learning_rate": 3.110187256543028e-05, "loss": 0.0389, "step": 110100 }, { "epoch": 0.8188742336986811, "grad_norm": 0.5867769718170166, "learning_rate": 3.10823177885894e-05, "loss": 0.0424, "step": 110200 }, { "epoch": 0.8196173137655582, "grad_norm": 1.011805772781372, "learning_rate": 3.106276301174851e-05, "loss": 0.0499, "step": 110300 }, { "epoch": 0.8203603938324354, "grad_norm": 0.19045884907245636, "learning_rate": 3.104320823490762e-05, "loss": 0.0414, "step": 110400 }, { "epoch": 0.8211034738993126, "grad_norm": 3.34407114982605, "learning_rate": 3.1023653458066735e-05, "loss": 0.0474, "step": 110500 }, { "epoch": 0.8218465539661899, "grad_norm": 0.6601481437683105, "learning_rate": 3.100409868122585e-05, "loss": 0.0473, "step": 110600 }, { "epoch": 0.8225896340330671, "grad_norm": 1.7228869199752808, "learning_rate": 3.0984543904384964e-05, "loss": 0.0399, "step": 110700 }, { "epoch": 0.8233327140999442, "grad_norm": 0.2937987148761749, "learning_rate": 3.096498912754408e-05, "loss": 0.037, "step": 110800 }, { "epoch": 0.8240757941668215, "grad_norm": 0.672170102596283, "learning_rate": 3.094543435070319e-05, "loss": 0.05, "step": 110900 }, { "epoch": 0.8248188742336987, "grad_norm": 0.7423655390739441, "learning_rate": 3.0925879573862305e-05, "loss": 0.0394, "step": 111000 }, { "epoch": 0.8255619543005759, "grad_norm": 0.627516508102417, "learning_rate": 3.0906324797021416e-05, "loss": 0.0513, "step": 111100 }, { "epoch": 0.8263050343674531, "grad_norm": 0.3133392632007599, "learning_rate": 3.0886770020180535e-05, "loss": 0.0399, "step": 111200 }, { "epoch": 0.8270481144343304, "grad_norm": 0.4979921281337738, "learning_rate": 3.086721524333964e-05, "loss": 0.0473, "step": 111300 }, { "epoch": 0.8277911945012075, "grad_norm": 0.4969426989555359, "learning_rate": 3.084766046649876e-05, "loss": 0.0405, "step": 111400 }, { "epoch": 0.8285342745680847, "grad_norm": 0.7501845359802246, "learning_rate": 3.082810568965787e-05, "loss": 0.0437, "step": 111500 }, { "epoch": 0.8292773546349619, "grad_norm": 0.30016160011291504, "learning_rate": 3.080855091281699e-05, "loss": 0.0386, "step": 111600 }, { "epoch": 0.8300204347018392, "grad_norm": 0.4853677749633789, "learning_rate": 3.07889961359761e-05, "loss": 0.0494, "step": 111700 }, { "epoch": 0.8307635147687163, "grad_norm": 0.3351646661758423, "learning_rate": 3.076944135913521e-05, "loss": 0.0418, "step": 111800 }, { "epoch": 0.8315065948355935, "grad_norm": 0.3229526877403259, "learning_rate": 3.074988658229432e-05, "loss": 0.047, "step": 111900 }, { "epoch": 0.8322496749024707, "grad_norm": 0.3285624384880066, "learning_rate": 3.073033180545344e-05, "loss": 0.0484, "step": 112000 }, { "epoch": 0.832992754969348, "grad_norm": 0.4498685896396637, "learning_rate": 3.071077702861255e-05, "loss": 0.0409, "step": 112100 }, { "epoch": 0.8337358350362252, "grad_norm": 0.6998697519302368, "learning_rate": 3.069122225177166e-05, "loss": 0.0465, "step": 112200 }, { "epoch": 0.8344789151031023, "grad_norm": 0.2041853964328766, "learning_rate": 3.067166747493077e-05, "loss": 0.0462, "step": 112300 }, { "epoch": 0.8352219951699795, "grad_norm": 0.4701375961303711, "learning_rate": 3.065211269808989e-05, "loss": 0.0429, "step": 112400 }, { "epoch": 0.8359650752368568, "grad_norm": 0.2932807505130768, "learning_rate": 3.0632557921249e-05, "loss": 0.0383, "step": 112500 }, { "epoch": 0.836708155303734, "grad_norm": 0.37055501341819763, "learning_rate": 3.061300314440812e-05, "loss": 0.0376, "step": 112600 }, { "epoch": 0.8374512353706112, "grad_norm": 0.34832242131233215, "learning_rate": 3.0593448367567226e-05, "loss": 0.0382, "step": 112700 }, { "epoch": 0.8381943154374883, "grad_norm": 0.3855780065059662, "learning_rate": 3.0573893590726344e-05, "loss": 0.0449, "step": 112800 }, { "epoch": 0.8389373955043656, "grad_norm": 0.5029892325401306, "learning_rate": 3.0554338813885455e-05, "loss": 0.0495, "step": 112900 }, { "epoch": 0.8396804755712428, "grad_norm": 0.4716889262199402, "learning_rate": 3.053478403704457e-05, "loss": 0.0384, "step": 113000 }, { "epoch": 0.84042355563812, "grad_norm": 0.3301020860671997, "learning_rate": 3.051522926020368e-05, "loss": 0.0425, "step": 113100 }, { "epoch": 0.8411666357049972, "grad_norm": 0.14475572109222412, "learning_rate": 3.0495674483362796e-05, "loss": 0.0449, "step": 113200 }, { "epoch": 0.8419097157718745, "grad_norm": 0.3445386290550232, "learning_rate": 3.047611970652191e-05, "loss": 0.0425, "step": 113300 }, { "epoch": 0.8426527958387516, "grad_norm": 0.2538638710975647, "learning_rate": 3.0456564929681026e-05, "loss": 0.0399, "step": 113400 }, { "epoch": 0.8433958759056288, "grad_norm": 0.7757213115692139, "learning_rate": 3.043701015284014e-05, "loss": 0.0401, "step": 113500 }, { "epoch": 0.844138955972506, "grad_norm": 0.8526016473770142, "learning_rate": 3.041745537599925e-05, "loss": 0.0393, "step": 113600 }, { "epoch": 0.8448820360393833, "grad_norm": 0.2816007435321808, "learning_rate": 3.0397900599158363e-05, "loss": 0.0372, "step": 113700 }, { "epoch": 0.8456251161062605, "grad_norm": 0.5152889490127563, "learning_rate": 3.0378345822317478e-05, "loss": 0.0468, "step": 113800 }, { "epoch": 0.8463681961731376, "grad_norm": 0.38516074419021606, "learning_rate": 3.0358791045476593e-05, "loss": 0.0441, "step": 113900 }, { "epoch": 0.8471112762400148, "grad_norm": 0.43191707134246826, "learning_rate": 3.03392362686357e-05, "loss": 0.0384, "step": 114000 }, { "epoch": 0.8478543563068921, "grad_norm": 0.23803560435771942, "learning_rate": 3.0319681491794815e-05, "loss": 0.043, "step": 114100 }, { "epoch": 0.8485974363737693, "grad_norm": 0.22906525433063507, "learning_rate": 3.030012671495393e-05, "loss": 0.0511, "step": 114200 }, { "epoch": 0.8493405164406465, "grad_norm": 0.8549457788467407, "learning_rate": 3.0280571938113045e-05, "loss": 0.0407, "step": 114300 }, { "epoch": 0.8500835965075236, "grad_norm": 0.18412131071090698, "learning_rate": 3.026101716127216e-05, "loss": 0.0371, "step": 114400 }, { "epoch": 0.8508266765744009, "grad_norm": 0.47394225001335144, "learning_rate": 3.0241462384431268e-05, "loss": 0.0415, "step": 114500 }, { "epoch": 0.8515697566412781, "grad_norm": 0.4633588194847107, "learning_rate": 3.0221907607590382e-05, "loss": 0.0456, "step": 114600 }, { "epoch": 0.8523128367081553, "grad_norm": 0.1712949573993683, "learning_rate": 3.0202352830749497e-05, "loss": 0.0362, "step": 114700 }, { "epoch": 0.8530559167750326, "grad_norm": 0.4578275978565216, "learning_rate": 3.0182798053908612e-05, "loss": 0.0434, "step": 114800 }, { "epoch": 0.8537989968419097, "grad_norm": 0.8473727107048035, "learning_rate": 3.016324327706772e-05, "loss": 0.0429, "step": 114900 }, { "epoch": 0.8545420769087869, "grad_norm": 0.3080345690250397, "learning_rate": 3.0143688500226835e-05, "loss": 0.0458, "step": 115000 }, { "epoch": 0.8552851569756641, "grad_norm": 0.3397873640060425, "learning_rate": 3.012413372338595e-05, "loss": 0.0454, "step": 115100 }, { "epoch": 0.8560282370425414, "grad_norm": 2.243802785873413, "learning_rate": 3.0104578946545064e-05, "loss": 0.0392, "step": 115200 }, { "epoch": 0.8567713171094186, "grad_norm": 0.4054226279258728, "learning_rate": 3.008502416970418e-05, "loss": 0.0448, "step": 115300 }, { "epoch": 0.8575143971762957, "grad_norm": 0.5639569163322449, "learning_rate": 3.0065469392863287e-05, "loss": 0.047, "step": 115400 }, { "epoch": 0.8582574772431729, "grad_norm": 0.19753554463386536, "learning_rate": 3.0045914616022402e-05, "loss": 0.0465, "step": 115500 }, { "epoch": 0.8590005573100502, "grad_norm": 0.365671306848526, "learning_rate": 3.0026359839181517e-05, "loss": 0.0443, "step": 115600 }, { "epoch": 0.8597436373769274, "grad_norm": 1.4803117513656616, "learning_rate": 3.000680506234063e-05, "loss": 0.0436, "step": 115700 }, { "epoch": 0.8604867174438046, "grad_norm": 0.48384329676628113, "learning_rate": 2.998725028549974e-05, "loss": 0.0441, "step": 115800 }, { "epoch": 0.8612297975106817, "grad_norm": 0.17809706926345825, "learning_rate": 2.9967695508658854e-05, "loss": 0.0487, "step": 115900 }, { "epoch": 0.861972877577559, "grad_norm": 0.3482704162597656, "learning_rate": 2.994814073181797e-05, "loss": 0.0449, "step": 116000 }, { "epoch": 0.8627159576444362, "grad_norm": 0.41809067130088806, "learning_rate": 2.9928585954977084e-05, "loss": 0.0416, "step": 116100 }, { "epoch": 0.8634590377113134, "grad_norm": 0.7231165766716003, "learning_rate": 2.99090311781362e-05, "loss": 0.0343, "step": 116200 }, { "epoch": 0.8642021177781906, "grad_norm": 0.3746318221092224, "learning_rate": 2.9889476401295306e-05, "loss": 0.04, "step": 116300 }, { "epoch": 0.8649451978450678, "grad_norm": 0.28151172399520874, "learning_rate": 2.986992162445442e-05, "loss": 0.0426, "step": 116400 }, { "epoch": 0.865688277911945, "grad_norm": 0.4022485613822937, "learning_rate": 2.9850366847613536e-05, "loss": 0.0439, "step": 116500 }, { "epoch": 0.8664313579788222, "grad_norm": 0.1535918265581131, "learning_rate": 2.983081207077265e-05, "loss": 0.0418, "step": 116600 }, { "epoch": 0.8671744380456994, "grad_norm": 0.41668128967285156, "learning_rate": 2.9811257293931765e-05, "loss": 0.0399, "step": 116700 }, { "epoch": 0.8679175181125767, "grad_norm": 0.3129327893257141, "learning_rate": 2.9791702517090873e-05, "loss": 0.0421, "step": 116800 }, { "epoch": 0.8686605981794538, "grad_norm": 0.633822500705719, "learning_rate": 2.9772147740249988e-05, "loss": 0.0428, "step": 116900 }, { "epoch": 0.869403678246331, "grad_norm": 0.11655830591917038, "learning_rate": 2.9752592963409103e-05, "loss": 0.0502, "step": 117000 }, { "epoch": 0.8701467583132082, "grad_norm": 0.5250160694122314, "learning_rate": 2.9733038186568218e-05, "loss": 0.0386, "step": 117100 }, { "epoch": 0.8708898383800855, "grad_norm": 0.17734429240226746, "learning_rate": 2.971348340972733e-05, "loss": 0.0349, "step": 117200 }, { "epoch": 0.8716329184469627, "grad_norm": 0.6379687786102295, "learning_rate": 2.969392863288644e-05, "loss": 0.0395, "step": 117300 }, { "epoch": 0.8723759985138398, "grad_norm": 0.06623728573322296, "learning_rate": 2.9674373856045555e-05, "loss": 0.0397, "step": 117400 }, { "epoch": 0.873119078580717, "grad_norm": 0.9205065369606018, "learning_rate": 2.965481907920467e-05, "loss": 0.0366, "step": 117500 }, { "epoch": 0.8738621586475943, "grad_norm": 0.555588960647583, "learning_rate": 2.9635264302363785e-05, "loss": 0.0483, "step": 117600 }, { "epoch": 0.8746052387144715, "grad_norm": 0.673251211643219, "learning_rate": 2.9615709525522896e-05, "loss": 0.0405, "step": 117700 }, { "epoch": 0.8753483187813487, "grad_norm": 0.17188413441181183, "learning_rate": 2.9596154748682007e-05, "loss": 0.0465, "step": 117800 }, { "epoch": 0.8760913988482258, "grad_norm": 0.8362275958061218, "learning_rate": 2.9576599971841122e-05, "loss": 0.0425, "step": 117900 }, { "epoch": 0.8768344789151031, "grad_norm": 0.5028262734413147, "learning_rate": 2.9557045195000237e-05, "loss": 0.0524, "step": 118000 }, { "epoch": 0.8775775589819803, "grad_norm": 0.7266266942024231, "learning_rate": 2.953749041815935e-05, "loss": 0.0428, "step": 118100 }, { "epoch": 0.8783206390488575, "grad_norm": 0.3126924932003021, "learning_rate": 2.9517935641318463e-05, "loss": 0.0516, "step": 118200 }, { "epoch": 0.8790637191157347, "grad_norm": 0.9735151529312134, "learning_rate": 2.9498380864477575e-05, "loss": 0.0359, "step": 118300 }, { "epoch": 0.879806799182612, "grad_norm": 0.524466872215271, "learning_rate": 2.947882608763669e-05, "loss": 0.0409, "step": 118400 }, { "epoch": 0.8805498792494891, "grad_norm": 0.4400867819786072, "learning_rate": 2.9459271310795804e-05, "loss": 0.0364, "step": 118500 }, { "epoch": 0.8812929593163663, "grad_norm": 0.8360695242881775, "learning_rate": 2.9439716533954915e-05, "loss": 0.0473, "step": 118600 }, { "epoch": 0.8820360393832436, "grad_norm": 0.34002402424812317, "learning_rate": 2.942016175711403e-05, "loss": 0.0565, "step": 118700 }, { "epoch": 0.8827791194501208, "grad_norm": 0.7345854640007019, "learning_rate": 2.940060698027314e-05, "loss": 0.0414, "step": 118800 }, { "epoch": 0.883522199516998, "grad_norm": 0.9701108932495117, "learning_rate": 2.9381052203432256e-05, "loss": 0.0456, "step": 118900 }, { "epoch": 0.8842652795838751, "grad_norm": 0.7061591744422913, "learning_rate": 2.9361497426591368e-05, "loss": 0.0425, "step": 119000 }, { "epoch": 0.8850083596507524, "grad_norm": 0.4753588140010834, "learning_rate": 2.9341942649750482e-05, "loss": 0.0399, "step": 119100 }, { "epoch": 0.8857514397176296, "grad_norm": 0.16470690071582794, "learning_rate": 2.9322387872909597e-05, "loss": 0.04, "step": 119200 }, { "epoch": 0.8864945197845068, "grad_norm": 0.15938585996627808, "learning_rate": 2.9302833096068712e-05, "loss": 0.0438, "step": 119300 }, { "epoch": 0.887237599851384, "grad_norm": 0.47142156958580017, "learning_rate": 2.9283278319227823e-05, "loss": 0.0421, "step": 119400 }, { "epoch": 0.8879806799182612, "grad_norm": 0.9223182797431946, "learning_rate": 2.9263723542386935e-05, "loss": 0.0414, "step": 119500 }, { "epoch": 0.8887237599851384, "grad_norm": 1.0443511009216309, "learning_rate": 2.924416876554605e-05, "loss": 0.0379, "step": 119600 }, { "epoch": 0.8894668400520156, "grad_norm": 0.5240704417228699, "learning_rate": 2.9224613988705164e-05, "loss": 0.0558, "step": 119700 }, { "epoch": 0.8902099201188928, "grad_norm": 0.17435157299041748, "learning_rate": 2.920505921186428e-05, "loss": 0.0429, "step": 119800 }, { "epoch": 0.8909530001857701, "grad_norm": 0.5454820990562439, "learning_rate": 2.9185504435023387e-05, "loss": 0.0453, "step": 119900 }, { "epoch": 0.8916960802526472, "grad_norm": 4.058526515960693, "learning_rate": 2.9165949658182502e-05, "loss": 0.0437, "step": 120000 }, { "epoch": 0.8924391603195244, "grad_norm": 0.4385039210319519, "learning_rate": 2.9146394881341617e-05, "loss": 0.0445, "step": 120100 }, { "epoch": 0.8931822403864016, "grad_norm": 0.3762959837913513, "learning_rate": 2.912684010450073e-05, "loss": 0.0436, "step": 120200 }, { "epoch": 0.8939253204532789, "grad_norm": 0.36264607310295105, "learning_rate": 2.9107285327659846e-05, "loss": 0.0404, "step": 120300 }, { "epoch": 0.8946684005201561, "grad_norm": 4.052135944366455, "learning_rate": 2.9087730550818954e-05, "loss": 0.0453, "step": 120400 }, { "epoch": 0.8954114805870332, "grad_norm": 0.4942890703678131, "learning_rate": 2.906817577397807e-05, "loss": 0.0489, "step": 120500 }, { "epoch": 0.8961545606539104, "grad_norm": 0.3263453245162964, "learning_rate": 2.9048620997137184e-05, "loss": 0.0449, "step": 120600 }, { "epoch": 0.8968976407207877, "grad_norm": 1.6274800300598145, "learning_rate": 2.90290662202963e-05, "loss": 0.0432, "step": 120700 }, { "epoch": 0.8976407207876649, "grad_norm": 0.23678019642829895, "learning_rate": 2.9009511443455406e-05, "loss": 0.0388, "step": 120800 }, { "epoch": 0.8983838008545421, "grad_norm": 0.34514862298965454, "learning_rate": 2.898995666661452e-05, "loss": 0.0467, "step": 120900 }, { "epoch": 0.8991268809214192, "grad_norm": 0.7210121154785156, "learning_rate": 2.8970401889773636e-05, "loss": 0.0412, "step": 121000 }, { "epoch": 0.8998699609882965, "grad_norm": 1.1281719207763672, "learning_rate": 2.895084711293275e-05, "loss": 0.0451, "step": 121100 }, { "epoch": 0.9006130410551737, "grad_norm": 0.17034655809402466, "learning_rate": 2.8931292336091865e-05, "loss": 0.0418, "step": 121200 }, { "epoch": 0.9013561211220509, "grad_norm": 0.14188611507415771, "learning_rate": 2.8911737559250973e-05, "loss": 0.0443, "step": 121300 }, { "epoch": 0.9020992011889281, "grad_norm": 0.7282285690307617, "learning_rate": 2.8892182782410088e-05, "loss": 0.0418, "step": 121400 }, { "epoch": 0.9028422812558053, "grad_norm": 0.13386410474777222, "learning_rate": 2.8872628005569203e-05, "loss": 0.0411, "step": 121500 }, { "epoch": 0.9035853613226825, "grad_norm": 0.45946866273880005, "learning_rate": 2.8853073228728318e-05, "loss": 0.046, "step": 121600 }, { "epoch": 0.9043284413895597, "grad_norm": 0.49198809266090393, "learning_rate": 2.8833518451887426e-05, "loss": 0.0402, "step": 121700 }, { "epoch": 0.9050715214564369, "grad_norm": 0.6152574419975281, "learning_rate": 2.881396367504654e-05, "loss": 0.0415, "step": 121800 }, { "epoch": 0.9058146015233142, "grad_norm": 0.4153579771518707, "learning_rate": 2.8794408898205655e-05, "loss": 0.0382, "step": 121900 }, { "epoch": 0.9065576815901913, "grad_norm": 0.17249228060245514, "learning_rate": 2.877485412136477e-05, "loss": 0.0417, "step": 122000 }, { "epoch": 0.9073007616570685, "grad_norm": 0.32870739698410034, "learning_rate": 2.8755299344523885e-05, "loss": 0.0445, "step": 122100 }, { "epoch": 0.9080438417239458, "grad_norm": 0.6010507941246033, "learning_rate": 2.8735744567682993e-05, "loss": 0.0483, "step": 122200 }, { "epoch": 0.908786921790823, "grad_norm": 0.5077258348464966, "learning_rate": 2.8716189790842107e-05, "loss": 0.037, "step": 122300 }, { "epoch": 0.9095300018577002, "grad_norm": 0.15593284368515015, "learning_rate": 2.8696635014001222e-05, "loss": 0.0363, "step": 122400 }, { "epoch": 0.9102730819245773, "grad_norm": 0.5265464782714844, "learning_rate": 2.8677080237160337e-05, "loss": 0.0441, "step": 122500 }, { "epoch": 0.9110161619914546, "grad_norm": 0.22402919828891754, "learning_rate": 2.8657525460319452e-05, "loss": 0.0538, "step": 122600 }, { "epoch": 0.9117592420583318, "grad_norm": 0.7765402793884277, "learning_rate": 2.863797068347856e-05, "loss": 0.0414, "step": 122700 }, { "epoch": 0.912502322125209, "grad_norm": 0.2653655707836151, "learning_rate": 2.8618415906637675e-05, "loss": 0.0458, "step": 122800 }, { "epoch": 0.9132454021920862, "grad_norm": 0.3179582953453064, "learning_rate": 2.859886112979679e-05, "loss": 0.0395, "step": 122900 }, { "epoch": 0.9139884822589635, "grad_norm": 0.6429779529571533, "learning_rate": 2.8579306352955904e-05, "loss": 0.0466, "step": 123000 }, { "epoch": 0.9147315623258406, "grad_norm": 0.6170157790184021, "learning_rate": 2.8559751576115012e-05, "loss": 0.0362, "step": 123100 }, { "epoch": 0.9154746423927178, "grad_norm": 0.5167025327682495, "learning_rate": 2.8540196799274127e-05, "loss": 0.0455, "step": 123200 }, { "epoch": 0.916217722459595, "grad_norm": 0.466552734375, "learning_rate": 2.852064202243324e-05, "loss": 0.0614, "step": 123300 }, { "epoch": 0.9169608025264723, "grad_norm": 0.2650231420993805, "learning_rate": 2.8501087245592356e-05, "loss": 0.0474, "step": 123400 }, { "epoch": 0.9177038825933495, "grad_norm": 0.3955620527267456, "learning_rate": 2.848153246875147e-05, "loss": 0.0397, "step": 123500 }, { "epoch": 0.9184469626602266, "grad_norm": 2.033181667327881, "learning_rate": 2.846197769191058e-05, "loss": 0.0389, "step": 123600 }, { "epoch": 0.9191900427271038, "grad_norm": 2.9279379844665527, "learning_rate": 2.8442422915069694e-05, "loss": 0.0405, "step": 123700 }, { "epoch": 0.9199331227939811, "grad_norm": 3.5691843032836914, "learning_rate": 2.842286813822881e-05, "loss": 0.0443, "step": 123800 }, { "epoch": 0.9206762028608583, "grad_norm": 0.39710742235183716, "learning_rate": 2.8403313361387923e-05, "loss": 0.0406, "step": 123900 }, { "epoch": 0.9214192829277355, "grad_norm": 0.4193434715270996, "learning_rate": 2.838375858454703e-05, "loss": 0.0506, "step": 124000 }, { "epoch": 0.9221623629946126, "grad_norm": 0.4925794005393982, "learning_rate": 2.8364203807706146e-05, "loss": 0.042, "step": 124100 }, { "epoch": 0.9229054430614899, "grad_norm": 0.47151580452919006, "learning_rate": 2.834464903086526e-05, "loss": 0.0414, "step": 124200 }, { "epoch": 0.9236485231283671, "grad_norm": 0.8934033513069153, "learning_rate": 2.8325094254024376e-05, "loss": 0.0389, "step": 124300 }, { "epoch": 0.9243916031952443, "grad_norm": 0.33863556385040283, "learning_rate": 2.830553947718349e-05, "loss": 0.0361, "step": 124400 }, { "epoch": 0.9251346832621214, "grad_norm": 0.561432421207428, "learning_rate": 2.82859847003426e-05, "loss": 0.0414, "step": 124500 }, { "epoch": 0.9258777633289987, "grad_norm": 6.158478260040283, "learning_rate": 2.8266429923501713e-05, "loss": 0.0449, "step": 124600 }, { "epoch": 0.9266208433958759, "grad_norm": 0.3530873954296112, "learning_rate": 2.8246875146660828e-05, "loss": 0.0334, "step": 124700 }, { "epoch": 0.9273639234627531, "grad_norm": 0.7516241073608398, "learning_rate": 2.8227320369819943e-05, "loss": 0.037, "step": 124800 }, { "epoch": 0.9281070035296303, "grad_norm": 0.5780965089797974, "learning_rate": 2.820776559297905e-05, "loss": 0.0386, "step": 124900 }, { "epoch": 0.9288500835965076, "grad_norm": 0.23343157768249512, "learning_rate": 2.8188210816138165e-05, "loss": 0.0344, "step": 125000 }, { "epoch": 0.9295931636633847, "grad_norm": 1.3340444564819336, "learning_rate": 2.816865603929728e-05, "loss": 0.0423, "step": 125100 }, { "epoch": 0.9303362437302619, "grad_norm": 0.3049847483634949, "learning_rate": 2.8149101262456395e-05, "loss": 0.0367, "step": 125200 }, { "epoch": 0.9310793237971391, "grad_norm": 0.6216754913330078, "learning_rate": 2.812954648561551e-05, "loss": 0.0397, "step": 125300 }, { "epoch": 0.9318224038640164, "grad_norm": 0.17765510082244873, "learning_rate": 2.8109991708774618e-05, "loss": 0.0382, "step": 125400 }, { "epoch": 0.9325654839308936, "grad_norm": 0.5393523573875427, "learning_rate": 2.8090436931933733e-05, "loss": 0.0378, "step": 125500 }, { "epoch": 0.9333085639977707, "grad_norm": 0.14772993326187134, "learning_rate": 2.8070882155092847e-05, "loss": 0.0421, "step": 125600 }, { "epoch": 0.9340516440646479, "grad_norm": 0.9722098708152771, "learning_rate": 2.8051327378251962e-05, "loss": 0.0445, "step": 125700 }, { "epoch": 0.9347947241315252, "grad_norm": 1.1559133529663086, "learning_rate": 2.803177260141107e-05, "loss": 0.0435, "step": 125800 }, { "epoch": 0.9355378041984024, "grad_norm": 0.3272378742694855, "learning_rate": 2.8012217824570185e-05, "loss": 0.0547, "step": 125900 }, { "epoch": 0.9362808842652796, "grad_norm": 1.2135850191116333, "learning_rate": 2.79926630477293e-05, "loss": 0.043, "step": 126000 }, { "epoch": 0.9370239643321568, "grad_norm": 0.4183587431907654, "learning_rate": 2.7973108270888414e-05, "loss": 0.0424, "step": 126100 }, { "epoch": 0.937767044399034, "grad_norm": 0.5988611578941345, "learning_rate": 2.795355349404753e-05, "loss": 0.0453, "step": 126200 }, { "epoch": 0.9385101244659112, "grad_norm": 0.6442294716835022, "learning_rate": 2.7933998717206637e-05, "loss": 0.0547, "step": 126300 }, { "epoch": 0.9392532045327884, "grad_norm": 1.1175339221954346, "learning_rate": 2.7914443940365752e-05, "loss": 0.0396, "step": 126400 }, { "epoch": 0.9399962845996657, "grad_norm": 1.3308956623077393, "learning_rate": 2.7894889163524867e-05, "loss": 0.0533, "step": 126500 }, { "epoch": 0.9407393646665428, "grad_norm": 0.2300749272108078, "learning_rate": 2.787533438668398e-05, "loss": 0.0476, "step": 126600 }, { "epoch": 0.94148244473342, "grad_norm": 0.45765212178230286, "learning_rate": 2.7855779609843093e-05, "loss": 0.0422, "step": 126700 }, { "epoch": 0.9422255248002972, "grad_norm": 0.4886147379875183, "learning_rate": 2.7836224833002204e-05, "loss": 0.0444, "step": 126800 }, { "epoch": 0.9429686048671745, "grad_norm": 0.4330025613307953, "learning_rate": 2.781667005616132e-05, "loss": 0.0406, "step": 126900 }, { "epoch": 0.9437116849340517, "grad_norm": 0.47758743166923523, "learning_rate": 2.7797115279320434e-05, "loss": 0.0486, "step": 127000 }, { "epoch": 0.9444547650009288, "grad_norm": 0.23165947198867798, "learning_rate": 2.777756050247955e-05, "loss": 0.0418, "step": 127100 }, { "epoch": 0.945197845067806, "grad_norm": 0.47796764969825745, "learning_rate": 2.775800572563866e-05, "loss": 0.0397, "step": 127200 }, { "epoch": 0.9459409251346833, "grad_norm": 0.4682827889919281, "learning_rate": 2.773845094879777e-05, "loss": 0.0408, "step": 127300 }, { "epoch": 0.9466840052015605, "grad_norm": 0.3710649311542511, "learning_rate": 2.7718896171956886e-05, "loss": 0.0376, "step": 127400 }, { "epoch": 0.9474270852684377, "grad_norm": 0.3754519820213318, "learning_rate": 2.7699341395116e-05, "loss": 0.0404, "step": 127500 }, { "epoch": 0.9481701653353148, "grad_norm": 0.9763498306274414, "learning_rate": 2.7679786618275112e-05, "loss": 0.0396, "step": 127600 }, { "epoch": 0.9489132454021921, "grad_norm": 0.3777889907360077, "learning_rate": 2.7660231841434227e-05, "loss": 0.0394, "step": 127700 }, { "epoch": 0.9496563254690693, "grad_norm": 0.5196406245231628, "learning_rate": 2.764067706459334e-05, "loss": 0.042, "step": 127800 }, { "epoch": 0.9503994055359465, "grad_norm": 0.15122249722480774, "learning_rate": 2.7621122287752453e-05, "loss": 0.0433, "step": 127900 }, { "epoch": 0.9511424856028237, "grad_norm": 0.3191467821598053, "learning_rate": 2.7601567510911568e-05, "loss": 0.0486, "step": 128000 }, { "epoch": 0.951885565669701, "grad_norm": 0.6399619579315186, "learning_rate": 2.758201273407068e-05, "loss": 0.0445, "step": 128100 }, { "epoch": 0.9526286457365781, "grad_norm": 0.23300954699516296, "learning_rate": 2.7562457957229794e-05, "loss": 0.0396, "step": 128200 }, { "epoch": 0.9533717258034553, "grad_norm": 0.5160897970199585, "learning_rate": 2.754290318038891e-05, "loss": 0.0379, "step": 128300 }, { "epoch": 0.9541148058703325, "grad_norm": 0.08089996874332428, "learning_rate": 2.752334840354802e-05, "loss": 0.0481, "step": 128400 }, { "epoch": 0.9548578859372098, "grad_norm": 0.4286595284938812, "learning_rate": 2.7503793626707135e-05, "loss": 0.041, "step": 128500 }, { "epoch": 0.955600966004087, "grad_norm": 0.6478473544120789, "learning_rate": 2.7484238849866246e-05, "loss": 0.037, "step": 128600 }, { "epoch": 0.9563440460709641, "grad_norm": 0.0910152718424797, "learning_rate": 2.746468407302536e-05, "loss": 0.0447, "step": 128700 }, { "epoch": 0.9570871261378413, "grad_norm": 2.014763832092285, "learning_rate": 2.7445129296184476e-05, "loss": 0.0416, "step": 128800 }, { "epoch": 0.9578302062047186, "grad_norm": 0.5565311312675476, "learning_rate": 2.7425574519343587e-05, "loss": 0.0326, "step": 128900 }, { "epoch": 0.9585732862715958, "grad_norm": 0.5130266547203064, "learning_rate": 2.74060197425027e-05, "loss": 0.0407, "step": 129000 }, { "epoch": 0.959316366338473, "grad_norm": 0.37133368849754333, "learning_rate": 2.7386464965661813e-05, "loss": 0.0453, "step": 129100 }, { "epoch": 0.9600594464053501, "grad_norm": 0.527446448802948, "learning_rate": 2.7366910188820928e-05, "loss": 0.0453, "step": 129200 }, { "epoch": 0.9608025264722274, "grad_norm": 0.9273298382759094, "learning_rate": 2.7347355411980043e-05, "loss": 0.0415, "step": 129300 }, { "epoch": 0.9615456065391046, "grad_norm": 1.0435045957565308, "learning_rate": 2.7327800635139154e-05, "loss": 0.0369, "step": 129400 }, { "epoch": 0.9622886866059818, "grad_norm": 0.46657752990722656, "learning_rate": 2.7308245858298266e-05, "loss": 0.0386, "step": 129500 }, { "epoch": 0.963031766672859, "grad_norm": 0.531538724899292, "learning_rate": 2.728869108145738e-05, "loss": 0.0385, "step": 129600 }, { "epoch": 0.9637748467397362, "grad_norm": 0.6513416171073914, "learning_rate": 2.7269136304616495e-05, "loss": 0.0415, "step": 129700 }, { "epoch": 0.9645179268066134, "grad_norm": 1.8642324209213257, "learning_rate": 2.724958152777561e-05, "loss": 0.0436, "step": 129800 }, { "epoch": 0.9652610068734906, "grad_norm": 0.8569243550300598, "learning_rate": 2.7230026750934718e-05, "loss": 0.0424, "step": 129900 }, { "epoch": 0.9660040869403679, "grad_norm": 0.5236343145370483, "learning_rate": 2.7210471974093833e-05, "loss": 0.0365, "step": 130000 }, { "epoch": 0.9667471670072451, "grad_norm": 0.5481681823730469, "learning_rate": 2.7190917197252947e-05, "loss": 0.0429, "step": 130100 }, { "epoch": 0.9674902470741222, "grad_norm": 0.4324844479560852, "learning_rate": 2.7171362420412062e-05, "loss": 0.0397, "step": 130200 }, { "epoch": 0.9682333271409994, "grad_norm": 0.5807719230651855, "learning_rate": 2.7151807643571177e-05, "loss": 0.0454, "step": 130300 }, { "epoch": 0.9689764072078767, "grad_norm": 0.1405334770679474, "learning_rate": 2.7132252866730285e-05, "loss": 0.0397, "step": 130400 }, { "epoch": 0.9697194872747539, "grad_norm": 0.7031111121177673, "learning_rate": 2.71126980898894e-05, "loss": 0.0362, "step": 130500 }, { "epoch": 0.970462567341631, "grad_norm": 0.7030878663063049, "learning_rate": 2.7093143313048514e-05, "loss": 0.0423, "step": 130600 }, { "epoch": 0.9712056474085082, "grad_norm": 0.5331902503967285, "learning_rate": 2.707358853620763e-05, "loss": 0.0427, "step": 130700 }, { "epoch": 0.9719487274753855, "grad_norm": 0.2882708013057709, "learning_rate": 2.7054033759366737e-05, "loss": 0.0389, "step": 130800 }, { "epoch": 0.9726918075422627, "grad_norm": 0.39175233244895935, "learning_rate": 2.7034478982525852e-05, "loss": 0.0428, "step": 130900 }, { "epoch": 0.9734348876091399, "grad_norm": 0.9995160102844238, "learning_rate": 2.7014924205684967e-05, "loss": 0.0376, "step": 131000 }, { "epoch": 0.974177967676017, "grad_norm": 1.1922128200531006, "learning_rate": 2.699536942884408e-05, "loss": 0.0418, "step": 131100 }, { "epoch": 0.9749210477428943, "grad_norm": 2.2676737308502197, "learning_rate": 2.6975814652003196e-05, "loss": 0.0336, "step": 131200 }, { "epoch": 0.9756641278097715, "grad_norm": 0.41412976384162903, "learning_rate": 2.6956259875162304e-05, "loss": 0.0405, "step": 131300 }, { "epoch": 0.9764072078766487, "grad_norm": 0.3808678090572357, "learning_rate": 2.693670509832142e-05, "loss": 0.0432, "step": 131400 }, { "epoch": 0.9771502879435259, "grad_norm": 3.683314561843872, "learning_rate": 2.6917150321480534e-05, "loss": 0.0379, "step": 131500 }, { "epoch": 0.9778933680104032, "grad_norm": 1.0186352729797363, "learning_rate": 2.689759554463965e-05, "loss": 0.0409, "step": 131600 }, { "epoch": 0.9786364480772803, "grad_norm": 0.4463963508605957, "learning_rate": 2.6878040767798756e-05, "loss": 0.0406, "step": 131700 }, { "epoch": 0.9793795281441575, "grad_norm": 0.3784034848213196, "learning_rate": 2.685848599095787e-05, "loss": 0.0494, "step": 131800 }, { "epoch": 0.9801226082110347, "grad_norm": 0.5714205503463745, "learning_rate": 2.6838931214116986e-05, "loss": 0.0455, "step": 131900 }, { "epoch": 0.980865688277912, "grad_norm": 0.6880394220352173, "learning_rate": 2.68193764372761e-05, "loss": 0.0417, "step": 132000 }, { "epoch": 0.9816087683447892, "grad_norm": 1.1087974309921265, "learning_rate": 2.6799821660435216e-05, "loss": 0.0433, "step": 132100 }, { "epoch": 0.9823518484116663, "grad_norm": 0.19475625455379486, "learning_rate": 2.6780266883594324e-05, "loss": 0.0364, "step": 132200 }, { "epoch": 0.9830949284785435, "grad_norm": 0.5192739367485046, "learning_rate": 2.6760712106753438e-05, "loss": 0.0392, "step": 132300 }, { "epoch": 0.9838380085454208, "grad_norm": 0.3102996349334717, "learning_rate": 2.6741157329912553e-05, "loss": 0.0407, "step": 132400 }, { "epoch": 0.984581088612298, "grad_norm": 0.4617522656917572, "learning_rate": 2.6721602553071668e-05, "loss": 0.0466, "step": 132500 }, { "epoch": 0.9853241686791752, "grad_norm": 0.22478556632995605, "learning_rate": 2.6702047776230776e-05, "loss": 0.0409, "step": 132600 }, { "epoch": 0.9860672487460523, "grad_norm": 0.6935635209083557, "learning_rate": 2.668249299938989e-05, "loss": 0.0378, "step": 132700 }, { "epoch": 0.9868103288129296, "grad_norm": 0.4571967124938965, "learning_rate": 2.6662938222549005e-05, "loss": 0.0386, "step": 132800 }, { "epoch": 0.9875534088798068, "grad_norm": 0.9539922475814819, "learning_rate": 2.664338344570812e-05, "loss": 0.0338, "step": 132900 }, { "epoch": 0.988296488946684, "grad_norm": 0.5826877355575562, "learning_rate": 2.6623828668867235e-05, "loss": 0.0377, "step": 133000 }, { "epoch": 0.9890395690135612, "grad_norm": 2.026881217956543, "learning_rate": 2.6604273892026343e-05, "loss": 0.0422, "step": 133100 }, { "epoch": 0.9897826490804384, "grad_norm": 0.48808446526527405, "learning_rate": 2.6584719115185458e-05, "loss": 0.0477, "step": 133200 }, { "epoch": 0.9905257291473156, "grad_norm": 0.7616215348243713, "learning_rate": 2.6565164338344572e-05, "loss": 0.0338, "step": 133300 }, { "epoch": 0.9912688092141928, "grad_norm": 0.43862757086753845, "learning_rate": 2.6545609561503687e-05, "loss": 0.0574, "step": 133400 }, { "epoch": 0.99201188928107, "grad_norm": 1.4977893829345703, "learning_rate": 2.6526054784662795e-05, "loss": 0.0454, "step": 133500 }, { "epoch": 0.9927549693479473, "grad_norm": 0.21500712633132935, "learning_rate": 2.650650000782191e-05, "loss": 0.0393, "step": 133600 }, { "epoch": 0.9934980494148244, "grad_norm": 0.5629156827926636, "learning_rate": 2.6486945230981025e-05, "loss": 0.0407, "step": 133700 }, { "epoch": 0.9942411294817016, "grad_norm": 0.5244212746620178, "learning_rate": 2.646739045414014e-05, "loss": 0.0381, "step": 133800 }, { "epoch": 0.9949842095485789, "grad_norm": 0.1153145432472229, "learning_rate": 2.6447835677299254e-05, "loss": 0.039, "step": 133900 }, { "epoch": 0.9957272896154561, "grad_norm": 0.9481136798858643, "learning_rate": 2.6428280900458362e-05, "loss": 0.0407, "step": 134000 }, { "epoch": 0.9964703696823333, "grad_norm": 0.35142290592193604, "learning_rate": 2.6408726123617477e-05, "loss": 0.0368, "step": 134100 }, { "epoch": 0.9972134497492104, "grad_norm": 0.5583292245864868, "learning_rate": 2.6389171346776592e-05, "loss": 0.0378, "step": 134200 }, { "epoch": 0.9979565298160877, "grad_norm": 0.3817247450351715, "learning_rate": 2.6369616569935706e-05, "loss": 0.0393, "step": 134300 }, { "epoch": 0.9986996098829649, "grad_norm": 0.44154298305511475, "learning_rate": 2.6350061793094814e-05, "loss": 0.044, "step": 134400 }, { "epoch": 0.9994426899498421, "grad_norm": 0.16410627961158752, "learning_rate": 2.633050701625393e-05, "loss": 0.0366, "step": 134500 }, { "epoch": 1.0, "eval_accuracy": 0.7629602692927165, "eval_f1": 0.6652801618588536, "eval_loss": 0.03963995352387428, "eval_precision": 0.5912276368778883, "eval_recall": 0.7629602692927165, "eval_runtime": 667.9939, "eval_samples_per_second": 179.076, "eval_steps_per_second": 22.385, "step": 134575 }, { "epoch": 1.0001857700167194, "grad_norm": 1.35655677318573, "learning_rate": 2.6310952239413044e-05, "loss": 0.0463, "step": 134600 }, { "epoch": 1.0009288500835964, "grad_norm": 0.3125333786010742, "learning_rate": 2.629139746257216e-05, "loss": 0.0356, "step": 134700 }, { "epoch": 1.0016719301504737, "grad_norm": 0.6155613660812378, "learning_rate": 2.6271842685731274e-05, "loss": 0.0427, "step": 134800 }, { "epoch": 1.002415010217351, "grad_norm": 0.3894440829753876, "learning_rate": 2.625228790889038e-05, "loss": 0.0314, "step": 134900 }, { "epoch": 1.003158090284228, "grad_norm": 0.45999133586883545, "learning_rate": 2.6232733132049496e-05, "loss": 0.0355, "step": 135000 }, { "epoch": 1.0039011703511054, "grad_norm": 0.28052183985710144, "learning_rate": 2.621317835520861e-05, "loss": 0.0384, "step": 135100 }, { "epoch": 1.0046442504179824, "grad_norm": 2.483424425125122, "learning_rate": 2.6193623578367726e-05, "loss": 0.0366, "step": 135200 }, { "epoch": 1.0053873304848597, "grad_norm": 0.6149117946624756, "learning_rate": 2.617406880152684e-05, "loss": 0.0402, "step": 135300 }, { "epoch": 1.006130410551737, "grad_norm": 0.166657492518425, "learning_rate": 2.615451402468595e-05, "loss": 0.0442, "step": 135400 }, { "epoch": 1.006873490618614, "grad_norm": 0.6373744010925293, "learning_rate": 2.6134959247845063e-05, "loss": 0.0383, "step": 135500 }, { "epoch": 1.0076165706854914, "grad_norm": 0.09632986783981323, "learning_rate": 2.6115404471004178e-05, "loss": 0.0296, "step": 135600 }, { "epoch": 1.0083596507523687, "grad_norm": 0.2815961241722107, "learning_rate": 2.6095849694163293e-05, "loss": 0.0378, "step": 135700 }, { "epoch": 1.0091027308192457, "grad_norm": 4.425230503082275, "learning_rate": 2.60762949173224e-05, "loss": 0.0323, "step": 135800 }, { "epoch": 1.009845810886123, "grad_norm": 0.4867972731590271, "learning_rate": 2.6056740140481516e-05, "loss": 0.0275, "step": 135900 }, { "epoch": 1.010588890953, "grad_norm": 0.24305525422096252, "learning_rate": 2.603718536364063e-05, "loss": 0.0317, "step": 136000 }, { "epoch": 1.0113319710198774, "grad_norm": 0.7477482557296753, "learning_rate": 2.6017630586799745e-05, "loss": 0.0433, "step": 136100 }, { "epoch": 1.0120750510867547, "grad_norm": 0.8580988049507141, "learning_rate": 2.599807580995886e-05, "loss": 0.0323, "step": 136200 }, { "epoch": 1.0128181311536317, "grad_norm": 0.44249987602233887, "learning_rate": 2.597852103311797e-05, "loss": 0.0412, "step": 136300 }, { "epoch": 1.013561211220509, "grad_norm": 0.5045745372772217, "learning_rate": 2.5958966256277083e-05, "loss": 0.036, "step": 136400 }, { "epoch": 1.0143042912873863, "grad_norm": 0.6794184446334839, "learning_rate": 2.5939411479436197e-05, "loss": 0.0279, "step": 136500 }, { "epoch": 1.0150473713542634, "grad_norm": 0.3128235340118408, "learning_rate": 2.5919856702595312e-05, "loss": 0.0474, "step": 136600 }, { "epoch": 1.0157904514211407, "grad_norm": 0.2902253270149231, "learning_rate": 2.5900301925754424e-05, "loss": 0.0308, "step": 136700 }, { "epoch": 1.0165335314880177, "grad_norm": 0.3002735376358032, "learning_rate": 2.588074714891354e-05, "loss": 0.0351, "step": 136800 }, { "epoch": 1.017276611554895, "grad_norm": 2.1125378608703613, "learning_rate": 2.586119237207265e-05, "loss": 0.0358, "step": 136900 }, { "epoch": 1.0180196916217723, "grad_norm": 0.6243335008621216, "learning_rate": 2.5841637595231764e-05, "loss": 0.034, "step": 137000 }, { "epoch": 1.0187627716886494, "grad_norm": 0.7365167140960693, "learning_rate": 2.582208281839088e-05, "loss": 0.0441, "step": 137100 }, { "epoch": 1.0195058517555267, "grad_norm": 0.33042430877685547, "learning_rate": 2.580252804154999e-05, "loss": 0.0409, "step": 137200 }, { "epoch": 1.020248931822404, "grad_norm": 0.6323844194412231, "learning_rate": 2.5782973264709105e-05, "loss": 0.0472, "step": 137300 }, { "epoch": 1.020992011889281, "grad_norm": 0.6251844167709351, "learning_rate": 2.5763418487868217e-05, "loss": 0.0407, "step": 137400 }, { "epoch": 1.0217350919561583, "grad_norm": 0.40428656339645386, "learning_rate": 2.574386371102733e-05, "loss": 0.0334, "step": 137500 }, { "epoch": 1.0224781720230354, "grad_norm": 0.22695861756801605, "learning_rate": 2.5724308934186443e-05, "loss": 0.031, "step": 137600 }, { "epoch": 1.0232212520899127, "grad_norm": 0.29522642493247986, "learning_rate": 2.5704754157345558e-05, "loss": 0.0312, "step": 137700 }, { "epoch": 1.02396433215679, "grad_norm": 0.954203724861145, "learning_rate": 2.5685199380504672e-05, "loss": 0.0326, "step": 137800 }, { "epoch": 1.024707412223667, "grad_norm": 0.7621215581893921, "learning_rate": 2.5665644603663784e-05, "loss": 0.0504, "step": 137900 }, { "epoch": 1.0254504922905443, "grad_norm": 0.5916759967803955, "learning_rate": 2.56460898268229e-05, "loss": 0.0404, "step": 138000 }, { "epoch": 1.0261935723574216, "grad_norm": 0.48163267970085144, "learning_rate": 2.562653504998201e-05, "loss": 0.0311, "step": 138100 }, { "epoch": 1.0269366524242987, "grad_norm": 0.09392302483320236, "learning_rate": 2.5606980273141125e-05, "loss": 0.0379, "step": 138200 }, { "epoch": 1.027679732491176, "grad_norm": 0.48946893215179443, "learning_rate": 2.558742549630024e-05, "loss": 0.0283, "step": 138300 }, { "epoch": 1.0284228125580532, "grad_norm": 0.2751534581184387, "learning_rate": 2.556787071945935e-05, "loss": 0.0299, "step": 138400 }, { "epoch": 1.0291658926249303, "grad_norm": 0.5293516516685486, "learning_rate": 2.5548315942618462e-05, "loss": 0.0412, "step": 138500 }, { "epoch": 1.0299089726918076, "grad_norm": 0.39197584986686707, "learning_rate": 2.5528761165777577e-05, "loss": 0.0308, "step": 138600 }, { "epoch": 1.0306520527586847, "grad_norm": 0.1498292237520218, "learning_rate": 2.5509206388936692e-05, "loss": 0.0423, "step": 138700 }, { "epoch": 1.031395132825562, "grad_norm": 0.8281412720680237, "learning_rate": 2.5489651612095807e-05, "loss": 0.0317, "step": 138800 }, { "epoch": 1.0321382128924392, "grad_norm": 0.37594422698020935, "learning_rate": 2.547009683525492e-05, "loss": 0.0345, "step": 138900 }, { "epoch": 1.0328812929593163, "grad_norm": 0.9317287802696228, "learning_rate": 2.545054205841403e-05, "loss": 0.0353, "step": 139000 }, { "epoch": 1.0336243730261936, "grad_norm": 0.7673518657684326, "learning_rate": 2.5430987281573144e-05, "loss": 0.0322, "step": 139100 }, { "epoch": 1.0343674530930709, "grad_norm": 0.4195968210697174, "learning_rate": 2.541143250473226e-05, "loss": 0.0363, "step": 139200 }, { "epoch": 1.035110533159948, "grad_norm": 0.2660675644874573, "learning_rate": 2.5391877727891374e-05, "loss": 0.0365, "step": 139300 }, { "epoch": 1.0358536132268252, "grad_norm": 0.6165285706520081, "learning_rate": 2.537232295105048e-05, "loss": 0.0451, "step": 139400 }, { "epoch": 1.0365966932937023, "grad_norm": 0.3867144286632538, "learning_rate": 2.5352768174209596e-05, "loss": 0.0367, "step": 139500 }, { "epoch": 1.0373397733605796, "grad_norm": 0.6344079971313477, "learning_rate": 2.533321339736871e-05, "loss": 0.0441, "step": 139600 }, { "epoch": 1.0380828534274569, "grad_norm": 0.5577194094657898, "learning_rate": 2.5313658620527826e-05, "loss": 0.0385, "step": 139700 }, { "epoch": 1.038825933494334, "grad_norm": 0.4006801247596741, "learning_rate": 2.529410384368694e-05, "loss": 0.0464, "step": 139800 }, { "epoch": 1.0395690135612112, "grad_norm": 0.2844521701335907, "learning_rate": 2.527454906684605e-05, "loss": 0.0352, "step": 139900 }, { "epoch": 1.0403120936280885, "grad_norm": 2.2931294441223145, "learning_rate": 2.5254994290005163e-05, "loss": 0.0334, "step": 140000 }, { "epoch": 1.0410551736949656, "grad_norm": 0.9103361368179321, "learning_rate": 2.5235439513164278e-05, "loss": 0.039, "step": 140100 }, { "epoch": 1.0417982537618429, "grad_norm": 0.637322187423706, "learning_rate": 2.5215884736323393e-05, "loss": 0.0354, "step": 140200 }, { "epoch": 1.04254133382872, "grad_norm": 0.5245652794837952, "learning_rate": 2.51963299594825e-05, "loss": 0.0377, "step": 140300 }, { "epoch": 1.0432844138955972, "grad_norm": 0.3067414164543152, "learning_rate": 2.5176775182641616e-05, "loss": 0.0368, "step": 140400 }, { "epoch": 1.0440274939624745, "grad_norm": 0.23084840178489685, "learning_rate": 2.515722040580073e-05, "loss": 0.0324, "step": 140500 }, { "epoch": 1.0447705740293516, "grad_norm": 0.6069378852844238, "learning_rate": 2.5137665628959845e-05, "loss": 0.0329, "step": 140600 }, { "epoch": 1.0455136540962289, "grad_norm": 0.2943336069583893, "learning_rate": 2.511811085211896e-05, "loss": 0.0389, "step": 140700 }, { "epoch": 1.0462567341631062, "grad_norm": 0.32436391711235046, "learning_rate": 2.5098556075278068e-05, "loss": 0.0432, "step": 140800 }, { "epoch": 1.0469998142299832, "grad_norm": 0.17758707702159882, "learning_rate": 2.5079001298437183e-05, "loss": 0.0391, "step": 140900 }, { "epoch": 1.0477428942968605, "grad_norm": 0.2915591597557068, "learning_rate": 2.5059446521596297e-05, "loss": 0.0366, "step": 141000 }, { "epoch": 1.0484859743637376, "grad_norm": 0.24379968643188477, "learning_rate": 2.5039891744755412e-05, "loss": 0.033, "step": 141100 }, { "epoch": 1.0492290544306149, "grad_norm": 1.3872768878936768, "learning_rate": 2.5020336967914527e-05, "loss": 0.028, "step": 141200 }, { "epoch": 1.0499721344974922, "grad_norm": 0.14514194428920746, "learning_rate": 2.5000782191073635e-05, "loss": 0.0403, "step": 141300 }, { "epoch": 1.0507152145643692, "grad_norm": 0.594184935092926, "learning_rate": 2.498122741423275e-05, "loss": 0.0409, "step": 141400 }, { "epoch": 1.0514582946312465, "grad_norm": 4.184124946594238, "learning_rate": 2.4961672637391865e-05, "loss": 0.0402, "step": 141500 }, { "epoch": 1.0522013746981238, "grad_norm": 0.6025236248970032, "learning_rate": 2.4942117860550976e-05, "loss": 0.0279, "step": 141600 }, { "epoch": 1.0529444547650009, "grad_norm": 0.4775395095348358, "learning_rate": 2.492256308371009e-05, "loss": 0.0299, "step": 141700 }, { "epoch": 1.0536875348318782, "grad_norm": 0.3702109754085541, "learning_rate": 2.4903008306869202e-05, "loss": 0.0374, "step": 141800 }, { "epoch": 1.0544306148987554, "grad_norm": 0.9064620733261108, "learning_rate": 2.4883453530028317e-05, "loss": 0.0394, "step": 141900 }, { "epoch": 1.0551736949656325, "grad_norm": 0.383354127407074, "learning_rate": 2.4863898753187428e-05, "loss": 0.0416, "step": 142000 }, { "epoch": 1.0559167750325098, "grad_norm": 0.4391328990459442, "learning_rate": 2.4844343976346543e-05, "loss": 0.0378, "step": 142100 }, { "epoch": 1.0566598550993869, "grad_norm": 0.18355417251586914, "learning_rate": 2.4824789199505658e-05, "loss": 0.0418, "step": 142200 }, { "epoch": 1.0574029351662642, "grad_norm": 1.0761873722076416, "learning_rate": 2.480523442266477e-05, "loss": 0.0333, "step": 142300 }, { "epoch": 1.0581460152331414, "grad_norm": 2.7130227088928223, "learning_rate": 2.4785679645823884e-05, "loss": 0.0342, "step": 142400 }, { "epoch": 1.0588890953000185, "grad_norm": 0.7225663661956787, "learning_rate": 2.4766124868982995e-05, "loss": 0.0367, "step": 142500 }, { "epoch": 1.0596321753668958, "grad_norm": 1.3078348636627197, "learning_rate": 2.474657009214211e-05, "loss": 0.0318, "step": 142600 }, { "epoch": 1.060375255433773, "grad_norm": 1.3621437549591064, "learning_rate": 2.472701531530122e-05, "loss": 0.0407, "step": 142700 }, { "epoch": 1.0611183355006502, "grad_norm": 0.28861960768699646, "learning_rate": 2.4707460538460336e-05, "loss": 0.0355, "step": 142800 }, { "epoch": 1.0618614155675274, "grad_norm": 0.3325597643852234, "learning_rate": 2.468790576161945e-05, "loss": 0.0362, "step": 142900 }, { "epoch": 1.0626044956344045, "grad_norm": 2.4611587524414062, "learning_rate": 2.4668350984778562e-05, "loss": 0.0335, "step": 143000 }, { "epoch": 1.0633475757012818, "grad_norm": 0.45806363224983215, "learning_rate": 2.4648796207937677e-05, "loss": 0.0475, "step": 143100 }, { "epoch": 1.064090655768159, "grad_norm": 1.1240214109420776, "learning_rate": 2.462924143109679e-05, "loss": 0.035, "step": 143200 }, { "epoch": 1.0648337358350362, "grad_norm": 0.13347966969013214, "learning_rate": 2.4609686654255903e-05, "loss": 0.0379, "step": 143300 }, { "epoch": 1.0655768159019134, "grad_norm": 0.5631084442138672, "learning_rate": 2.4590131877415015e-05, "loss": 0.0352, "step": 143400 }, { "epoch": 1.0663198959687907, "grad_norm": 0.17790429294109344, "learning_rate": 2.457057710057413e-05, "loss": 0.0456, "step": 143500 }, { "epoch": 1.0670629760356678, "grad_norm": 0.25529009103775024, "learning_rate": 2.455102232373324e-05, "loss": 0.0327, "step": 143600 }, { "epoch": 1.067806056102545, "grad_norm": 0.34170007705688477, "learning_rate": 2.4531467546892355e-05, "loss": 0.0387, "step": 143700 }, { "epoch": 1.0685491361694222, "grad_norm": 0.388217031955719, "learning_rate": 2.451191277005147e-05, "loss": 0.0325, "step": 143800 }, { "epoch": 1.0692922162362994, "grad_norm": 0.26961347460746765, "learning_rate": 2.449235799321058e-05, "loss": 0.035, "step": 143900 }, { "epoch": 1.0700352963031767, "grad_norm": 0.3709469437599182, "learning_rate": 2.4472803216369696e-05, "loss": 0.0428, "step": 144000 }, { "epoch": 1.0707783763700538, "grad_norm": 0.3569774925708771, "learning_rate": 2.4453248439528808e-05, "loss": 0.0363, "step": 144100 }, { "epoch": 1.071521456436931, "grad_norm": 2.209057569503784, "learning_rate": 2.4433693662687923e-05, "loss": 0.034, "step": 144200 }, { "epoch": 1.0722645365038084, "grad_norm": 0.9984660148620605, "learning_rate": 2.4414138885847034e-05, "loss": 0.0347, "step": 144300 }, { "epoch": 1.0730076165706854, "grad_norm": 0.3184483051300049, "learning_rate": 2.439458410900615e-05, "loss": 0.037, "step": 144400 }, { "epoch": 1.0737506966375627, "grad_norm": 0.47557514905929565, "learning_rate": 2.437502933216526e-05, "loss": 0.0401, "step": 144500 }, { "epoch": 1.0744937767044398, "grad_norm": 1.1587680578231812, "learning_rate": 2.4355474555324375e-05, "loss": 0.0408, "step": 144600 }, { "epoch": 1.075236856771317, "grad_norm": 0.9874958395957947, "learning_rate": 2.433591977848349e-05, "loss": 0.0453, "step": 144700 }, { "epoch": 1.0759799368381944, "grad_norm": 0.3753519654273987, "learning_rate": 2.43163650016426e-05, "loss": 0.0328, "step": 144800 }, { "epoch": 1.0767230169050714, "grad_norm": 0.7671294808387756, "learning_rate": 2.4296810224801716e-05, "loss": 0.0321, "step": 144900 }, { "epoch": 1.0774660969719487, "grad_norm": 0.6509069204330444, "learning_rate": 2.4277255447960827e-05, "loss": 0.0364, "step": 145000 }, { "epoch": 1.078209177038826, "grad_norm": 0.08005964756011963, "learning_rate": 2.4257700671119942e-05, "loss": 0.0402, "step": 145100 }, { "epoch": 1.078952257105703, "grad_norm": 1.913771390914917, "learning_rate": 2.4238145894279053e-05, "loss": 0.0459, "step": 145200 }, { "epoch": 1.0796953371725804, "grad_norm": 0.28786540031433105, "learning_rate": 2.4218591117438168e-05, "loss": 0.0304, "step": 145300 }, { "epoch": 1.0804384172394577, "grad_norm": 0.6922600269317627, "learning_rate": 2.4199036340597283e-05, "loss": 0.0298, "step": 145400 }, { "epoch": 1.0811814973063347, "grad_norm": 0.2564089298248291, "learning_rate": 2.4179481563756394e-05, "loss": 0.0348, "step": 145500 }, { "epoch": 1.081924577373212, "grad_norm": 0.807911217212677, "learning_rate": 2.415992678691551e-05, "loss": 0.0465, "step": 145600 }, { "epoch": 1.082667657440089, "grad_norm": 1.55418860912323, "learning_rate": 2.414037201007462e-05, "loss": 0.0415, "step": 145700 }, { "epoch": 1.0834107375069664, "grad_norm": 0.5436394810676575, "learning_rate": 2.4120817233233735e-05, "loss": 0.0314, "step": 145800 }, { "epoch": 1.0841538175738437, "grad_norm": 0.38623565435409546, "learning_rate": 2.4101262456392846e-05, "loss": 0.0389, "step": 145900 }, { "epoch": 1.0848968976407207, "grad_norm": 1.7757738828659058, "learning_rate": 2.408170767955196e-05, "loss": 0.0471, "step": 146000 }, { "epoch": 1.085639977707598, "grad_norm": 0.5918220281600952, "learning_rate": 2.4062152902711076e-05, "loss": 0.0448, "step": 146100 }, { "epoch": 1.0863830577744753, "grad_norm": 0.13578473031520844, "learning_rate": 2.4042598125870187e-05, "loss": 0.0378, "step": 146200 }, { "epoch": 1.0871261378413524, "grad_norm": 0.38010281324386597, "learning_rate": 2.4023043349029302e-05, "loss": 0.0278, "step": 146300 }, { "epoch": 1.0878692179082297, "grad_norm": 0.1597740799188614, "learning_rate": 2.4003488572188413e-05, "loss": 0.0332, "step": 146400 }, { "epoch": 1.0886122979751067, "grad_norm": 1.475795030593872, "learning_rate": 2.3983933795347528e-05, "loss": 0.0326, "step": 146500 }, { "epoch": 1.089355378041984, "grad_norm": 0.3715950548648834, "learning_rate": 2.3964379018506643e-05, "loss": 0.0388, "step": 146600 }, { "epoch": 1.0900984581088613, "grad_norm": 0.627359926700592, "learning_rate": 2.3944824241665754e-05, "loss": 0.0304, "step": 146700 }, { "epoch": 1.0908415381757384, "grad_norm": 0.4948262870311737, "learning_rate": 2.392526946482487e-05, "loss": 0.0356, "step": 146800 }, { "epoch": 1.0915846182426157, "grad_norm": 0.39210522174835205, "learning_rate": 2.390571468798398e-05, "loss": 0.0344, "step": 146900 }, { "epoch": 1.092327698309493, "grad_norm": 0.6363001465797424, "learning_rate": 2.3886159911143095e-05, "loss": 0.0439, "step": 147000 }, { "epoch": 1.09307077837637, "grad_norm": 0.388141006231308, "learning_rate": 2.386660513430221e-05, "loss": 0.0357, "step": 147100 }, { "epoch": 1.0938138584432473, "grad_norm": 0.10537716001272202, "learning_rate": 2.384705035746132e-05, "loss": 0.0356, "step": 147200 }, { "epoch": 1.0945569385101244, "grad_norm": 0.3473242223262787, "learning_rate": 2.3827495580620436e-05, "loss": 0.0291, "step": 147300 }, { "epoch": 1.0953000185770017, "grad_norm": 0.7553157210350037, "learning_rate": 2.380794080377955e-05, "loss": 0.037, "step": 147400 }, { "epoch": 1.096043098643879, "grad_norm": 0.5281794667243958, "learning_rate": 2.3788386026938662e-05, "loss": 0.0368, "step": 147500 }, { "epoch": 1.096786178710756, "grad_norm": 0.7832167148590088, "learning_rate": 2.3768831250097777e-05, "loss": 0.0436, "step": 147600 }, { "epoch": 1.0975292587776333, "grad_norm": 0.27510908246040344, "learning_rate": 2.374927647325689e-05, "loss": 0.0322, "step": 147700 }, { "epoch": 1.0982723388445106, "grad_norm": 0.319529265165329, "learning_rate": 2.3729721696416003e-05, "loss": 0.0361, "step": 147800 }, { "epoch": 1.0990154189113877, "grad_norm": 1.0467051267623901, "learning_rate": 2.3710166919575115e-05, "loss": 0.0369, "step": 147900 }, { "epoch": 1.099758498978265, "grad_norm": 0.36631059646606445, "learning_rate": 2.369061214273423e-05, "loss": 0.0348, "step": 148000 }, { "epoch": 1.100501579045142, "grad_norm": 0.32173654437065125, "learning_rate": 2.3671057365893344e-05, "loss": 0.0427, "step": 148100 }, { "epoch": 1.1012446591120193, "grad_norm": 0.4891296327114105, "learning_rate": 2.3651502589052455e-05, "loss": 0.0343, "step": 148200 }, { "epoch": 1.1019877391788966, "grad_norm": 0.5163310170173645, "learning_rate": 2.363194781221157e-05, "loss": 0.0351, "step": 148300 }, { "epoch": 1.1027308192457737, "grad_norm": 0.5175864696502686, "learning_rate": 2.361239303537068e-05, "loss": 0.037, "step": 148400 }, { "epoch": 1.103473899312651, "grad_norm": 0.5821439623832703, "learning_rate": 2.3592838258529796e-05, "loss": 0.0339, "step": 148500 }, { "epoch": 1.1042169793795282, "grad_norm": 0.5665625333786011, "learning_rate": 2.3573283481688908e-05, "loss": 0.0373, "step": 148600 }, { "epoch": 1.1049600594464053, "grad_norm": 0.23404021561145782, "learning_rate": 2.3553728704848023e-05, "loss": 0.0361, "step": 148700 }, { "epoch": 1.1057031395132826, "grad_norm": 0.5941755175590515, "learning_rate": 2.3534173928007137e-05, "loss": 0.0416, "step": 148800 }, { "epoch": 1.1064462195801599, "grad_norm": 0.7676922678947449, "learning_rate": 2.351461915116625e-05, "loss": 0.0306, "step": 148900 }, { "epoch": 1.107189299647037, "grad_norm": 0.258245050907135, "learning_rate": 2.3495064374325363e-05, "loss": 0.0373, "step": 149000 }, { "epoch": 1.1079323797139142, "grad_norm": 0.5489481091499329, "learning_rate": 2.3475509597484475e-05, "loss": 0.0393, "step": 149100 }, { "epoch": 1.1086754597807913, "grad_norm": 2.6887807846069336, "learning_rate": 2.345595482064359e-05, "loss": 0.0422, "step": 149200 }, { "epoch": 1.1094185398476686, "grad_norm": 0.5251678228378296, "learning_rate": 2.34364000438027e-05, "loss": 0.0346, "step": 149300 }, { "epoch": 1.1101616199145459, "grad_norm": 2.026338815689087, "learning_rate": 2.3416845266961816e-05, "loss": 0.0319, "step": 149400 }, { "epoch": 1.110904699981423, "grad_norm": 0.18985234200954437, "learning_rate": 2.3397290490120927e-05, "loss": 0.0335, "step": 149500 }, { "epoch": 1.1116477800483002, "grad_norm": 0.567356288433075, "learning_rate": 2.3377735713280042e-05, "loss": 0.0465, "step": 149600 }, { "epoch": 1.1123908601151773, "grad_norm": 1.1245296001434326, "learning_rate": 2.3358180936439157e-05, "loss": 0.0398, "step": 149700 }, { "epoch": 1.1131339401820546, "grad_norm": 0.8502438068389893, "learning_rate": 2.3338626159598268e-05, "loss": 0.0342, "step": 149800 }, { "epoch": 1.1138770202489319, "grad_norm": 0.8321160674095154, "learning_rate": 2.3319071382757383e-05, "loss": 0.0375, "step": 149900 }, { "epoch": 1.114620100315809, "grad_norm": 2.9702956676483154, "learning_rate": 2.3299516605916494e-05, "loss": 0.0438, "step": 150000 }, { "epoch": 1.1153631803826862, "grad_norm": 0.3601522445678711, "learning_rate": 2.327996182907561e-05, "loss": 0.0376, "step": 150100 }, { "epoch": 1.1161062604495635, "grad_norm": 0.4370095133781433, "learning_rate": 2.326040705223472e-05, "loss": 0.0365, "step": 150200 }, { "epoch": 1.1168493405164406, "grad_norm": 0.13963592052459717, "learning_rate": 2.3240852275393835e-05, "loss": 0.0346, "step": 150300 }, { "epoch": 1.1175924205833179, "grad_norm": 0.4077025055885315, "learning_rate": 2.3221297498552946e-05, "loss": 0.037, "step": 150400 }, { "epoch": 1.1183355006501952, "grad_norm": 3.245108127593994, "learning_rate": 2.320174272171206e-05, "loss": 0.0341, "step": 150500 }, { "epoch": 1.1190785807170722, "grad_norm": 0.24773983657360077, "learning_rate": 2.3182187944871176e-05, "loss": 0.0343, "step": 150600 }, { "epoch": 1.1198216607839495, "grad_norm": 0.4352034330368042, "learning_rate": 2.3162633168030287e-05, "loss": 0.0317, "step": 150700 }, { "epoch": 1.1205647408508266, "grad_norm": 0.44821032881736755, "learning_rate": 2.3143078391189402e-05, "loss": 0.0308, "step": 150800 }, { "epoch": 1.1213078209177039, "grad_norm": 0.728609561920166, "learning_rate": 2.3123523614348513e-05, "loss": 0.038, "step": 150900 }, { "epoch": 1.1220509009845812, "grad_norm": 0.27437624335289, "learning_rate": 2.3103968837507628e-05, "loss": 0.0324, "step": 151000 }, { "epoch": 1.1227939810514582, "grad_norm": 0.4322657287120819, "learning_rate": 2.308441406066674e-05, "loss": 0.0376, "step": 151100 }, { "epoch": 1.1235370611183355, "grad_norm": 0.620232105255127, "learning_rate": 2.3064859283825854e-05, "loss": 0.0394, "step": 151200 }, { "epoch": 1.1242801411852128, "grad_norm": 0.5064946413040161, "learning_rate": 2.3045304506984966e-05, "loss": 0.042, "step": 151300 }, { "epoch": 1.1250232212520899, "grad_norm": 0.3419318199157715, "learning_rate": 2.302574973014408e-05, "loss": 0.0348, "step": 151400 }, { "epoch": 1.1257663013189672, "grad_norm": 0.42208656668663025, "learning_rate": 2.3006194953303195e-05, "loss": 0.0358, "step": 151500 }, { "epoch": 1.1265093813858442, "grad_norm": 0.3992689549922943, "learning_rate": 2.2986640176462307e-05, "loss": 0.0333, "step": 151600 }, { "epoch": 1.1272524614527215, "grad_norm": 0.9679842591285706, "learning_rate": 2.296708539962142e-05, "loss": 0.0333, "step": 151700 }, { "epoch": 1.1279955415195988, "grad_norm": 0.24478860199451447, "learning_rate": 2.2947530622780533e-05, "loss": 0.0404, "step": 151800 }, { "epoch": 1.1287386215864759, "grad_norm": 0.47963982820510864, "learning_rate": 2.2927975845939648e-05, "loss": 0.0357, "step": 151900 }, { "epoch": 1.1294817016533532, "grad_norm": 0.6389815211296082, "learning_rate": 2.290842106909876e-05, "loss": 0.0376, "step": 152000 }, { "epoch": 1.1302247817202304, "grad_norm": 0.18299856781959534, "learning_rate": 2.2888866292257874e-05, "loss": 0.0323, "step": 152100 }, { "epoch": 1.1309678617871075, "grad_norm": 0.22437183558940887, "learning_rate": 2.286931151541699e-05, "loss": 0.0333, "step": 152200 }, { "epoch": 1.1317109418539848, "grad_norm": 0.7526204586029053, "learning_rate": 2.28497567385761e-05, "loss": 0.0323, "step": 152300 }, { "epoch": 1.132454021920862, "grad_norm": 0.5435932278633118, "learning_rate": 2.2830201961735215e-05, "loss": 0.0356, "step": 152400 }, { "epoch": 1.1331971019877392, "grad_norm": 0.42265936732292175, "learning_rate": 2.2810647184894326e-05, "loss": 0.0337, "step": 152500 }, { "epoch": 1.1339401820546164, "grad_norm": 0.41524532437324524, "learning_rate": 2.279109240805344e-05, "loss": 0.034, "step": 152600 }, { "epoch": 1.1346832621214935, "grad_norm": 1.4400864839553833, "learning_rate": 2.2771537631212552e-05, "loss": 0.0329, "step": 152700 }, { "epoch": 1.1354263421883708, "grad_norm": 1.2547978162765503, "learning_rate": 2.2751982854371667e-05, "loss": 0.0385, "step": 152800 }, { "epoch": 1.136169422255248, "grad_norm": 1.3255412578582764, "learning_rate": 2.2732428077530778e-05, "loss": 0.0319, "step": 152900 }, { "epoch": 1.1369125023221252, "grad_norm": 0.14164698123931885, "learning_rate": 2.2712873300689893e-05, "loss": 0.045, "step": 153000 }, { "epoch": 1.1376555823890024, "grad_norm": 1.0323785543441772, "learning_rate": 2.2693318523849008e-05, "loss": 0.0342, "step": 153100 }, { "epoch": 1.1383986624558795, "grad_norm": 0.2735370397567749, "learning_rate": 2.267376374700812e-05, "loss": 0.0392, "step": 153200 }, { "epoch": 1.1391417425227568, "grad_norm": 0.6275005340576172, "learning_rate": 2.2654208970167234e-05, "loss": 0.0341, "step": 153300 }, { "epoch": 1.139884822589634, "grad_norm": 0.4018770158290863, "learning_rate": 2.2634654193326345e-05, "loss": 0.0362, "step": 153400 }, { "epoch": 1.1406279026565112, "grad_norm": 2.030106782913208, "learning_rate": 2.261509941648546e-05, "loss": 0.0416, "step": 153500 }, { "epoch": 1.1413709827233884, "grad_norm": 2.0044147968292236, "learning_rate": 2.259554463964457e-05, "loss": 0.0339, "step": 153600 }, { "epoch": 1.1421140627902657, "grad_norm": 0.3468457758426666, "learning_rate": 2.2575989862803686e-05, "loss": 0.0358, "step": 153700 }, { "epoch": 1.1428571428571428, "grad_norm": 1.4949098825454712, "learning_rate": 2.2556435085962798e-05, "loss": 0.0266, "step": 153800 }, { "epoch": 1.14360022292402, "grad_norm": 0.13217221200466156, "learning_rate": 2.2536880309121912e-05, "loss": 0.0348, "step": 153900 }, { "epoch": 1.1443433029908974, "grad_norm": 0.7691917419433594, "learning_rate": 2.2517325532281027e-05, "loss": 0.0306, "step": 154000 }, { "epoch": 1.1450863830577744, "grad_norm": 4.5140061378479, "learning_rate": 2.249777075544014e-05, "loss": 0.0363, "step": 154100 }, { "epoch": 1.1458294631246517, "grad_norm": 2.3634371757507324, "learning_rate": 2.2478215978599253e-05, "loss": 0.0318, "step": 154200 }, { "epoch": 1.1465725431915288, "grad_norm": 0.41061270236968994, "learning_rate": 2.2458661201758365e-05, "loss": 0.0314, "step": 154300 }, { "epoch": 1.147315623258406, "grad_norm": 0.9986578822135925, "learning_rate": 2.243910642491748e-05, "loss": 0.0417, "step": 154400 }, { "epoch": 1.1480587033252834, "grad_norm": 0.5750747919082642, "learning_rate": 2.241955164807659e-05, "loss": 0.0376, "step": 154500 }, { "epoch": 1.1488017833921604, "grad_norm": 0.104793980717659, "learning_rate": 2.2399996871235706e-05, "loss": 0.033, "step": 154600 }, { "epoch": 1.1495448634590377, "grad_norm": 3.221059799194336, "learning_rate": 2.238044209439482e-05, "loss": 0.0347, "step": 154700 }, { "epoch": 1.150287943525915, "grad_norm": 0.2757067084312439, "learning_rate": 2.2360887317553932e-05, "loss": 0.0295, "step": 154800 }, { "epoch": 1.151031023592792, "grad_norm": 0.3592912256717682, "learning_rate": 2.2341332540713046e-05, "loss": 0.035, "step": 154900 }, { "epoch": 1.1517741036596694, "grad_norm": 0.7630543112754822, "learning_rate": 2.2321777763872158e-05, "loss": 0.0341, "step": 155000 }, { "epoch": 1.1525171837265464, "grad_norm": 0.26659145951271057, "learning_rate": 2.2302222987031273e-05, "loss": 0.0328, "step": 155100 }, { "epoch": 1.1532602637934237, "grad_norm": 0.4302542805671692, "learning_rate": 2.2282668210190384e-05, "loss": 0.0329, "step": 155200 }, { "epoch": 1.154003343860301, "grad_norm": 0.05998706817626953, "learning_rate": 2.22631134333495e-05, "loss": 0.0344, "step": 155300 }, { "epoch": 1.154746423927178, "grad_norm": 0.5755786895751953, "learning_rate": 2.224355865650861e-05, "loss": 0.0377, "step": 155400 }, { "epoch": 1.1554895039940554, "grad_norm": 0.4675575792789459, "learning_rate": 2.2224003879667725e-05, "loss": 0.0383, "step": 155500 }, { "epoch": 1.1562325840609327, "grad_norm": 0.5721487998962402, "learning_rate": 2.220444910282684e-05, "loss": 0.0341, "step": 155600 }, { "epoch": 1.1569756641278097, "grad_norm": 1.4612236022949219, "learning_rate": 2.218489432598595e-05, "loss": 0.0339, "step": 155700 }, { "epoch": 1.157718744194687, "grad_norm": 0.7188844084739685, "learning_rate": 2.2165339549145066e-05, "loss": 0.0407, "step": 155800 }, { "epoch": 1.1584618242615643, "grad_norm": 1.4636353254318237, "learning_rate": 2.214578477230418e-05, "loss": 0.0282, "step": 155900 }, { "epoch": 1.1592049043284414, "grad_norm": 0.14393742382526398, "learning_rate": 2.2126229995463292e-05, "loss": 0.0363, "step": 156000 }, { "epoch": 1.1599479843953187, "grad_norm": 0.5960140824317932, "learning_rate": 2.2106675218622407e-05, "loss": 0.0351, "step": 156100 }, { "epoch": 1.1606910644621957, "grad_norm": 0.26346030831336975, "learning_rate": 2.2087120441781518e-05, "loss": 0.0374, "step": 156200 }, { "epoch": 1.161434144529073, "grad_norm": 0.655240535736084, "learning_rate": 2.2067565664940633e-05, "loss": 0.0322, "step": 156300 }, { "epoch": 1.1621772245959503, "grad_norm": 0.4094662666320801, "learning_rate": 2.2048010888099748e-05, "loss": 0.0304, "step": 156400 }, { "epoch": 1.1629203046628274, "grad_norm": 0.3913063108921051, "learning_rate": 2.202845611125886e-05, "loss": 0.0358, "step": 156500 }, { "epoch": 1.1636633847297047, "grad_norm": 0.192095547914505, "learning_rate": 2.2008901334417974e-05, "loss": 0.037, "step": 156600 }, { "epoch": 1.1644064647965817, "grad_norm": 0.6261915564537048, "learning_rate": 2.1989346557577085e-05, "loss": 0.0309, "step": 156700 }, { "epoch": 1.165149544863459, "grad_norm": 0.49710938334465027, "learning_rate": 2.19697917807362e-05, "loss": 0.0343, "step": 156800 }, { "epoch": 1.1658926249303363, "grad_norm": 0.10723802447319031, "learning_rate": 2.1950237003895315e-05, "loss": 0.0403, "step": 156900 }, { "epoch": 1.1666357049972134, "grad_norm": 0.277936190366745, "learning_rate": 2.1930682227054426e-05, "loss": 0.0318, "step": 157000 }, { "epoch": 1.1673787850640907, "grad_norm": 0.713280975818634, "learning_rate": 2.191112745021354e-05, "loss": 0.0298, "step": 157100 }, { "epoch": 1.168121865130968, "grad_norm": 0.44327324628829956, "learning_rate": 2.1891572673372652e-05, "loss": 0.0385, "step": 157200 }, { "epoch": 1.168864945197845, "grad_norm": 0.19714969396591187, "learning_rate": 2.1872017896531767e-05, "loss": 0.0329, "step": 157300 }, { "epoch": 1.1696080252647223, "grad_norm": 0.7242419719696045, "learning_rate": 2.1852463119690882e-05, "loss": 0.0375, "step": 157400 }, { "epoch": 1.1703511053315996, "grad_norm": 0.9834973216056824, "learning_rate": 2.1832908342849993e-05, "loss": 0.0329, "step": 157500 }, { "epoch": 1.1710941853984767, "grad_norm": 0.9900821447372437, "learning_rate": 2.1813353566009108e-05, "loss": 0.0368, "step": 157600 }, { "epoch": 1.171837265465354, "grad_norm": 0.7530263662338257, "learning_rate": 2.179379878916822e-05, "loss": 0.0398, "step": 157700 }, { "epoch": 1.172580345532231, "grad_norm": 0.2871798276901245, "learning_rate": 2.1774244012327334e-05, "loss": 0.041, "step": 157800 }, { "epoch": 1.1733234255991083, "grad_norm": 0.2379627674818039, "learning_rate": 2.1754689235486445e-05, "loss": 0.0387, "step": 157900 }, { "epoch": 1.1740665056659856, "grad_norm": 0.9080613255500793, "learning_rate": 2.173513445864556e-05, "loss": 0.0409, "step": 158000 }, { "epoch": 1.1748095857328626, "grad_norm": 0.10046412795782089, "learning_rate": 2.1715579681804675e-05, "loss": 0.0456, "step": 158100 }, { "epoch": 1.17555266579974, "grad_norm": 0.5864273309707642, "learning_rate": 2.1696024904963786e-05, "loss": 0.0402, "step": 158200 }, { "epoch": 1.176295745866617, "grad_norm": 0.23751531541347504, "learning_rate": 2.16764701281229e-05, "loss": 0.0362, "step": 158300 }, { "epoch": 1.1770388259334943, "grad_norm": 0.7018410563468933, "learning_rate": 2.1656915351282012e-05, "loss": 0.0386, "step": 158400 }, { "epoch": 1.1777819060003716, "grad_norm": 0.24361896514892578, "learning_rate": 2.1637360574441127e-05, "loss": 0.0362, "step": 158500 }, { "epoch": 1.1785249860672486, "grad_norm": 0.4930298626422882, "learning_rate": 2.161780579760024e-05, "loss": 0.032, "step": 158600 }, { "epoch": 1.179268066134126, "grad_norm": 0.7398736476898193, "learning_rate": 2.1598251020759353e-05, "loss": 0.0412, "step": 158700 }, { "epoch": 1.1800111462010032, "grad_norm": 0.6994415521621704, "learning_rate": 2.1578696243918465e-05, "loss": 0.035, "step": 158800 }, { "epoch": 1.1807542262678803, "grad_norm": 11.663919448852539, "learning_rate": 2.155914146707758e-05, "loss": 0.0392, "step": 158900 }, { "epoch": 1.1814973063347576, "grad_norm": 0.20914815366268158, "learning_rate": 2.1539586690236694e-05, "loss": 0.0307, "step": 159000 }, { "epoch": 1.1822403864016349, "grad_norm": 0.2456151247024536, "learning_rate": 2.1520031913395806e-05, "loss": 0.0357, "step": 159100 }, { "epoch": 1.182983466468512, "grad_norm": 0.45246168971061707, "learning_rate": 2.150047713655492e-05, "loss": 0.0393, "step": 159200 }, { "epoch": 1.1837265465353892, "grad_norm": 1.8400678634643555, "learning_rate": 2.1480922359714032e-05, "loss": 0.0327, "step": 159300 }, { "epoch": 1.1844696266022665, "grad_norm": 0.35787954926490784, "learning_rate": 2.1461367582873147e-05, "loss": 0.0362, "step": 159400 }, { "epoch": 1.1852127066691436, "grad_norm": 0.5125070214271545, "learning_rate": 2.1441812806032258e-05, "loss": 0.0427, "step": 159500 }, { "epoch": 1.1859557867360209, "grad_norm": 1.1087726354599, "learning_rate": 2.1422258029191373e-05, "loss": 0.0366, "step": 159600 }, { "epoch": 1.186698866802898, "grad_norm": 0.3815215826034546, "learning_rate": 2.1402703252350484e-05, "loss": 0.0296, "step": 159700 }, { "epoch": 1.1874419468697752, "grad_norm": 0.20366232097148895, "learning_rate": 2.13831484755096e-05, "loss": 0.0366, "step": 159800 }, { "epoch": 1.1881850269366525, "grad_norm": 0.6491705179214478, "learning_rate": 2.1363593698668714e-05, "loss": 0.0309, "step": 159900 }, { "epoch": 1.1889281070035296, "grad_norm": 0.8409714102745056, "learning_rate": 2.1344038921827825e-05, "loss": 0.032, "step": 160000 }, { "epoch": 1.1896711870704069, "grad_norm": 0.16338732838630676, "learning_rate": 2.132448414498694e-05, "loss": 0.0382, "step": 160100 }, { "epoch": 1.190414267137284, "grad_norm": 0.17091412842273712, "learning_rate": 2.130492936814605e-05, "loss": 0.0309, "step": 160200 }, { "epoch": 1.1911573472041612, "grad_norm": 0.3520488440990448, "learning_rate": 2.1285374591305166e-05, "loss": 0.0289, "step": 160300 }, { "epoch": 1.1919004272710385, "grad_norm": 2.472191333770752, "learning_rate": 2.1265819814464277e-05, "loss": 0.035, "step": 160400 }, { "epoch": 1.1926435073379156, "grad_norm": 0.25020912289619446, "learning_rate": 2.1246265037623392e-05, "loss": 0.031, "step": 160500 }, { "epoch": 1.1933865874047929, "grad_norm": 0.4794769883155823, "learning_rate": 2.1226710260782507e-05, "loss": 0.0378, "step": 160600 }, { "epoch": 1.1941296674716702, "grad_norm": 0.7542557120323181, "learning_rate": 2.1207155483941618e-05, "loss": 0.0405, "step": 160700 }, { "epoch": 1.1948727475385472, "grad_norm": 0.27544960379600525, "learning_rate": 2.1187600707100733e-05, "loss": 0.0329, "step": 160800 }, { "epoch": 1.1956158276054245, "grad_norm": 0.3808498978614807, "learning_rate": 2.1168045930259844e-05, "loss": 0.0339, "step": 160900 }, { "epoch": 1.1963589076723018, "grad_norm": 3.467193365097046, "learning_rate": 2.114849115341896e-05, "loss": 0.0451, "step": 161000 }, { "epoch": 1.1971019877391789, "grad_norm": 0.21264220774173737, "learning_rate": 2.112893637657807e-05, "loss": 0.0377, "step": 161100 }, { "epoch": 1.1978450678060562, "grad_norm": 1.3813154697418213, "learning_rate": 2.1109381599737185e-05, "loss": 0.0341, "step": 161200 }, { "epoch": 1.1985881478729332, "grad_norm": 0.0940852239727974, "learning_rate": 2.1089826822896297e-05, "loss": 0.0392, "step": 161300 }, { "epoch": 1.1993312279398105, "grad_norm": 0.7861352562904358, "learning_rate": 2.107027204605541e-05, "loss": 0.0349, "step": 161400 }, { "epoch": 1.2000743080066878, "grad_norm": 0.23223145306110382, "learning_rate": 2.1050717269214526e-05, "loss": 0.0324, "step": 161500 }, { "epoch": 1.2008173880735649, "grad_norm": 1.5023870468139648, "learning_rate": 2.1031162492373637e-05, "loss": 0.0326, "step": 161600 }, { "epoch": 1.2015604681404422, "grad_norm": 0.2249160259962082, "learning_rate": 2.1011607715532752e-05, "loss": 0.039, "step": 161700 }, { "epoch": 1.2023035482073192, "grad_norm": 0.870427131652832, "learning_rate": 2.0992052938691864e-05, "loss": 0.042, "step": 161800 }, { "epoch": 1.2030466282741965, "grad_norm": 0.34037265181541443, "learning_rate": 2.097249816185098e-05, "loss": 0.0325, "step": 161900 }, { "epoch": 1.2037897083410738, "grad_norm": 0.2891308665275574, "learning_rate": 2.095294338501009e-05, "loss": 0.0336, "step": 162000 }, { "epoch": 1.2045327884079509, "grad_norm": 0.8406009078025818, "learning_rate": 2.0933388608169205e-05, "loss": 0.0349, "step": 162100 }, { "epoch": 1.2052758684748281, "grad_norm": 0.26261410117149353, "learning_rate": 2.0913833831328316e-05, "loss": 0.0326, "step": 162200 }, { "epoch": 1.2060189485417054, "grad_norm": 0.43828654289245605, "learning_rate": 2.089427905448743e-05, "loss": 0.0402, "step": 162300 }, { "epoch": 1.2067620286085825, "grad_norm": 0.7037879824638367, "learning_rate": 2.0874724277646545e-05, "loss": 0.0317, "step": 162400 }, { "epoch": 1.2075051086754598, "grad_norm": 0.045462850481271744, "learning_rate": 2.0855169500805657e-05, "loss": 0.0386, "step": 162500 }, { "epoch": 1.208248188742337, "grad_norm": 6.583676815032959, "learning_rate": 2.083561472396477e-05, "loss": 0.034, "step": 162600 }, { "epoch": 1.2089912688092141, "grad_norm": 0.6324894428253174, "learning_rate": 2.0816059947123883e-05, "loss": 0.0325, "step": 162700 }, { "epoch": 1.2097343488760914, "grad_norm": 0.722091794013977, "learning_rate": 2.0796505170282998e-05, "loss": 0.0372, "step": 162800 }, { "epoch": 1.2104774289429687, "grad_norm": 0.8943377137184143, "learning_rate": 2.077695039344211e-05, "loss": 0.0362, "step": 162900 }, { "epoch": 1.2112205090098458, "grad_norm": 0.3651637136936188, "learning_rate": 2.0757395616601224e-05, "loss": 0.0336, "step": 163000 }, { "epoch": 1.211963589076723, "grad_norm": 0.49552422761917114, "learning_rate": 2.0737840839760335e-05, "loss": 0.0341, "step": 163100 }, { "epoch": 1.2127066691436001, "grad_norm": 0.9959818720817566, "learning_rate": 2.071828606291945e-05, "loss": 0.0332, "step": 163200 }, { "epoch": 1.2134497492104774, "grad_norm": 0.33852240443229675, "learning_rate": 2.0698731286078565e-05, "loss": 0.0532, "step": 163300 }, { "epoch": 1.2141928292773547, "grad_norm": 3.6180288791656494, "learning_rate": 2.0679176509237676e-05, "loss": 0.0336, "step": 163400 }, { "epoch": 1.2149359093442318, "grad_norm": 0.7133389711380005, "learning_rate": 2.065962173239679e-05, "loss": 0.0397, "step": 163500 }, { "epoch": 1.215678989411109, "grad_norm": 0.3096759021282196, "learning_rate": 2.0640066955555902e-05, "loss": 0.0343, "step": 163600 }, { "epoch": 1.2164220694779861, "grad_norm": 0.31376340985298157, "learning_rate": 2.0620512178715017e-05, "loss": 0.0371, "step": 163700 }, { "epoch": 1.2171651495448634, "grad_norm": 0.7873936891555786, "learning_rate": 2.060095740187413e-05, "loss": 0.0363, "step": 163800 }, { "epoch": 1.2179082296117407, "grad_norm": 0.09759629517793655, "learning_rate": 2.0581402625033243e-05, "loss": 0.0401, "step": 163900 }, { "epoch": 1.2186513096786178, "grad_norm": 0.3024936616420746, "learning_rate": 2.0561847848192358e-05, "loss": 0.0398, "step": 164000 }, { "epoch": 1.219394389745495, "grad_norm": 1.6954450607299805, "learning_rate": 2.054229307135147e-05, "loss": 0.0334, "step": 164100 }, { "epoch": 1.2201374698123724, "grad_norm": 0.764953076839447, "learning_rate": 2.0522738294510584e-05, "loss": 0.0382, "step": 164200 }, { "epoch": 1.2208805498792494, "grad_norm": 0.8107950091362, "learning_rate": 2.0503183517669695e-05, "loss": 0.0292, "step": 164300 }, { "epoch": 1.2216236299461267, "grad_norm": 0.3558790981769562, "learning_rate": 2.048362874082881e-05, "loss": 0.0335, "step": 164400 }, { "epoch": 1.222366710013004, "grad_norm": 0.653571367263794, "learning_rate": 2.046407396398792e-05, "loss": 0.0351, "step": 164500 }, { "epoch": 1.223109790079881, "grad_norm": 0.2760164439678192, "learning_rate": 2.0444519187147036e-05, "loss": 0.0423, "step": 164600 }, { "epoch": 1.2238528701467584, "grad_norm": 0.8808141946792603, "learning_rate": 2.0424964410306148e-05, "loss": 0.0355, "step": 164700 }, { "epoch": 1.2245959502136354, "grad_norm": 0.1933758705854416, "learning_rate": 2.0405409633465263e-05, "loss": 0.0258, "step": 164800 }, { "epoch": 1.2253390302805127, "grad_norm": 0.2783760130405426, "learning_rate": 2.0385854856624377e-05, "loss": 0.0398, "step": 164900 }, { "epoch": 1.22608211034739, "grad_norm": 0.9121683239936829, "learning_rate": 2.036630007978349e-05, "loss": 0.0367, "step": 165000 }, { "epoch": 1.226825190414267, "grad_norm": 0.6222537159919739, "learning_rate": 2.0346745302942603e-05, "loss": 0.0297, "step": 165100 }, { "epoch": 1.2275682704811444, "grad_norm": 0.42696240544319153, "learning_rate": 2.0327190526101715e-05, "loss": 0.0301, "step": 165200 }, { "epoch": 1.2283113505480214, "grad_norm": 0.4008450210094452, "learning_rate": 2.030763574926083e-05, "loss": 0.0362, "step": 165300 }, { "epoch": 1.2290544306148987, "grad_norm": 0.41091880202293396, "learning_rate": 2.0288080972419944e-05, "loss": 0.0314, "step": 165400 }, { "epoch": 1.229797510681776, "grad_norm": 0.5371354222297668, "learning_rate": 2.0268526195579056e-05, "loss": 0.0381, "step": 165500 }, { "epoch": 1.230540590748653, "grad_norm": 0.737773597240448, "learning_rate": 2.024897141873817e-05, "loss": 0.0367, "step": 165600 }, { "epoch": 1.2312836708155304, "grad_norm": 0.12454330921173096, "learning_rate": 2.0229416641897285e-05, "loss": 0.0286, "step": 165700 }, { "epoch": 1.2320267508824077, "grad_norm": 0.21534515917301178, "learning_rate": 2.0209861865056397e-05, "loss": 0.0338, "step": 165800 }, { "epoch": 1.2327698309492847, "grad_norm": 0.2900365889072418, "learning_rate": 2.019030708821551e-05, "loss": 0.0352, "step": 165900 }, { "epoch": 1.233512911016162, "grad_norm": 0.9813499450683594, "learning_rate": 2.0170752311374623e-05, "loss": 0.0278, "step": 166000 }, { "epoch": 1.2342559910830393, "grad_norm": 0.332019567489624, "learning_rate": 2.0151197534533738e-05, "loss": 0.0449, "step": 166100 }, { "epoch": 1.2349990711499164, "grad_norm": 0.2970161437988281, "learning_rate": 2.0131642757692852e-05, "loss": 0.0303, "step": 166200 }, { "epoch": 1.2357421512167936, "grad_norm": 2.780109405517578, "learning_rate": 2.0112087980851964e-05, "loss": 0.031, "step": 166300 }, { "epoch": 1.236485231283671, "grad_norm": 3.780625820159912, "learning_rate": 2.009253320401108e-05, "loss": 0.0395, "step": 166400 }, { "epoch": 1.237228311350548, "grad_norm": 3.4802348613739014, "learning_rate": 2.007297842717019e-05, "loss": 0.0339, "step": 166500 }, { "epoch": 1.2379713914174253, "grad_norm": 1.410647988319397, "learning_rate": 2.0053423650329305e-05, "loss": 0.037, "step": 166600 }, { "epoch": 1.2387144714843024, "grad_norm": 1.5423465967178345, "learning_rate": 2.003386887348842e-05, "loss": 0.0416, "step": 166700 }, { "epoch": 1.2394575515511796, "grad_norm": 0.8610843420028687, "learning_rate": 2.001431409664753e-05, "loss": 0.0356, "step": 166800 }, { "epoch": 1.240200631618057, "grad_norm": 0.2917674779891968, "learning_rate": 1.9994759319806645e-05, "loss": 0.0305, "step": 166900 }, { "epoch": 1.240943711684934, "grad_norm": 0.5899860858917236, "learning_rate": 1.9975204542965757e-05, "loss": 0.0371, "step": 167000 }, { "epoch": 1.2416867917518113, "grad_norm": 0.2555330991744995, "learning_rate": 1.995564976612487e-05, "loss": 0.0346, "step": 167100 }, { "epoch": 1.2424298718186884, "grad_norm": 1.187002420425415, "learning_rate": 1.9936094989283983e-05, "loss": 0.0357, "step": 167200 }, { "epoch": 1.2431729518855656, "grad_norm": 1.048977017402649, "learning_rate": 1.9916540212443098e-05, "loss": 0.033, "step": 167300 }, { "epoch": 1.243916031952443, "grad_norm": 1.1387991905212402, "learning_rate": 1.9896985435602213e-05, "loss": 0.0298, "step": 167400 }, { "epoch": 1.24465911201932, "grad_norm": 0.831271231174469, "learning_rate": 1.9877430658761324e-05, "loss": 0.03, "step": 167500 }, { "epoch": 1.2454021920861973, "grad_norm": 1.4371176958084106, "learning_rate": 1.985787588192044e-05, "loss": 0.0347, "step": 167600 }, { "epoch": 1.2461452721530746, "grad_norm": 0.257040798664093, "learning_rate": 1.983832110507955e-05, "loss": 0.0327, "step": 167700 }, { "epoch": 1.2468883522199516, "grad_norm": 0.4376273453235626, "learning_rate": 1.9818766328238665e-05, "loss": 0.035, "step": 167800 }, { "epoch": 1.247631432286829, "grad_norm": 0.5188146233558655, "learning_rate": 1.9799211551397776e-05, "loss": 0.0331, "step": 167900 }, { "epoch": 1.2483745123537062, "grad_norm": 0.32235074043273926, "learning_rate": 1.977965677455689e-05, "loss": 0.0339, "step": 168000 }, { "epoch": 1.2491175924205833, "grad_norm": 0.2912561595439911, "learning_rate": 1.9760101997716002e-05, "loss": 0.0287, "step": 168100 }, { "epoch": 1.2498606724874606, "grad_norm": 0.41709116101264954, "learning_rate": 1.9740547220875117e-05, "loss": 0.0371, "step": 168200 }, { "epoch": 1.2506037525543379, "grad_norm": 0.7136671543121338, "learning_rate": 1.9720992444034232e-05, "loss": 0.0388, "step": 168300 }, { "epoch": 1.251346832621215, "grad_norm": 0.5286296010017395, "learning_rate": 1.9701437667193343e-05, "loss": 0.0312, "step": 168400 }, { "epoch": 1.2520899126880922, "grad_norm": 0.3966440260410309, "learning_rate": 1.9681882890352458e-05, "loss": 0.0293, "step": 168500 }, { "epoch": 1.2528329927549693, "grad_norm": 0.3470055162906647, "learning_rate": 1.966232811351157e-05, "loss": 0.0339, "step": 168600 }, { "epoch": 1.2535760728218466, "grad_norm": 0.259016752243042, "learning_rate": 1.9642773336670684e-05, "loss": 0.0395, "step": 168700 }, { "epoch": 1.2543191528887236, "grad_norm": 0.34805652499198914, "learning_rate": 1.9623218559829796e-05, "loss": 0.0351, "step": 168800 }, { "epoch": 1.255062232955601, "grad_norm": 0.3001208007335663, "learning_rate": 1.960366378298891e-05, "loss": 0.0351, "step": 168900 }, { "epoch": 1.2558053130224782, "grad_norm": 0.3897779881954193, "learning_rate": 1.958410900614802e-05, "loss": 0.0361, "step": 169000 }, { "epoch": 1.2565483930893553, "grad_norm": 0.19885696470737457, "learning_rate": 1.9564554229307136e-05, "loss": 0.0278, "step": 169100 }, { "epoch": 1.2572914731562326, "grad_norm": 0.5609990954399109, "learning_rate": 1.954499945246625e-05, "loss": 0.0356, "step": 169200 }, { "epoch": 1.2580345532231099, "grad_norm": 0.24348309636116028, "learning_rate": 1.9525444675625363e-05, "loss": 0.0367, "step": 169300 }, { "epoch": 1.258777633289987, "grad_norm": 2.3442747592926025, "learning_rate": 1.9505889898784477e-05, "loss": 0.0356, "step": 169400 }, { "epoch": 1.2595207133568642, "grad_norm": 0.479087233543396, "learning_rate": 1.948633512194359e-05, "loss": 0.0318, "step": 169500 }, { "epoch": 1.2602637934237415, "grad_norm": 0.205229252576828, "learning_rate": 1.9466780345102703e-05, "loss": 0.0317, "step": 169600 }, { "epoch": 1.2610068734906186, "grad_norm": 0.11361387372016907, "learning_rate": 1.9447225568261815e-05, "loss": 0.0313, "step": 169700 }, { "epoch": 1.2617499535574959, "grad_norm": 0.4740273952484131, "learning_rate": 1.942767079142093e-05, "loss": 0.0375, "step": 169800 }, { "epoch": 1.2624930336243732, "grad_norm": 0.1905169039964676, "learning_rate": 1.9408116014580044e-05, "loss": 0.0369, "step": 169900 }, { "epoch": 1.2632361136912502, "grad_norm": 1.28603994846344, "learning_rate": 1.9388561237739156e-05, "loss": 0.042, "step": 170000 }, { "epoch": 1.2639791937581275, "grad_norm": 1.919394612312317, "learning_rate": 1.936900646089827e-05, "loss": 0.042, "step": 170100 }, { "epoch": 1.2647222738250046, "grad_norm": 0.9825271964073181, "learning_rate": 1.9349451684057382e-05, "loss": 0.0383, "step": 170200 }, { "epoch": 1.2654653538918819, "grad_norm": 0.28312695026397705, "learning_rate": 1.9329896907216497e-05, "loss": 0.0345, "step": 170300 }, { "epoch": 1.266208433958759, "grad_norm": 0.26713550090789795, "learning_rate": 1.9310342130375608e-05, "loss": 0.0299, "step": 170400 }, { "epoch": 1.2669515140256362, "grad_norm": 1.1491830348968506, "learning_rate": 1.9290787353534723e-05, "loss": 0.0374, "step": 170500 }, { "epoch": 1.2676945940925135, "grad_norm": 0.4354964792728424, "learning_rate": 1.9271232576693834e-05, "loss": 0.03, "step": 170600 }, { "epoch": 1.2684376741593906, "grad_norm": 0.3654046654701233, "learning_rate": 1.925167779985295e-05, "loss": 0.0294, "step": 170700 }, { "epoch": 1.2691807542262679, "grad_norm": 0.4584351181983948, "learning_rate": 1.9232123023012064e-05, "loss": 0.03, "step": 170800 }, { "epoch": 1.2699238342931451, "grad_norm": 1.598044991493225, "learning_rate": 1.9212568246171175e-05, "loss": 0.0434, "step": 170900 }, { "epoch": 1.2706669143600222, "grad_norm": 0.12666597962379456, "learning_rate": 1.919301346933029e-05, "loss": 0.0388, "step": 171000 }, { "epoch": 1.2714099944268995, "grad_norm": 0.9374894499778748, "learning_rate": 1.91734586924894e-05, "loss": 0.0369, "step": 171100 }, { "epoch": 1.2721530744937768, "grad_norm": 0.2855191230773926, "learning_rate": 1.9153903915648516e-05, "loss": 0.0367, "step": 171200 }, { "epoch": 1.2728961545606539, "grad_norm": 0.9730690121650696, "learning_rate": 1.9134349138807627e-05, "loss": 0.0388, "step": 171300 }, { "epoch": 1.2736392346275311, "grad_norm": 0.5412880182266235, "learning_rate": 1.9114794361966742e-05, "loss": 0.0327, "step": 171400 }, { "epoch": 1.2743823146944084, "grad_norm": 0.28647181391716003, "learning_rate": 1.9095239585125853e-05, "loss": 0.0317, "step": 171500 }, { "epoch": 1.2751253947612855, "grad_norm": 0.42807912826538086, "learning_rate": 1.9075684808284968e-05, "loss": 0.0421, "step": 171600 }, { "epoch": 1.2758684748281628, "grad_norm": 0.511730432510376, "learning_rate": 1.9056130031444083e-05, "loss": 0.0293, "step": 171700 }, { "epoch": 1.27661155489504, "grad_norm": 0.4729938507080078, "learning_rate": 1.9036575254603194e-05, "loss": 0.0422, "step": 171800 }, { "epoch": 1.2773546349619171, "grad_norm": 0.26302099227905273, "learning_rate": 1.901702047776231e-05, "loss": 0.0317, "step": 171900 }, { "epoch": 1.2780977150287944, "grad_norm": 0.2880276143550873, "learning_rate": 1.899746570092142e-05, "loss": 0.0311, "step": 172000 }, { "epoch": 1.2788407950956715, "grad_norm": 1.155226230621338, "learning_rate": 1.8977910924080535e-05, "loss": 0.0308, "step": 172100 }, { "epoch": 1.2795838751625488, "grad_norm": 1.3296387195587158, "learning_rate": 1.8958356147239647e-05, "loss": 0.0319, "step": 172200 }, { "epoch": 1.2803269552294259, "grad_norm": 1.0214375257492065, "learning_rate": 1.893880137039876e-05, "loss": 0.0397, "step": 172300 }, { "epoch": 1.2810700352963031, "grad_norm": 0.7013433575630188, "learning_rate": 1.8919246593557873e-05, "loss": 0.0334, "step": 172400 }, { "epoch": 1.2818131153631804, "grad_norm": 0.16688206791877747, "learning_rate": 1.8899691816716988e-05, "loss": 0.0367, "step": 172500 }, { "epoch": 1.2825561954300575, "grad_norm": 0.3773268163204193, "learning_rate": 1.8880137039876102e-05, "loss": 0.0362, "step": 172600 }, { "epoch": 1.2832992754969348, "grad_norm": 0.0466952919960022, "learning_rate": 1.8860582263035214e-05, "loss": 0.0362, "step": 172700 }, { "epoch": 1.284042355563812, "grad_norm": 2.628040075302124, "learning_rate": 1.884102748619433e-05, "loss": 0.0389, "step": 172800 }, { "epoch": 1.2847854356306891, "grad_norm": 0.4646158218383789, "learning_rate": 1.882147270935344e-05, "loss": 0.0302, "step": 172900 }, { "epoch": 1.2855285156975664, "grad_norm": 0.6021349430084229, "learning_rate": 1.8801917932512555e-05, "loss": 0.0366, "step": 173000 }, { "epoch": 1.2862715957644437, "grad_norm": 1.8797307014465332, "learning_rate": 1.8782363155671666e-05, "loss": 0.0398, "step": 173100 }, { "epoch": 1.2870146758313208, "grad_norm": 0.10523168742656708, "learning_rate": 1.876280837883078e-05, "loss": 0.0273, "step": 173200 }, { "epoch": 1.287757755898198, "grad_norm": 0.16543729603290558, "learning_rate": 1.8743253601989896e-05, "loss": 0.0349, "step": 173300 }, { "epoch": 1.2885008359650754, "grad_norm": 0.25691303610801697, "learning_rate": 1.8723698825149007e-05, "loss": 0.0391, "step": 173400 }, { "epoch": 1.2892439160319524, "grad_norm": 0.1685878187417984, "learning_rate": 1.870414404830812e-05, "loss": 0.035, "step": 173500 }, { "epoch": 1.2899869960988297, "grad_norm": 0.5355182886123657, "learning_rate": 1.8684589271467233e-05, "loss": 0.0438, "step": 173600 }, { "epoch": 1.2907300761657068, "grad_norm": 0.31830403208732605, "learning_rate": 1.8665034494626348e-05, "loss": 0.0367, "step": 173700 }, { "epoch": 1.291473156232584, "grad_norm": 0.9692487120628357, "learning_rate": 1.864547971778546e-05, "loss": 0.029, "step": 173800 }, { "epoch": 1.2922162362994611, "grad_norm": 1.5226008892059326, "learning_rate": 1.8625924940944574e-05, "loss": 0.0409, "step": 173900 }, { "epoch": 1.2929593163663384, "grad_norm": 0.7756367325782776, "learning_rate": 1.8606370164103685e-05, "loss": 0.0383, "step": 174000 }, { "epoch": 1.2937023964332157, "grad_norm": 0.5222955346107483, "learning_rate": 1.85868153872628e-05, "loss": 0.0297, "step": 174100 }, { "epoch": 1.2944454765000928, "grad_norm": 0.9581413865089417, "learning_rate": 1.8567260610421915e-05, "loss": 0.042, "step": 174200 }, { "epoch": 1.29518855656697, "grad_norm": 0.22693189978599548, "learning_rate": 1.8547705833581026e-05, "loss": 0.0329, "step": 174300 }, { "epoch": 1.2959316366338474, "grad_norm": 0.48964324593544006, "learning_rate": 1.852815105674014e-05, "loss": 0.0268, "step": 174400 }, { "epoch": 1.2966747167007244, "grad_norm": 0.9720394611358643, "learning_rate": 1.8508596279899252e-05, "loss": 0.0366, "step": 174500 }, { "epoch": 1.2974177967676017, "grad_norm": 0.8849733471870422, "learning_rate": 1.8489041503058367e-05, "loss": 0.0363, "step": 174600 }, { "epoch": 1.298160876834479, "grad_norm": 0.6698055267333984, "learning_rate": 1.8469486726217482e-05, "loss": 0.0339, "step": 174700 }, { "epoch": 1.298903956901356, "grad_norm": 0.35203811526298523, "learning_rate": 1.8449931949376593e-05, "loss": 0.0375, "step": 174800 }, { "epoch": 1.2996470369682334, "grad_norm": 0.31598934531211853, "learning_rate": 1.8430377172535708e-05, "loss": 0.0322, "step": 174900 }, { "epoch": 1.3003901170351106, "grad_norm": 0.5569741129875183, "learning_rate": 1.8410822395694823e-05, "loss": 0.03, "step": 175000 }, { "epoch": 1.3011331971019877, "grad_norm": 0.21456238627433777, "learning_rate": 1.8391267618853934e-05, "loss": 0.032, "step": 175100 }, { "epoch": 1.301876277168865, "grad_norm": 0.37482917308807373, "learning_rate": 1.837171284201305e-05, "loss": 0.0353, "step": 175200 }, { "epoch": 1.302619357235742, "grad_norm": 0.7771459221839905, "learning_rate": 1.835215806517216e-05, "loss": 0.0371, "step": 175300 }, { "epoch": 1.3033624373026194, "grad_norm": 0.2363997995853424, "learning_rate": 1.8332603288331275e-05, "loss": 0.0337, "step": 175400 }, { "epoch": 1.3041055173694964, "grad_norm": 0.38741201162338257, "learning_rate": 1.831304851149039e-05, "loss": 0.0334, "step": 175500 }, { "epoch": 1.3048485974363737, "grad_norm": 2.422607183456421, "learning_rate": 1.82934937346495e-05, "loss": 0.0409, "step": 175600 }, { "epoch": 1.305591677503251, "grad_norm": 0.12574341893196106, "learning_rate": 1.8273938957808616e-05, "loss": 0.0323, "step": 175700 }, { "epoch": 1.306334757570128, "grad_norm": 0.12055402994155884, "learning_rate": 1.8254384180967727e-05, "loss": 0.0349, "step": 175800 }, { "epoch": 1.3070778376370054, "grad_norm": 0.6625957489013672, "learning_rate": 1.8234829404126842e-05, "loss": 0.0322, "step": 175900 }, { "epoch": 1.3078209177038826, "grad_norm": 0.16382990777492523, "learning_rate": 1.8215274627285957e-05, "loss": 0.0342, "step": 176000 }, { "epoch": 1.3085639977707597, "grad_norm": 0.45729202032089233, "learning_rate": 1.8195719850445068e-05, "loss": 0.0312, "step": 176100 }, { "epoch": 1.309307077837637, "grad_norm": 0.635960578918457, "learning_rate": 1.8176165073604183e-05, "loss": 0.0304, "step": 176200 }, { "epoch": 1.3100501579045143, "grad_norm": 0.39315423369407654, "learning_rate": 1.8156610296763294e-05, "loss": 0.0414, "step": 176300 }, { "epoch": 1.3107932379713914, "grad_norm": 0.7515998482704163, "learning_rate": 1.813705551992241e-05, "loss": 0.042, "step": 176400 }, { "epoch": 1.3115363180382686, "grad_norm": 0.2626771032810211, "learning_rate": 1.811750074308152e-05, "loss": 0.0266, "step": 176500 }, { "epoch": 1.312279398105146, "grad_norm": 0.11907365918159485, "learning_rate": 1.8097945966240635e-05, "loss": 0.0411, "step": 176600 }, { "epoch": 1.313022478172023, "grad_norm": 0.5965194702148438, "learning_rate": 1.807839118939975e-05, "loss": 0.0363, "step": 176700 }, { "epoch": 1.3137655582389003, "grad_norm": 0.40592366456985474, "learning_rate": 1.805883641255886e-05, "loss": 0.0348, "step": 176800 }, { "epoch": 1.3145086383057776, "grad_norm": 0.558479905128479, "learning_rate": 1.8039281635717976e-05, "loss": 0.0386, "step": 176900 }, { "epoch": 1.3152517183726546, "grad_norm": 0.02807101234793663, "learning_rate": 1.8019726858877088e-05, "loss": 0.0397, "step": 177000 }, { "epoch": 1.315994798439532, "grad_norm": 2.3857409954071045, "learning_rate": 1.8000172082036202e-05, "loss": 0.0339, "step": 177100 }, { "epoch": 1.316737878506409, "grad_norm": 0.5789180994033813, "learning_rate": 1.7980617305195314e-05, "loss": 0.035, "step": 177200 }, { "epoch": 1.3174809585732863, "grad_norm": 0.8339529037475586, "learning_rate": 1.796106252835443e-05, "loss": 0.0289, "step": 177300 }, { "epoch": 1.3182240386401634, "grad_norm": 0.26358193159103394, "learning_rate": 1.794150775151354e-05, "loss": 0.0362, "step": 177400 }, { "epoch": 1.3189671187070406, "grad_norm": 0.3231770396232605, "learning_rate": 1.7921952974672655e-05, "loss": 0.029, "step": 177500 }, { "epoch": 1.319710198773918, "grad_norm": 0.18534265458583832, "learning_rate": 1.790239819783177e-05, "loss": 0.0342, "step": 177600 }, { "epoch": 1.320453278840795, "grad_norm": 0.3210926353931427, "learning_rate": 1.788284342099088e-05, "loss": 0.0388, "step": 177700 }, { "epoch": 1.3211963589076723, "grad_norm": 0.49060729146003723, "learning_rate": 1.7863288644149996e-05, "loss": 0.0361, "step": 177800 }, { "epoch": 1.3219394389745496, "grad_norm": 0.31875815987586975, "learning_rate": 1.7843733867309107e-05, "loss": 0.0383, "step": 177900 }, { "epoch": 1.3226825190414266, "grad_norm": 1.551681637763977, "learning_rate": 1.7824179090468222e-05, "loss": 0.0358, "step": 178000 }, { "epoch": 1.323425599108304, "grad_norm": 0.13733431696891785, "learning_rate": 1.7804624313627333e-05, "loss": 0.0357, "step": 178100 }, { "epoch": 1.3241686791751812, "grad_norm": 0.2755758762359619, "learning_rate": 1.7785069536786448e-05, "loss": 0.0382, "step": 178200 }, { "epoch": 1.3249117592420583, "grad_norm": 0.551645040512085, "learning_rate": 1.776551475994556e-05, "loss": 0.0382, "step": 178300 }, { "epoch": 1.3256548393089356, "grad_norm": 1.7311663627624512, "learning_rate": 1.7745959983104674e-05, "loss": 0.0381, "step": 178400 }, { "epoch": 1.3263979193758129, "grad_norm": 0.5576179623603821, "learning_rate": 1.772640520626379e-05, "loss": 0.0417, "step": 178500 }, { "epoch": 1.32714099944269, "grad_norm": 0.34843137860298157, "learning_rate": 1.77068504294229e-05, "loss": 0.0303, "step": 178600 }, { "epoch": 1.3278840795095672, "grad_norm": 0.14235766232013702, "learning_rate": 1.7687295652582015e-05, "loss": 0.0299, "step": 178700 }, { "epoch": 1.3286271595764443, "grad_norm": 0.05314256623387337, "learning_rate": 1.7667740875741126e-05, "loss": 0.0318, "step": 178800 }, { "epoch": 1.3293702396433216, "grad_norm": 0.8827135562896729, "learning_rate": 1.764818609890024e-05, "loss": 0.0354, "step": 178900 }, { "epoch": 1.3301133197101986, "grad_norm": 0.7928223609924316, "learning_rate": 1.7628631322059352e-05, "loss": 0.0355, "step": 179000 }, { "epoch": 1.330856399777076, "grad_norm": 0.20383597910404205, "learning_rate": 1.7609076545218467e-05, "loss": 0.0324, "step": 179100 }, { "epoch": 1.3315994798439532, "grad_norm": 0.40109458565711975, "learning_rate": 1.7589521768377582e-05, "loss": 0.0296, "step": 179200 }, { "epoch": 1.3323425599108303, "grad_norm": 0.6617945432662964, "learning_rate": 1.7569966991536693e-05, "loss": 0.0323, "step": 179300 }, { "epoch": 1.3330856399777076, "grad_norm": 0.24502228200435638, "learning_rate": 1.7550412214695808e-05, "loss": 0.0365, "step": 179400 }, { "epoch": 1.3338287200445849, "grad_norm": 0.630061149597168, "learning_rate": 1.753085743785492e-05, "loss": 0.0329, "step": 179500 }, { "epoch": 1.334571800111462, "grad_norm": 0.1685090959072113, "learning_rate": 1.7511302661014034e-05, "loss": 0.0418, "step": 179600 }, { "epoch": 1.3353148801783392, "grad_norm": 5.210837364196777, "learning_rate": 1.7491747884173146e-05, "loss": 0.0329, "step": 179700 }, { "epoch": 1.3360579602452165, "grad_norm": 0.22399815917015076, "learning_rate": 1.747219310733226e-05, "loss": 0.0391, "step": 179800 }, { "epoch": 1.3368010403120936, "grad_norm": 0.29514431953430176, "learning_rate": 1.7452638330491372e-05, "loss": 0.0352, "step": 179900 }, { "epoch": 1.3375441203789709, "grad_norm": 4.753234386444092, "learning_rate": 1.7433083553650487e-05, "loss": 0.0374, "step": 180000 }, { "epoch": 1.3382872004458481, "grad_norm": 2.323798894882202, "learning_rate": 1.74135287768096e-05, "loss": 0.041, "step": 180100 }, { "epoch": 1.3390302805127252, "grad_norm": 1.0183219909667969, "learning_rate": 1.7393973999968713e-05, "loss": 0.0328, "step": 180200 }, { "epoch": 1.3397733605796025, "grad_norm": 0.09760218858718872, "learning_rate": 1.7374419223127827e-05, "loss": 0.0292, "step": 180300 }, { "epoch": 1.3405164406464798, "grad_norm": 0.10886653512716293, "learning_rate": 1.735486444628694e-05, "loss": 0.0323, "step": 180400 }, { "epoch": 1.3412595207133569, "grad_norm": 1.2728900909423828, "learning_rate": 1.7335309669446054e-05, "loss": 0.0332, "step": 180500 }, { "epoch": 1.3420026007802341, "grad_norm": 2.0530734062194824, "learning_rate": 1.7315754892605165e-05, "loss": 0.0368, "step": 180600 }, { "epoch": 1.3427456808471112, "grad_norm": 0.48000404238700867, "learning_rate": 1.729620011576428e-05, "loss": 0.0459, "step": 180700 }, { "epoch": 1.3434887609139885, "grad_norm": 0.33484357595443726, "learning_rate": 1.727664533892339e-05, "loss": 0.0374, "step": 180800 }, { "epoch": 1.3442318409808656, "grad_norm": 0.2930345833301544, "learning_rate": 1.7257090562082506e-05, "loss": 0.0267, "step": 180900 }, { "epoch": 1.3449749210477429, "grad_norm": 0.432262122631073, "learning_rate": 1.723753578524162e-05, "loss": 0.0319, "step": 181000 }, { "epoch": 1.3457180011146201, "grad_norm": 0.28328123688697815, "learning_rate": 1.7217981008400732e-05, "loss": 0.037, "step": 181100 }, { "epoch": 1.3464610811814972, "grad_norm": 2.5016727447509766, "learning_rate": 1.7198426231559847e-05, "loss": 0.0395, "step": 181200 }, { "epoch": 1.3472041612483745, "grad_norm": 0.46097952127456665, "learning_rate": 1.7178871454718958e-05, "loss": 0.0365, "step": 181300 }, { "epoch": 1.3479472413152518, "grad_norm": 0.3963884115219116, "learning_rate": 1.7159316677878073e-05, "loss": 0.0279, "step": 181400 }, { "epoch": 1.3486903213821289, "grad_norm": 0.166815385222435, "learning_rate": 1.7139761901037184e-05, "loss": 0.0385, "step": 181500 }, { "epoch": 1.3494334014490061, "grad_norm": 0.20438982546329498, "learning_rate": 1.71202071241963e-05, "loss": 0.0311, "step": 181600 }, { "epoch": 1.3501764815158834, "grad_norm": 0.7310097217559814, "learning_rate": 1.710065234735541e-05, "loss": 0.0314, "step": 181700 }, { "epoch": 1.3509195615827605, "grad_norm": 0.634029746055603, "learning_rate": 1.7081097570514525e-05, "loss": 0.0339, "step": 181800 }, { "epoch": 1.3516626416496378, "grad_norm": 0.20681729912757874, "learning_rate": 1.706154279367364e-05, "loss": 0.0364, "step": 181900 }, { "epoch": 1.352405721716515, "grad_norm": 0.20008142292499542, "learning_rate": 1.704198801683275e-05, "loss": 0.0371, "step": 182000 }, { "epoch": 1.3531488017833921, "grad_norm": 1.0344927310943604, "learning_rate": 1.7022433239991866e-05, "loss": 0.0482, "step": 182100 }, { "epoch": 1.3538918818502694, "grad_norm": 0.39874157309532166, "learning_rate": 1.7002878463150977e-05, "loss": 0.0356, "step": 182200 }, { "epoch": 1.3546349619171465, "grad_norm": 0.4818481504917145, "learning_rate": 1.6983323686310092e-05, "loss": 0.0319, "step": 182300 }, { "epoch": 1.3553780419840238, "grad_norm": 0.6872866749763489, "learning_rate": 1.6963768909469204e-05, "loss": 0.0365, "step": 182400 }, { "epoch": 1.3561211220509009, "grad_norm": 0.2511899471282959, "learning_rate": 1.694421413262832e-05, "loss": 0.0388, "step": 182500 }, { "epoch": 1.3568642021177781, "grad_norm": 0.24985134601593018, "learning_rate": 1.6924659355787433e-05, "loss": 0.0358, "step": 182600 }, { "epoch": 1.3576072821846554, "grad_norm": 0.03144615888595581, "learning_rate": 1.6905104578946545e-05, "loss": 0.0324, "step": 182700 }, { "epoch": 1.3583503622515325, "grad_norm": 0.4292970597743988, "learning_rate": 1.688554980210566e-05, "loss": 0.0364, "step": 182800 }, { "epoch": 1.3590934423184098, "grad_norm": 0.5636408925056458, "learning_rate": 1.686599502526477e-05, "loss": 0.0346, "step": 182900 }, { "epoch": 1.359836522385287, "grad_norm": 2.6099541187286377, "learning_rate": 1.6846440248423885e-05, "loss": 0.036, "step": 183000 }, { "epoch": 1.3605796024521641, "grad_norm": 0.4037156403064728, "learning_rate": 1.6826885471582997e-05, "loss": 0.0393, "step": 183100 }, { "epoch": 1.3613226825190414, "grad_norm": 0.5565699934959412, "learning_rate": 1.680733069474211e-05, "loss": 0.0305, "step": 183200 }, { "epoch": 1.3620657625859187, "grad_norm": 0.6247751116752625, "learning_rate": 1.6787775917901223e-05, "loss": 0.0373, "step": 183300 }, { "epoch": 1.3628088426527958, "grad_norm": 0.30978524684906006, "learning_rate": 1.6768221141060338e-05, "loss": 0.0289, "step": 183400 }, { "epoch": 1.363551922719673, "grad_norm": 0.27709728479385376, "learning_rate": 1.6748666364219452e-05, "loss": 0.0327, "step": 183500 }, { "epoch": 1.3642950027865504, "grad_norm": 0.4027794897556305, "learning_rate": 1.6729111587378564e-05, "loss": 0.0286, "step": 183600 }, { "epoch": 1.3650380828534274, "grad_norm": 0.1344064176082611, "learning_rate": 1.670955681053768e-05, "loss": 0.0419, "step": 183700 }, { "epoch": 1.3657811629203047, "grad_norm": 0.5154412984848022, "learning_rate": 1.669000203369679e-05, "loss": 0.0398, "step": 183800 }, { "epoch": 1.366524242987182, "grad_norm": 0.1703673154115677, "learning_rate": 1.6670447256855905e-05, "loss": 0.039, "step": 183900 }, { "epoch": 1.367267323054059, "grad_norm": 0.5279111862182617, "learning_rate": 1.665089248001502e-05, "loss": 0.0321, "step": 184000 }, { "epoch": 1.3680104031209364, "grad_norm": 0.0951002836227417, "learning_rate": 1.663133770317413e-05, "loss": 0.0311, "step": 184100 }, { "epoch": 1.3687534831878134, "grad_norm": 0.461066871881485, "learning_rate": 1.6611782926333246e-05, "loss": 0.0332, "step": 184200 }, { "epoch": 1.3694965632546907, "grad_norm": 1.8543422222137451, "learning_rate": 1.6592228149492357e-05, "loss": 0.0346, "step": 184300 }, { "epoch": 1.3702396433215678, "grad_norm": 0.08551236987113953, "learning_rate": 1.6572673372651472e-05, "loss": 0.0337, "step": 184400 }, { "epoch": 1.370982723388445, "grad_norm": 2.8686792850494385, "learning_rate": 1.6553118595810587e-05, "loss": 0.0285, "step": 184500 }, { "epoch": 1.3717258034553224, "grad_norm": 7.189798355102539, "learning_rate": 1.6533563818969698e-05, "loss": 0.0382, "step": 184600 }, { "epoch": 1.3724688835221994, "grad_norm": 0.4478267729282379, "learning_rate": 1.6514009042128813e-05, "loss": 0.0389, "step": 184700 }, { "epoch": 1.3732119635890767, "grad_norm": 0.7077392339706421, "learning_rate": 1.6494454265287927e-05, "loss": 0.036, "step": 184800 }, { "epoch": 1.373955043655954, "grad_norm": 1.0631729364395142, "learning_rate": 1.647489948844704e-05, "loss": 0.0394, "step": 184900 }, { "epoch": 1.374698123722831, "grad_norm": 0.09518816322088242, "learning_rate": 1.6455344711606154e-05, "loss": 0.0363, "step": 185000 }, { "epoch": 1.3754412037897084, "grad_norm": 0.24274791777133942, "learning_rate": 1.6435789934765265e-05, "loss": 0.0299, "step": 185100 }, { "epoch": 1.3761842838565856, "grad_norm": 1.0644948482513428, "learning_rate": 1.641623515792438e-05, "loss": 0.0329, "step": 185200 }, { "epoch": 1.3769273639234627, "grad_norm": 0.6517938375473022, "learning_rate": 1.6396680381083495e-05, "loss": 0.0364, "step": 185300 }, { "epoch": 1.37767044399034, "grad_norm": 0.452434778213501, "learning_rate": 1.6377125604242606e-05, "loss": 0.0348, "step": 185400 }, { "epoch": 1.3784135240572173, "grad_norm": 0.5848062634468079, "learning_rate": 1.635757082740172e-05, "loss": 0.0323, "step": 185500 }, { "epoch": 1.3791566041240944, "grad_norm": 0.1282287985086441, "learning_rate": 1.6338016050560832e-05, "loss": 0.0322, "step": 185600 }, { "epoch": 1.3798996841909716, "grad_norm": 0.4316161274909973, "learning_rate": 1.6318461273719947e-05, "loss": 0.0293, "step": 185700 }, { "epoch": 1.3806427642578487, "grad_norm": 0.29205426573753357, "learning_rate": 1.6298906496879058e-05, "loss": 0.0359, "step": 185800 }, { "epoch": 1.381385844324726, "grad_norm": 0.12817925214767456, "learning_rate": 1.6279351720038173e-05, "loss": 0.0335, "step": 185900 }, { "epoch": 1.382128924391603, "grad_norm": 0.2064104974269867, "learning_rate": 1.6259796943197288e-05, "loss": 0.0383, "step": 186000 }, { "epoch": 1.3828720044584804, "grad_norm": 0.9154179096221924, "learning_rate": 1.62402421663564e-05, "loss": 0.0383, "step": 186100 }, { "epoch": 1.3836150845253576, "grad_norm": 0.7342627644538879, "learning_rate": 1.6220687389515514e-05, "loss": 0.0366, "step": 186200 }, { "epoch": 1.3843581645922347, "grad_norm": 0.4257078468799591, "learning_rate": 1.6201132612674625e-05, "loss": 0.0326, "step": 186300 }, { "epoch": 1.385101244659112, "grad_norm": 0.521092414855957, "learning_rate": 1.618157783583374e-05, "loss": 0.0376, "step": 186400 }, { "epoch": 1.3858443247259893, "grad_norm": 0.450666606426239, "learning_rate": 1.616202305899285e-05, "loss": 0.0384, "step": 186500 }, { "epoch": 1.3865874047928664, "grad_norm": 1.0016143321990967, "learning_rate": 1.6142468282151966e-05, "loss": 0.024, "step": 186600 }, { "epoch": 1.3873304848597436, "grad_norm": 0.6545040607452393, "learning_rate": 1.6122913505311078e-05, "loss": 0.0344, "step": 186700 }, { "epoch": 1.388073564926621, "grad_norm": 0.35555803775787354, "learning_rate": 1.6103358728470192e-05, "loss": 0.0358, "step": 186800 }, { "epoch": 1.388816644993498, "grad_norm": 0.6628378629684448, "learning_rate": 1.6083803951629307e-05, "loss": 0.0376, "step": 186900 }, { "epoch": 1.3895597250603753, "grad_norm": 0.586877167224884, "learning_rate": 1.606424917478842e-05, "loss": 0.0284, "step": 187000 }, { "epoch": 1.3903028051272526, "grad_norm": 0.1754978895187378, "learning_rate": 1.6044694397947533e-05, "loss": 0.0321, "step": 187100 }, { "epoch": 1.3910458851941296, "grad_norm": 0.3012150824069977, "learning_rate": 1.6025139621106645e-05, "loss": 0.0391, "step": 187200 }, { "epoch": 1.391788965261007, "grad_norm": 0.38872581720352173, "learning_rate": 1.600558484426576e-05, "loss": 0.0391, "step": 187300 }, { "epoch": 1.3925320453278842, "grad_norm": 0.4939943850040436, "learning_rate": 1.598603006742487e-05, "loss": 0.0274, "step": 187400 }, { "epoch": 1.3932751253947613, "grad_norm": 0.5715798139572144, "learning_rate": 1.5966475290583985e-05, "loss": 0.0301, "step": 187500 }, { "epoch": 1.3940182054616386, "grad_norm": 0.4019854664802551, "learning_rate": 1.5946920513743097e-05, "loss": 0.0338, "step": 187600 }, { "epoch": 1.3947612855285156, "grad_norm": 0.4715663492679596, "learning_rate": 1.592736573690221e-05, "loss": 0.0338, "step": 187700 }, { "epoch": 1.395504365595393, "grad_norm": 0.24107222259044647, "learning_rate": 1.5907810960061326e-05, "loss": 0.0277, "step": 187800 }, { "epoch": 1.39624744566227, "grad_norm": 1.8958635330200195, "learning_rate": 1.5888256183220438e-05, "loss": 0.0301, "step": 187900 }, { "epoch": 1.3969905257291473, "grad_norm": 1.3319509029388428, "learning_rate": 1.5868701406379553e-05, "loss": 0.0328, "step": 188000 }, { "epoch": 1.3977336057960246, "grad_norm": 1.1139823198318481, "learning_rate": 1.5849146629538664e-05, "loss": 0.042, "step": 188100 }, { "epoch": 1.3984766858629016, "grad_norm": 0.7081483006477356, "learning_rate": 1.582959185269778e-05, "loss": 0.036, "step": 188200 }, { "epoch": 1.399219765929779, "grad_norm": 0.3640769422054291, "learning_rate": 1.581003707585689e-05, "loss": 0.0371, "step": 188300 }, { "epoch": 1.3999628459966562, "grad_norm": 0.5468807220458984, "learning_rate": 1.5790482299016005e-05, "loss": 0.0324, "step": 188400 }, { "epoch": 1.4007059260635333, "grad_norm": 0.15133748948574066, "learning_rate": 1.577092752217512e-05, "loss": 0.0377, "step": 188500 }, { "epoch": 1.4014490061304106, "grad_norm": 0.804024338722229, "learning_rate": 1.575137274533423e-05, "loss": 0.0284, "step": 188600 }, { "epoch": 1.4021920861972879, "grad_norm": 0.49157822132110596, "learning_rate": 1.5731817968493346e-05, "loss": 0.0382, "step": 188700 }, { "epoch": 1.402935166264165, "grad_norm": 0.22933423519134521, "learning_rate": 1.5712263191652457e-05, "loss": 0.0307, "step": 188800 }, { "epoch": 1.4036782463310422, "grad_norm": 0.4661417603492737, "learning_rate": 1.5692708414811572e-05, "loss": 0.0511, "step": 188900 }, { "epoch": 1.4044213263979195, "grad_norm": 0.9438570141792297, "learning_rate": 1.5673153637970683e-05, "loss": 0.036, "step": 189000 }, { "epoch": 1.4051644064647966, "grad_norm": 0.5423058271408081, "learning_rate": 1.5653598861129798e-05, "loss": 0.0358, "step": 189100 }, { "epoch": 1.4059074865316739, "grad_norm": 0.8303951025009155, "learning_rate": 1.563404408428891e-05, "loss": 0.0335, "step": 189200 }, { "epoch": 1.406650566598551, "grad_norm": 0.3806622624397278, "learning_rate": 1.5614489307448024e-05, "loss": 0.0396, "step": 189300 }, { "epoch": 1.4073936466654282, "grad_norm": 0.5279279947280884, "learning_rate": 1.559493453060714e-05, "loss": 0.0288, "step": 189400 }, { "epoch": 1.4081367267323053, "grad_norm": 0.5468716025352478, "learning_rate": 1.557537975376625e-05, "loss": 0.0369, "step": 189500 }, { "epoch": 1.4088798067991826, "grad_norm": 0.5955812931060791, "learning_rate": 1.5555824976925365e-05, "loss": 0.0385, "step": 189600 }, { "epoch": 1.4096228868660599, "grad_norm": 1.9238618612289429, "learning_rate": 1.5536270200084476e-05, "loss": 0.0318, "step": 189700 }, { "epoch": 1.410365966932937, "grad_norm": 0.18525336682796478, "learning_rate": 1.551671542324359e-05, "loss": 0.0322, "step": 189800 }, { "epoch": 1.4111090469998142, "grad_norm": 0.7993316650390625, "learning_rate": 1.5497160646402703e-05, "loss": 0.0299, "step": 189900 }, { "epoch": 1.4118521270666915, "grad_norm": 3.348496198654175, "learning_rate": 1.5477605869561817e-05, "loss": 0.0334, "step": 190000 }, { "epoch": 1.4125952071335686, "grad_norm": 0.2934270203113556, "learning_rate": 1.545805109272093e-05, "loss": 0.0349, "step": 190100 }, { "epoch": 1.4133382872004459, "grad_norm": 0.1868973672389984, "learning_rate": 1.5438496315880043e-05, "loss": 0.0302, "step": 190200 }, { "epoch": 1.4140813672673231, "grad_norm": 0.8134732246398926, "learning_rate": 1.5418941539039158e-05, "loss": 0.0376, "step": 190300 }, { "epoch": 1.4148244473342002, "grad_norm": 0.41111162304878235, "learning_rate": 1.539938676219827e-05, "loss": 0.0243, "step": 190400 }, { "epoch": 1.4155675274010775, "grad_norm": 0.30801162123680115, "learning_rate": 1.5379831985357384e-05, "loss": 0.0328, "step": 190500 }, { "epoch": 1.4163106074679548, "grad_norm": 0.17193806171417236, "learning_rate": 1.5360277208516496e-05, "loss": 0.0377, "step": 190600 }, { "epoch": 1.4170536875348319, "grad_norm": 0.20271266996860504, "learning_rate": 1.534072243167561e-05, "loss": 0.0282, "step": 190700 }, { "epoch": 1.4177967676017091, "grad_norm": 0.655733048915863, "learning_rate": 1.5321167654834722e-05, "loss": 0.0342, "step": 190800 }, { "epoch": 1.4185398476685864, "grad_norm": 0.527825117111206, "learning_rate": 1.5301612877993837e-05, "loss": 0.0314, "step": 190900 }, { "epoch": 1.4192829277354635, "grad_norm": 0.5957244634628296, "learning_rate": 1.5282058101152948e-05, "loss": 0.0295, "step": 191000 }, { "epoch": 1.4200260078023408, "grad_norm": 0.7220511436462402, "learning_rate": 1.5262503324312063e-05, "loss": 0.0401, "step": 191100 }, { "epoch": 1.4207690878692179, "grad_norm": 2.0191807746887207, "learning_rate": 1.5242948547471178e-05, "loss": 0.0396, "step": 191200 }, { "epoch": 1.4215121679360951, "grad_norm": 0.5304579734802246, "learning_rate": 1.522339377063029e-05, "loss": 0.0304, "step": 191300 }, { "epoch": 1.4222552480029722, "grad_norm": 0.5049948692321777, "learning_rate": 1.5203838993789404e-05, "loss": 0.039, "step": 191400 }, { "epoch": 1.4229983280698495, "grad_norm": 0.39291173219680786, "learning_rate": 1.5184284216948517e-05, "loss": 0.0315, "step": 191500 }, { "epoch": 1.4237414081367268, "grad_norm": 0.778049111366272, "learning_rate": 1.516472944010763e-05, "loss": 0.0351, "step": 191600 }, { "epoch": 1.4244844882036038, "grad_norm": 3.893615484237671, "learning_rate": 1.5145174663266743e-05, "loss": 0.0349, "step": 191700 }, { "epoch": 1.4252275682704811, "grad_norm": 0.11674647033214569, "learning_rate": 1.5125619886425858e-05, "loss": 0.0387, "step": 191800 }, { "epoch": 1.4259706483373584, "grad_norm": 0.35498952865600586, "learning_rate": 1.510606510958497e-05, "loss": 0.0291, "step": 191900 }, { "epoch": 1.4267137284042355, "grad_norm": 2.2891650199890137, "learning_rate": 1.5086510332744084e-05, "loss": 0.032, "step": 192000 }, { "epoch": 1.4274568084711128, "grad_norm": 1.0489330291748047, "learning_rate": 1.5066955555903199e-05, "loss": 0.0287, "step": 192100 }, { "epoch": 1.42819988853799, "grad_norm": 0.1924586296081543, "learning_rate": 1.504740077906231e-05, "loss": 0.031, "step": 192200 }, { "epoch": 1.4289429686048671, "grad_norm": 0.43390461802482605, "learning_rate": 1.5027846002221425e-05, "loss": 0.0303, "step": 192300 }, { "epoch": 1.4296860486717444, "grad_norm": 0.14574852585792542, "learning_rate": 1.5008291225380536e-05, "loss": 0.045, "step": 192400 }, { "epoch": 1.4304291287386217, "grad_norm": 0.7610241174697876, "learning_rate": 1.4988736448539651e-05, "loss": 0.0315, "step": 192500 }, { "epoch": 1.4311722088054988, "grad_norm": 0.3230319917201996, "learning_rate": 1.4969181671698762e-05, "loss": 0.0297, "step": 192600 }, { "epoch": 1.431915288872376, "grad_norm": 0.40001967549324036, "learning_rate": 1.4949626894857877e-05, "loss": 0.0385, "step": 192700 }, { "epoch": 1.4326583689392531, "grad_norm": 0.5088039040565491, "learning_rate": 1.4930072118016992e-05, "loss": 0.0353, "step": 192800 }, { "epoch": 1.4334014490061304, "grad_norm": 0.39068347215652466, "learning_rate": 1.4910517341176103e-05, "loss": 0.0337, "step": 192900 }, { "epoch": 1.4341445290730075, "grad_norm": 2.497997283935547, "learning_rate": 1.4890962564335218e-05, "loss": 0.0404, "step": 193000 }, { "epoch": 1.4348876091398848, "grad_norm": 2.6104018688201904, "learning_rate": 1.487140778749433e-05, "loss": 0.0336, "step": 193100 }, { "epoch": 1.435630689206762, "grad_norm": 0.8740593194961548, "learning_rate": 1.4851853010653444e-05, "loss": 0.0341, "step": 193200 }, { "epoch": 1.4363737692736391, "grad_norm": 0.32780376076698303, "learning_rate": 1.4832298233812555e-05, "loss": 0.0322, "step": 193300 }, { "epoch": 1.4371168493405164, "grad_norm": 1.683281660079956, "learning_rate": 1.481274345697167e-05, "loss": 0.0351, "step": 193400 }, { "epoch": 1.4378599294073937, "grad_norm": 0.5255507230758667, "learning_rate": 1.4793188680130782e-05, "loss": 0.0378, "step": 193500 }, { "epoch": 1.4386030094742708, "grad_norm": 0.280023992061615, "learning_rate": 1.4773633903289896e-05, "loss": 0.0313, "step": 193600 }, { "epoch": 1.439346089541148, "grad_norm": 0.23651553690433502, "learning_rate": 1.4754079126449011e-05, "loss": 0.0376, "step": 193700 }, { "epoch": 1.4400891696080254, "grad_norm": 1.5815614461898804, "learning_rate": 1.4734524349608122e-05, "loss": 0.0299, "step": 193800 }, { "epoch": 1.4408322496749024, "grad_norm": 0.32355111837387085, "learning_rate": 1.4714969572767237e-05, "loss": 0.0279, "step": 193900 }, { "epoch": 1.4415753297417797, "grad_norm": 0.6366586685180664, "learning_rate": 1.4695414795926349e-05, "loss": 0.0453, "step": 194000 }, { "epoch": 1.442318409808657, "grad_norm": 0.6759881377220154, "learning_rate": 1.4675860019085463e-05, "loss": 0.0325, "step": 194100 }, { "epoch": 1.443061489875534, "grad_norm": 0.42459043860435486, "learning_rate": 1.4656305242244575e-05, "loss": 0.0327, "step": 194200 }, { "epoch": 1.4438045699424114, "grad_norm": 0.27208536863327026, "learning_rate": 1.463675046540369e-05, "loss": 0.0298, "step": 194300 }, { "epoch": 1.4445476500092884, "grad_norm": 0.8621758222579956, "learning_rate": 1.4617195688562804e-05, "loss": 0.0328, "step": 194400 }, { "epoch": 1.4452907300761657, "grad_norm": 0.1492447406053543, "learning_rate": 1.4597640911721916e-05, "loss": 0.0351, "step": 194500 }, { "epoch": 1.4460338101430428, "grad_norm": 0.21552444994449615, "learning_rate": 1.457808613488103e-05, "loss": 0.0382, "step": 194600 }, { "epoch": 1.44677689020992, "grad_norm": 0.133006289601326, "learning_rate": 1.4558531358040142e-05, "loss": 0.0298, "step": 194700 }, { "epoch": 1.4475199702767974, "grad_norm": 0.40657472610473633, "learning_rate": 1.4538976581199257e-05, "loss": 0.0336, "step": 194800 }, { "epoch": 1.4482630503436744, "grad_norm": 0.6210281252861023, "learning_rate": 1.4519421804358368e-05, "loss": 0.0378, "step": 194900 }, { "epoch": 1.4490061304105517, "grad_norm": 0.7319797873497009, "learning_rate": 1.4499867027517483e-05, "loss": 0.0333, "step": 195000 }, { "epoch": 1.449749210477429, "grad_norm": 9.143414497375488, "learning_rate": 1.4480312250676594e-05, "loss": 0.0332, "step": 195100 }, { "epoch": 1.450492290544306, "grad_norm": 0.24313585460186005, "learning_rate": 1.4460757473835709e-05, "loss": 0.0323, "step": 195200 }, { "epoch": 1.4512353706111834, "grad_norm": 0.2722693979740143, "learning_rate": 1.4441202696994824e-05, "loss": 0.0391, "step": 195300 }, { "epoch": 1.4519784506780606, "grad_norm": 0.5478352308273315, "learning_rate": 1.4421647920153935e-05, "loss": 0.0297, "step": 195400 }, { "epoch": 1.4527215307449377, "grad_norm": 0.5717867612838745, "learning_rate": 1.440209314331305e-05, "loss": 0.036, "step": 195500 }, { "epoch": 1.453464610811815, "grad_norm": 0.33797019720077515, "learning_rate": 1.4382538366472161e-05, "loss": 0.0387, "step": 195600 }, { "epoch": 1.4542076908786923, "grad_norm": 0.7965496182441711, "learning_rate": 1.4362983589631276e-05, "loss": 0.032, "step": 195700 }, { "epoch": 1.4549507709455693, "grad_norm": 0.45621341466903687, "learning_rate": 1.4343428812790389e-05, "loss": 0.0343, "step": 195800 }, { "epoch": 1.4556938510124466, "grad_norm": 0.18652664124965668, "learning_rate": 1.4323874035949502e-05, "loss": 0.033, "step": 195900 }, { "epoch": 1.456436931079324, "grad_norm": 0.2201235145330429, "learning_rate": 1.4304319259108615e-05, "loss": 0.0284, "step": 196000 }, { "epoch": 1.457180011146201, "grad_norm": 2.7237865924835205, "learning_rate": 1.4284764482267728e-05, "loss": 0.0353, "step": 196100 }, { "epoch": 1.4579230912130783, "grad_norm": 0.0933411717414856, "learning_rate": 1.4265209705426843e-05, "loss": 0.0354, "step": 196200 }, { "epoch": 1.4586661712799553, "grad_norm": 3.1912808418273926, "learning_rate": 1.4245654928585956e-05, "loss": 0.0306, "step": 196300 }, { "epoch": 1.4594092513468326, "grad_norm": 0.3228622376918793, "learning_rate": 1.4226100151745069e-05, "loss": 0.0397, "step": 196400 }, { "epoch": 1.4601523314137097, "grad_norm": 0.3650961220264435, "learning_rate": 1.4206545374904182e-05, "loss": 0.0372, "step": 196500 }, { "epoch": 1.460895411480587, "grad_norm": 0.7365734577178955, "learning_rate": 1.4186990598063297e-05, "loss": 0.0329, "step": 196600 }, { "epoch": 1.4616384915474643, "grad_norm": 0.6250199675559998, "learning_rate": 1.4167435821222408e-05, "loss": 0.0293, "step": 196700 }, { "epoch": 1.4623815716143413, "grad_norm": 0.3983941376209259, "learning_rate": 1.4147881044381523e-05, "loss": 0.0313, "step": 196800 }, { "epoch": 1.4631246516812186, "grad_norm": 2.081169366836548, "learning_rate": 1.4128326267540634e-05, "loss": 0.0367, "step": 196900 }, { "epoch": 1.463867731748096, "grad_norm": 0.1586562842130661, "learning_rate": 1.410877149069975e-05, "loss": 0.0356, "step": 197000 }, { "epoch": 1.464610811814973, "grad_norm": 0.7490348219871521, "learning_rate": 1.4089216713858864e-05, "loss": 0.0381, "step": 197100 }, { "epoch": 1.4653538918818503, "grad_norm": 0.4885406792163849, "learning_rate": 1.4069661937017975e-05, "loss": 0.0311, "step": 197200 }, { "epoch": 1.4660969719487276, "grad_norm": 1.4064565896987915, "learning_rate": 1.405010716017709e-05, "loss": 0.0331, "step": 197300 }, { "epoch": 1.4668400520156046, "grad_norm": 0.7598900198936462, "learning_rate": 1.4030552383336201e-05, "loss": 0.0307, "step": 197400 }, { "epoch": 1.467583132082482, "grad_norm": 0.747443437576294, "learning_rate": 1.4010997606495316e-05, "loss": 0.0463, "step": 197500 }, { "epoch": 1.4683262121493592, "grad_norm": 1.3681015968322754, "learning_rate": 1.3991442829654428e-05, "loss": 0.0342, "step": 197600 }, { "epoch": 1.4690692922162363, "grad_norm": 0.4226016104221344, "learning_rate": 1.3971888052813542e-05, "loss": 0.035, "step": 197700 }, { "epoch": 1.4698123722831136, "grad_norm": 1.4263702630996704, "learning_rate": 1.3952333275972657e-05, "loss": 0.032, "step": 197800 }, { "epoch": 1.4705554523499906, "grad_norm": 0.5994787812232971, "learning_rate": 1.3932778499131769e-05, "loss": 0.0277, "step": 197900 }, { "epoch": 1.471298532416868, "grad_norm": 0.511102020740509, "learning_rate": 1.3913223722290883e-05, "loss": 0.0372, "step": 198000 }, { "epoch": 1.472041612483745, "grad_norm": 0.9975734353065491, "learning_rate": 1.3893668945449995e-05, "loss": 0.0312, "step": 198100 }, { "epoch": 1.4727846925506223, "grad_norm": 0.4357076585292816, "learning_rate": 1.387411416860911e-05, "loss": 0.0357, "step": 198200 }, { "epoch": 1.4735277726174996, "grad_norm": 0.5865920782089233, "learning_rate": 1.385455939176822e-05, "loss": 0.0333, "step": 198300 }, { "epoch": 1.4742708526843766, "grad_norm": 0.9975289106369019, "learning_rate": 1.3835004614927336e-05, "loss": 0.0313, "step": 198400 }, { "epoch": 1.475013932751254, "grad_norm": 0.3658576011657715, "learning_rate": 1.3815449838086447e-05, "loss": 0.0418, "step": 198500 }, { "epoch": 1.4757570128181312, "grad_norm": 0.40709537267684937, "learning_rate": 1.3795895061245562e-05, "loss": 0.0349, "step": 198600 }, { "epoch": 1.4765000928850083, "grad_norm": 0.6367713212966919, "learning_rate": 1.3776340284404676e-05, "loss": 0.0331, "step": 198700 }, { "epoch": 1.4772431729518856, "grad_norm": 0.8178965449333191, "learning_rate": 1.3756785507563788e-05, "loss": 0.0372, "step": 198800 }, { "epoch": 1.4779862530187629, "grad_norm": 0.15861351788043976, "learning_rate": 1.3737230730722903e-05, "loss": 0.0336, "step": 198900 }, { "epoch": 1.47872933308564, "grad_norm": 0.20659239590168, "learning_rate": 1.3717675953882014e-05, "loss": 0.0347, "step": 199000 }, { "epoch": 1.4794724131525172, "grad_norm": 1.51454758644104, "learning_rate": 1.3698121177041129e-05, "loss": 0.038, "step": 199100 }, { "epoch": 1.4802154932193945, "grad_norm": 0.2850351333618164, "learning_rate": 1.367856640020024e-05, "loss": 0.0281, "step": 199200 }, { "epoch": 1.4809585732862716, "grad_norm": 0.13631965219974518, "learning_rate": 1.3659011623359355e-05, "loss": 0.0349, "step": 199300 }, { "epoch": 1.4817016533531489, "grad_norm": 0.8175708651542664, "learning_rate": 1.3639456846518466e-05, "loss": 0.0403, "step": 199400 }, { "epoch": 1.4824447334200261, "grad_norm": 0.8908068537712097, "learning_rate": 1.3619902069677581e-05, "loss": 0.0427, "step": 199500 }, { "epoch": 1.4831878134869032, "grad_norm": 0.25155386328697205, "learning_rate": 1.3600347292836696e-05, "loss": 0.0385, "step": 199600 }, { "epoch": 1.4839308935537805, "grad_norm": 0.8904440999031067, "learning_rate": 1.3580792515995807e-05, "loss": 0.0336, "step": 199700 }, { "epoch": 1.4846739736206576, "grad_norm": 0.2021860033273697, "learning_rate": 1.3561237739154922e-05, "loss": 0.0285, "step": 199800 }, { "epoch": 1.4854170536875348, "grad_norm": 2.6852476596832275, "learning_rate": 1.3541682962314033e-05, "loss": 0.0335, "step": 199900 }, { "epoch": 1.486160133754412, "grad_norm": 0.31416624784469604, "learning_rate": 1.3522128185473148e-05, "loss": 0.0374, "step": 200000 }, { "epoch": 1.4869032138212892, "grad_norm": 0.3563138246536255, "learning_rate": 1.350257340863226e-05, "loss": 0.0301, "step": 200100 }, { "epoch": 1.4876462938881665, "grad_norm": 0.25641533732414246, "learning_rate": 1.3483018631791374e-05, "loss": 0.0342, "step": 200200 }, { "epoch": 1.4883893739550436, "grad_norm": 0.7021012306213379, "learning_rate": 1.3463463854950487e-05, "loss": 0.0342, "step": 200300 }, { "epoch": 1.4891324540219208, "grad_norm": 0.43270057439804077, "learning_rate": 1.34439090781096e-05, "loss": 0.0343, "step": 200400 }, { "epoch": 1.4898755340887981, "grad_norm": 0.088007353246212, "learning_rate": 1.3424354301268715e-05, "loss": 0.046, "step": 200500 }, { "epoch": 1.4906186141556752, "grad_norm": 0.3133363127708435, "learning_rate": 1.3404799524427828e-05, "loss": 0.0349, "step": 200600 }, { "epoch": 1.4913616942225525, "grad_norm": 0.6350749135017395, "learning_rate": 1.3385244747586941e-05, "loss": 0.0313, "step": 200700 }, { "epoch": 1.4921047742894298, "grad_norm": 2.069354295730591, "learning_rate": 1.3365689970746054e-05, "loss": 0.0298, "step": 200800 }, { "epoch": 1.4928478543563068, "grad_norm": 0.22212301194667816, "learning_rate": 1.3346135193905167e-05, "loss": 0.0398, "step": 200900 }, { "epoch": 1.4935909344231841, "grad_norm": 3.0023181438446045, "learning_rate": 1.332658041706428e-05, "loss": 0.0319, "step": 201000 }, { "epoch": 1.4943340144900614, "grad_norm": 0.22962692379951477, "learning_rate": 1.3307025640223395e-05, "loss": 0.0352, "step": 201100 }, { "epoch": 1.4950770945569385, "grad_norm": 0.6137348413467407, "learning_rate": 1.3287470863382508e-05, "loss": 0.0307, "step": 201200 }, { "epoch": 1.4958201746238158, "grad_norm": 0.5768724083900452, "learning_rate": 1.3267916086541621e-05, "loss": 0.0414, "step": 201300 }, { "epoch": 1.4965632546906928, "grad_norm": 0.46560370922088623, "learning_rate": 1.3248361309700734e-05, "loss": 0.0337, "step": 201400 }, { "epoch": 1.4973063347575701, "grad_norm": 0.4755012094974518, "learning_rate": 1.3228806532859848e-05, "loss": 0.0332, "step": 201500 }, { "epoch": 1.4980494148244472, "grad_norm": 3.7708382606506348, "learning_rate": 1.3209251756018962e-05, "loss": 0.0311, "step": 201600 }, { "epoch": 1.4987924948913245, "grad_norm": 0.21414272487163544, "learning_rate": 1.3189696979178074e-05, "loss": 0.0316, "step": 201700 }, { "epoch": 1.4995355749582018, "grad_norm": 0.35750484466552734, "learning_rate": 1.3170142202337188e-05, "loss": 0.0325, "step": 201800 }, { "epoch": 1.5002786550250788, "grad_norm": 0.18722094595432281, "learning_rate": 1.31505874254963e-05, "loss": 0.0319, "step": 201900 }, { "epoch": 1.5010217350919561, "grad_norm": 0.23407875001430511, "learning_rate": 1.3131032648655415e-05, "loss": 0.0338, "step": 202000 }, { "epoch": 1.5017648151588334, "grad_norm": 6.957334518432617, "learning_rate": 1.311147787181453e-05, "loss": 0.0342, "step": 202100 }, { "epoch": 1.5025078952257105, "grad_norm": 0.551520049571991, "learning_rate": 1.309192309497364e-05, "loss": 0.0312, "step": 202200 }, { "epoch": 1.5032509752925878, "grad_norm": 0.7012757062911987, "learning_rate": 1.3072368318132756e-05, "loss": 0.0327, "step": 202300 }, { "epoch": 1.503994055359465, "grad_norm": 0.41721203923225403, "learning_rate": 1.3052813541291867e-05, "loss": 0.0362, "step": 202400 }, { "epoch": 1.5047371354263421, "grad_norm": 0.5778596997261047, "learning_rate": 1.3033258764450982e-05, "loss": 0.0344, "step": 202500 }, { "epoch": 1.5054802154932194, "grad_norm": 0.22285187244415283, "learning_rate": 1.3013703987610093e-05, "loss": 0.0361, "step": 202600 }, { "epoch": 1.5062232955600967, "grad_norm": 0.4399549961090088, "learning_rate": 1.2994149210769208e-05, "loss": 0.038, "step": 202700 }, { "epoch": 1.5069663756269738, "grad_norm": 0.46925681829452515, "learning_rate": 1.297459443392832e-05, "loss": 0.0431, "step": 202800 }, { "epoch": 1.507709455693851, "grad_norm": 0.5276687145233154, "learning_rate": 1.2955039657087434e-05, "loss": 0.0351, "step": 202900 }, { "epoch": 1.5084525357607284, "grad_norm": 2.5348763465881348, "learning_rate": 1.2935484880246549e-05, "loss": 0.0291, "step": 203000 }, { "epoch": 1.5091956158276054, "grad_norm": 0.5051155090332031, "learning_rate": 1.291593010340566e-05, "loss": 0.0327, "step": 203100 }, { "epoch": 1.5099386958944825, "grad_norm": 0.2525690495967865, "learning_rate": 1.2896375326564775e-05, "loss": 0.0316, "step": 203200 }, { "epoch": 1.51068177596136, "grad_norm": 2.0653040409088135, "learning_rate": 1.2876820549723886e-05, "loss": 0.0355, "step": 203300 }, { "epoch": 1.511424856028237, "grad_norm": 0.7613476514816284, "learning_rate": 1.2857265772883001e-05, "loss": 0.0319, "step": 203400 }, { "epoch": 1.5121679360951141, "grad_norm": 0.5634822249412537, "learning_rate": 1.2837710996042112e-05, "loss": 0.0366, "step": 203500 }, { "epoch": 1.5129110161619914, "grad_norm": 0.2400141954421997, "learning_rate": 1.2818156219201227e-05, "loss": 0.0363, "step": 203600 }, { "epoch": 1.5136540962288687, "grad_norm": 0.6968608498573303, "learning_rate": 1.2798601442360342e-05, "loss": 0.0309, "step": 203700 }, { "epoch": 1.5143971762957458, "grad_norm": 0.3499423861503601, "learning_rate": 1.2779046665519453e-05, "loss": 0.0311, "step": 203800 }, { "epoch": 1.515140256362623, "grad_norm": 0.9831475615501404, "learning_rate": 1.2759491888678568e-05, "loss": 0.0411, "step": 203900 }, { "epoch": 1.5158833364295003, "grad_norm": 0.6434984803199768, "learning_rate": 1.273993711183768e-05, "loss": 0.0278, "step": 204000 }, { "epoch": 1.5166264164963774, "grad_norm": 0.3780430555343628, "learning_rate": 1.2720382334996794e-05, "loss": 0.036, "step": 204100 }, { "epoch": 1.5173694965632547, "grad_norm": 0.6516414880752563, "learning_rate": 1.2700827558155906e-05, "loss": 0.0358, "step": 204200 }, { "epoch": 1.518112576630132, "grad_norm": 1.0250896215438843, "learning_rate": 1.268127278131502e-05, "loss": 0.0283, "step": 204300 }, { "epoch": 1.518855656697009, "grad_norm": 0.6152116060256958, "learning_rate": 1.2661718004474132e-05, "loss": 0.0369, "step": 204400 }, { "epoch": 1.5195987367638863, "grad_norm": 0.5634803771972656, "learning_rate": 1.2642163227633246e-05, "loss": 0.0297, "step": 204500 }, { "epoch": 1.5203418168307636, "grad_norm": 1.022150993347168, "learning_rate": 1.2622608450792361e-05, "loss": 0.0289, "step": 204600 }, { "epoch": 1.5210848968976407, "grad_norm": 1.8144506216049194, "learning_rate": 1.2603053673951473e-05, "loss": 0.0339, "step": 204700 }, { "epoch": 1.5218279769645178, "grad_norm": 0.6270276308059692, "learning_rate": 1.2583498897110587e-05, "loss": 0.0258, "step": 204800 }, { "epoch": 1.5225710570313953, "grad_norm": 0.6363964676856995, "learning_rate": 1.2563944120269699e-05, "loss": 0.0314, "step": 204900 }, { "epoch": 1.5233141370982723, "grad_norm": 0.21885718405246735, "learning_rate": 1.2544389343428814e-05, "loss": 0.0282, "step": 205000 }, { "epoch": 1.5240572171651494, "grad_norm": 0.21349987387657166, "learning_rate": 1.2524834566587927e-05, "loss": 0.0389, "step": 205100 }, { "epoch": 1.5248002972320267, "grad_norm": 2.5103936195373535, "learning_rate": 1.250527978974704e-05, "loss": 0.0355, "step": 205200 }, { "epoch": 1.525543377298904, "grad_norm": 0.3932492136955261, "learning_rate": 1.2485725012906153e-05, "loss": 0.0328, "step": 205300 }, { "epoch": 1.526286457365781, "grad_norm": 0.5239884257316589, "learning_rate": 1.2466170236065266e-05, "loss": 0.03, "step": 205400 }, { "epoch": 1.5270295374326583, "grad_norm": 0.22543790936470032, "learning_rate": 1.244661545922438e-05, "loss": 0.033, "step": 205500 }, { "epoch": 1.5277726174995356, "grad_norm": 0.17974628508090973, "learning_rate": 1.2427060682383494e-05, "loss": 0.0406, "step": 205600 }, { "epoch": 1.5285156975664127, "grad_norm": 0.3779504597187042, "learning_rate": 1.2407505905542607e-05, "loss": 0.032, "step": 205700 }, { "epoch": 1.52925877763329, "grad_norm": 0.663779079914093, "learning_rate": 1.238795112870172e-05, "loss": 0.0281, "step": 205800 }, { "epoch": 1.5300018577001673, "grad_norm": 0.6207916140556335, "learning_rate": 1.2368396351860833e-05, "loss": 0.0269, "step": 205900 }, { "epoch": 1.5307449377670443, "grad_norm": 0.46305808424949646, "learning_rate": 1.2348841575019948e-05, "loss": 0.0324, "step": 206000 }, { "epoch": 1.5314880178339216, "grad_norm": 0.8329316973686218, "learning_rate": 1.232928679817906e-05, "loss": 0.0338, "step": 206100 }, { "epoch": 1.532231097900799, "grad_norm": 1.0051038265228271, "learning_rate": 1.2309732021338174e-05, "loss": 0.0377, "step": 206200 }, { "epoch": 1.532974177967676, "grad_norm": 0.32019472122192383, "learning_rate": 1.2290177244497287e-05, "loss": 0.0365, "step": 206300 }, { "epoch": 1.5337172580345533, "grad_norm": 0.3316141366958618, "learning_rate": 1.22706224676564e-05, "loss": 0.034, "step": 206400 }, { "epoch": 1.5344603381014306, "grad_norm": 0.7807109355926514, "learning_rate": 1.2251067690815513e-05, "loss": 0.0432, "step": 206500 }, { "epoch": 1.5352034181683076, "grad_norm": 0.3356735408306122, "learning_rate": 1.2231512913974626e-05, "loss": 0.0366, "step": 206600 }, { "epoch": 1.5359464982351847, "grad_norm": 0.4753335416316986, "learning_rate": 1.221195813713374e-05, "loss": 0.0328, "step": 206700 }, { "epoch": 1.5366895783020622, "grad_norm": 0.47309792041778564, "learning_rate": 1.2192403360292854e-05, "loss": 0.0298, "step": 206800 }, { "epoch": 1.5374326583689393, "grad_norm": 1.0630908012390137, "learning_rate": 1.2172848583451967e-05, "loss": 0.0361, "step": 206900 }, { "epoch": 1.5381757384358163, "grad_norm": 0.6851255893707275, "learning_rate": 1.215329380661108e-05, "loss": 0.0311, "step": 207000 }, { "epoch": 1.5389188185026936, "grad_norm": 0.5446286201477051, "learning_rate": 1.2133739029770193e-05, "loss": 0.038, "step": 207100 }, { "epoch": 1.539661898569571, "grad_norm": 0.4492778778076172, "learning_rate": 1.2114184252929306e-05, "loss": 0.0281, "step": 207200 }, { "epoch": 1.540404978636448, "grad_norm": 0.54234778881073, "learning_rate": 1.209462947608842e-05, "loss": 0.0321, "step": 207300 }, { "epoch": 1.5411480587033253, "grad_norm": 1.7295585870742798, "learning_rate": 1.2075074699247532e-05, "loss": 0.0296, "step": 207400 }, { "epoch": 1.5418911387702026, "grad_norm": 0.812653124332428, "learning_rate": 1.2055519922406647e-05, "loss": 0.039, "step": 207500 }, { "epoch": 1.5426342188370796, "grad_norm": 0.35901162028312683, "learning_rate": 1.203596514556576e-05, "loss": 0.0373, "step": 207600 }, { "epoch": 1.543377298903957, "grad_norm": 0.4846644699573517, "learning_rate": 1.2016410368724873e-05, "loss": 0.0332, "step": 207700 }, { "epoch": 1.5441203789708342, "grad_norm": 0.10293322801589966, "learning_rate": 1.1996855591883986e-05, "loss": 0.0305, "step": 207800 }, { "epoch": 1.5448634590377113, "grad_norm": 0.526848554611206, "learning_rate": 1.19773008150431e-05, "loss": 0.0322, "step": 207900 }, { "epoch": 1.5456065391045886, "grad_norm": 0.5220353603363037, "learning_rate": 1.1957746038202212e-05, "loss": 0.0319, "step": 208000 }, { "epoch": 1.5463496191714659, "grad_norm": 0.022422855719923973, "learning_rate": 1.1938191261361325e-05, "loss": 0.031, "step": 208100 }, { "epoch": 1.547092699238343, "grad_norm": 0.7166577577590942, "learning_rate": 1.1918636484520439e-05, "loss": 0.0308, "step": 208200 }, { "epoch": 1.54783577930522, "grad_norm": 1.3335756063461304, "learning_rate": 1.1899081707679552e-05, "loss": 0.0322, "step": 208300 }, { "epoch": 1.5485788593720975, "grad_norm": 0.33215221762657166, "learning_rate": 1.1879526930838666e-05, "loss": 0.0351, "step": 208400 }, { "epoch": 1.5493219394389746, "grad_norm": 0.3280538320541382, "learning_rate": 1.185997215399778e-05, "loss": 0.0301, "step": 208500 }, { "epoch": 1.5500650195058516, "grad_norm": 0.18732166290283203, "learning_rate": 1.1840417377156893e-05, "loss": 0.0311, "step": 208600 }, { "epoch": 1.550808099572729, "grad_norm": 1.516517996788025, "learning_rate": 1.1820862600316006e-05, "loss": 0.0335, "step": 208700 }, { "epoch": 1.5515511796396062, "grad_norm": 0.12009353190660477, "learning_rate": 1.1801307823475119e-05, "loss": 0.0341, "step": 208800 }, { "epoch": 1.5522942597064833, "grad_norm": 0.5942254662513733, "learning_rate": 1.1781753046634232e-05, "loss": 0.0297, "step": 208900 }, { "epoch": 1.5530373397733606, "grad_norm": 10.711411476135254, "learning_rate": 1.1762198269793345e-05, "loss": 0.0395, "step": 209000 }, { "epoch": 1.5537804198402378, "grad_norm": 0.17376986145973206, "learning_rate": 1.1742643492952458e-05, "loss": 0.0354, "step": 209100 }, { "epoch": 1.554523499907115, "grad_norm": 1.3953337669372559, "learning_rate": 1.1723088716111573e-05, "loss": 0.0321, "step": 209200 }, { "epoch": 1.5552665799739922, "grad_norm": 0.7380355596542358, "learning_rate": 1.1703533939270686e-05, "loss": 0.0354, "step": 209300 }, { "epoch": 1.5560096600408695, "grad_norm": 3.625545024871826, "learning_rate": 1.1683979162429799e-05, "loss": 0.0398, "step": 209400 }, { "epoch": 1.5567527401077466, "grad_norm": 0.3812175393104553, "learning_rate": 1.1664424385588912e-05, "loss": 0.0336, "step": 209500 }, { "epoch": 1.5574958201746238, "grad_norm": 0.35583576560020447, "learning_rate": 1.1644869608748025e-05, "loss": 0.0369, "step": 209600 }, { "epoch": 1.5582389002415011, "grad_norm": 0.788577139377594, "learning_rate": 1.1625314831907138e-05, "loss": 0.0407, "step": 209700 }, { "epoch": 1.5589819803083782, "grad_norm": 0.24805399775505066, "learning_rate": 1.1605760055066251e-05, "loss": 0.0292, "step": 209800 }, { "epoch": 1.5597250603752553, "grad_norm": 0.5646881461143494, "learning_rate": 1.1586205278225364e-05, "loss": 0.0326, "step": 209900 }, { "epoch": 1.5604681404421328, "grad_norm": 0.8049068450927734, "learning_rate": 1.1566650501384479e-05, "loss": 0.0335, "step": 210000 }, { "epoch": 1.5612112205090098, "grad_norm": 0.4467696249485016, "learning_rate": 1.1547095724543592e-05, "loss": 0.0406, "step": 210100 }, { "epoch": 1.561954300575887, "grad_norm": 0.5871259570121765, "learning_rate": 1.1527540947702705e-05, "loss": 0.0314, "step": 210200 }, { "epoch": 1.5626973806427644, "grad_norm": 0.35758721828460693, "learning_rate": 1.1507986170861818e-05, "loss": 0.0359, "step": 210300 }, { "epoch": 1.5634404607096415, "grad_norm": 1.5732477903366089, "learning_rate": 1.1488431394020933e-05, "loss": 0.0305, "step": 210400 }, { "epoch": 1.5641835407765186, "grad_norm": 0.08280766755342484, "learning_rate": 1.1468876617180046e-05, "loss": 0.0352, "step": 210500 }, { "epoch": 1.5649266208433958, "grad_norm": 0.5083556771278381, "learning_rate": 1.1449321840339159e-05, "loss": 0.0335, "step": 210600 }, { "epoch": 1.5656697009102731, "grad_norm": 0.5105210542678833, "learning_rate": 1.1429767063498272e-05, "loss": 0.0309, "step": 210700 }, { "epoch": 1.5664127809771502, "grad_norm": 0.19146481156349182, "learning_rate": 1.1410212286657385e-05, "loss": 0.0375, "step": 210800 }, { "epoch": 1.5671558610440275, "grad_norm": 0.5215849876403809, "learning_rate": 1.13906575098165e-05, "loss": 0.0297, "step": 210900 }, { "epoch": 1.5678989411109048, "grad_norm": 0.5051828622817993, "learning_rate": 1.1371102732975613e-05, "loss": 0.0297, "step": 211000 }, { "epoch": 1.5686420211777818, "grad_norm": 0.7248202562332153, "learning_rate": 1.1351547956134726e-05, "loss": 0.0278, "step": 211100 }, { "epoch": 1.5693851012446591, "grad_norm": 0.20095330476760864, "learning_rate": 1.1331993179293839e-05, "loss": 0.0337, "step": 211200 }, { "epoch": 1.5701281813115364, "grad_norm": 0.24253825843334198, "learning_rate": 1.1312438402452952e-05, "loss": 0.0328, "step": 211300 }, { "epoch": 1.5708712613784135, "grad_norm": 0.7881041169166565, "learning_rate": 1.1292883625612065e-05, "loss": 0.0267, "step": 211400 }, { "epoch": 1.5716143414452908, "grad_norm": 0.2627990245819092, "learning_rate": 1.1273328848771178e-05, "loss": 0.0259, "step": 211500 }, { "epoch": 1.572357421512168, "grad_norm": 0.47700655460357666, "learning_rate": 1.1253774071930291e-05, "loss": 0.0336, "step": 211600 }, { "epoch": 1.5731005015790451, "grad_norm": 0.6608429551124573, "learning_rate": 1.1234219295089404e-05, "loss": 0.0348, "step": 211700 }, { "epoch": 1.5738435816459222, "grad_norm": 0.28046780824661255, "learning_rate": 1.121466451824852e-05, "loss": 0.0314, "step": 211800 }, { "epoch": 1.5745866617127997, "grad_norm": 0.6098657250404358, "learning_rate": 1.1195109741407632e-05, "loss": 0.0331, "step": 211900 }, { "epoch": 1.5753297417796768, "grad_norm": 2.0406243801116943, "learning_rate": 1.1175554964566745e-05, "loss": 0.0333, "step": 212000 }, { "epoch": 1.5760728218465538, "grad_norm": 1.0494341850280762, "learning_rate": 1.1156000187725858e-05, "loss": 0.0395, "step": 212100 }, { "epoch": 1.5768159019134311, "grad_norm": 0.2782084047794342, "learning_rate": 1.1136445410884972e-05, "loss": 0.0308, "step": 212200 }, { "epoch": 1.5775589819803084, "grad_norm": 1.4682798385620117, "learning_rate": 1.1116890634044085e-05, "loss": 0.0336, "step": 212300 }, { "epoch": 1.5783020620471855, "grad_norm": 2.0869123935699463, "learning_rate": 1.1097335857203198e-05, "loss": 0.0326, "step": 212400 }, { "epoch": 1.5790451421140628, "grad_norm": 0.2524487376213074, "learning_rate": 1.107778108036231e-05, "loss": 0.0309, "step": 212500 }, { "epoch": 1.57978822218094, "grad_norm": 0.4427799880504608, "learning_rate": 1.1058226303521426e-05, "loss": 0.0376, "step": 212600 }, { "epoch": 1.5805313022478171, "grad_norm": 0.6652727127075195, "learning_rate": 1.1038671526680539e-05, "loss": 0.0364, "step": 212700 }, { "epoch": 1.5812743823146944, "grad_norm": 1.929429054260254, "learning_rate": 1.1019116749839652e-05, "loss": 0.0345, "step": 212800 }, { "epoch": 1.5820174623815717, "grad_norm": 0.09449823200702667, "learning_rate": 1.0999561972998765e-05, "loss": 0.0377, "step": 212900 }, { "epoch": 1.5827605424484488, "grad_norm": 0.2608896493911743, "learning_rate": 1.0980007196157878e-05, "loss": 0.029, "step": 213000 }, { "epoch": 1.583503622515326, "grad_norm": 0.75107342004776, "learning_rate": 1.0960452419316991e-05, "loss": 0.0331, "step": 213100 }, { "epoch": 1.5842467025822033, "grad_norm": 0.09670861810445786, "learning_rate": 1.0940897642476104e-05, "loss": 0.0343, "step": 213200 }, { "epoch": 1.5849897826490804, "grad_norm": 0.7217620611190796, "learning_rate": 1.0921342865635217e-05, "loss": 0.0311, "step": 213300 }, { "epoch": 1.5857328627159575, "grad_norm": 1.5683146715164185, "learning_rate": 1.0901788088794332e-05, "loss": 0.0316, "step": 213400 }, { "epoch": 1.586475942782835, "grad_norm": 0.27975592017173767, "learning_rate": 1.0882233311953445e-05, "loss": 0.0357, "step": 213500 }, { "epoch": 1.587219022849712, "grad_norm": 0.05690145120024681, "learning_rate": 1.0862678535112558e-05, "loss": 0.0292, "step": 213600 }, { "epoch": 1.5879621029165891, "grad_norm": 0.07133837789297104, "learning_rate": 1.0843123758271671e-05, "loss": 0.032, "step": 213700 }, { "epoch": 1.5887051829834666, "grad_norm": 0.5935697555541992, "learning_rate": 1.0823568981430784e-05, "loss": 0.0453, "step": 213800 }, { "epoch": 1.5894482630503437, "grad_norm": 0.21532206237316132, "learning_rate": 1.0804014204589897e-05, "loss": 0.0281, "step": 213900 }, { "epoch": 1.5901913431172208, "grad_norm": 0.6321874856948853, "learning_rate": 1.078445942774901e-05, "loss": 0.0302, "step": 214000 }, { "epoch": 1.590934423184098, "grad_norm": 0.5863400101661682, "learning_rate": 1.0764904650908123e-05, "loss": 0.0347, "step": 214100 }, { "epoch": 1.5916775032509753, "grad_norm": 0.3978463113307953, "learning_rate": 1.0745349874067236e-05, "loss": 0.0308, "step": 214200 }, { "epoch": 1.5924205833178524, "grad_norm": 0.38246893882751465, "learning_rate": 1.0725795097226351e-05, "loss": 0.0318, "step": 214300 }, { "epoch": 1.5931636633847297, "grad_norm": 0.2000616490840912, "learning_rate": 1.0706240320385464e-05, "loss": 0.0308, "step": 214400 }, { "epoch": 1.593906743451607, "grad_norm": 4.636688232421875, "learning_rate": 1.0686685543544577e-05, "loss": 0.0368, "step": 214500 }, { "epoch": 1.594649823518484, "grad_norm": 0.4762311577796936, "learning_rate": 1.066713076670369e-05, "loss": 0.0297, "step": 214600 }, { "epoch": 1.5953929035853613, "grad_norm": 0.4820430874824524, "learning_rate": 1.0647575989862803e-05, "loss": 0.0355, "step": 214700 }, { "epoch": 1.5961359836522386, "grad_norm": 0.6721144318580627, "learning_rate": 1.0628021213021916e-05, "loss": 0.0312, "step": 214800 }, { "epoch": 1.5968790637191157, "grad_norm": 0.5090517997741699, "learning_rate": 1.0608466436181031e-05, "loss": 0.0342, "step": 214900 }, { "epoch": 1.597622143785993, "grad_norm": 0.7354902029037476, "learning_rate": 1.0588911659340144e-05, "loss": 0.0454, "step": 215000 }, { "epoch": 1.5983652238528703, "grad_norm": 0.8132868409156799, "learning_rate": 1.0569356882499257e-05, "loss": 0.0353, "step": 215100 }, { "epoch": 1.5991083039197473, "grad_norm": 0.36119136214256287, "learning_rate": 1.054980210565837e-05, "loss": 0.0254, "step": 215200 }, { "epoch": 1.5998513839866244, "grad_norm": 0.3722553849220276, "learning_rate": 1.0530247328817485e-05, "loss": 0.0391, "step": 215300 }, { "epoch": 1.600594464053502, "grad_norm": 0.5627496242523193, "learning_rate": 1.0510692551976598e-05, "loss": 0.0309, "step": 215400 }, { "epoch": 1.601337544120379, "grad_norm": 0.26267629861831665, "learning_rate": 1.0491137775135711e-05, "loss": 0.027, "step": 215500 }, { "epoch": 1.602080624187256, "grad_norm": 0.2158655971288681, "learning_rate": 1.0471582998294824e-05, "loss": 0.0382, "step": 215600 }, { "epoch": 1.6028237042541333, "grad_norm": 1.8511232137680054, "learning_rate": 1.0452028221453937e-05, "loss": 0.0276, "step": 215700 }, { "epoch": 1.6035667843210106, "grad_norm": 1.1381440162658691, "learning_rate": 1.043247344461305e-05, "loss": 0.0323, "step": 215800 }, { "epoch": 1.6043098643878877, "grad_norm": 2.2147505283355713, "learning_rate": 1.0412918667772164e-05, "loss": 0.039, "step": 215900 }, { "epoch": 1.605052944454765, "grad_norm": 0.3837757110595703, "learning_rate": 1.0393363890931278e-05, "loss": 0.0274, "step": 216000 }, { "epoch": 1.6057960245216423, "grad_norm": 2.7019786834716797, "learning_rate": 1.0373809114090391e-05, "loss": 0.0347, "step": 216100 }, { "epoch": 1.6065391045885193, "grad_norm": 0.6074824333190918, "learning_rate": 1.0354254337249505e-05, "loss": 0.033, "step": 216200 }, { "epoch": 1.6072821846553966, "grad_norm": 0.21769122779369354, "learning_rate": 1.0334699560408618e-05, "loss": 0.0326, "step": 216300 }, { "epoch": 1.608025264722274, "grad_norm": 0.07630527764558792, "learning_rate": 1.031514478356773e-05, "loss": 0.0303, "step": 216400 }, { "epoch": 1.608768344789151, "grad_norm": 0.12393887341022491, "learning_rate": 1.0295590006726844e-05, "loss": 0.0317, "step": 216500 }, { "epoch": 1.6095114248560283, "grad_norm": 0.24535465240478516, "learning_rate": 1.0276035229885957e-05, "loss": 0.0345, "step": 216600 }, { "epoch": 1.6102545049229056, "grad_norm": 0.18093769252300262, "learning_rate": 1.025648045304507e-05, "loss": 0.034, "step": 216700 }, { "epoch": 1.6109975849897826, "grad_norm": 0.31281009316444397, "learning_rate": 1.0236925676204185e-05, "loss": 0.0394, "step": 216800 }, { "epoch": 1.6117406650566597, "grad_norm": 0.25630369782447815, "learning_rate": 1.0217370899363298e-05, "loss": 0.033, "step": 216900 }, { "epoch": 1.6124837451235372, "grad_norm": 0.6873476505279541, "learning_rate": 1.019781612252241e-05, "loss": 0.0305, "step": 217000 } ], "logging_steps": 100, "max_steps": 269150, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.310244863015483e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }