{ "best_metric": 0.6936118602752686, "best_model_checkpoint": "./wav2vec-bert-korean-dialect-recognition/checkpoint-294606", "epoch": 10.0, "eval_steps": 500, "global_step": 327340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030549275982159224, "grad_norm": 1.5348891019821167, "learning_rate": 1e-05, "loss": 1.7854, "step": 100 }, { "epoch": 0.006109855196431845, "grad_norm": 1.399305820465088, "learning_rate": 2e-05, "loss": 1.7564, "step": 200 }, { "epoch": 0.009164782794647768, "grad_norm": 1.1098747253417969, "learning_rate": 3e-05, "loss": 1.7385, "step": 300 }, { "epoch": 0.01221971039286369, "grad_norm": 0.9315665364265442, "learning_rate": 4e-05, "loss": 1.6674, "step": 400 }, { "epoch": 0.015274637991079611, "grad_norm": 1.0861330032348633, "learning_rate": 5e-05, "loss": 1.7178, "step": 500 }, { "epoch": 0.018329565589295535, "grad_norm": 1.7394012212753296, "learning_rate": 4.9984701994859874e-05, "loss": 1.6606, "step": 600 }, { "epoch": 0.021384493187511455, "grad_norm": 1.3549752235412598, "learning_rate": 4.9969403989719745e-05, "loss": 1.6554, "step": 700 }, { "epoch": 0.02443942078572738, "grad_norm": 1.4596511125564575, "learning_rate": 4.9954105984579616e-05, "loss": 1.6682, "step": 800 }, { "epoch": 0.0274943483839433, "grad_norm": 1.1955530643463135, "learning_rate": 4.993880797943949e-05, "loss": 1.671, "step": 900 }, { "epoch": 0.030549275982159223, "grad_norm": 0.7402735948562622, "learning_rate": 4.992350997429935e-05, "loss": 1.6168, "step": 1000 }, { "epoch": 0.03360420358037514, "grad_norm": 1.1003531217575073, "learning_rate": 4.990821196915922e-05, "loss": 1.5873, "step": 1100 }, { "epoch": 0.03665913117859107, "grad_norm": 2.4431450366973877, "learning_rate": 4.9892913964019094e-05, "loss": 1.6629, "step": 1200 }, { "epoch": 0.03971405877680699, "grad_norm": 1.907747507095337, "learning_rate": 4.9877615958878965e-05, "loss": 1.6859, "step": 1300 }, { 
"epoch": 0.04276898637502291, "grad_norm": 1.5831812620162964, "learning_rate": 4.9862317953738836e-05, "loss": 1.6506, "step": 1400 }, { "epoch": 0.04582391397323884, "grad_norm": 2.0335052013397217, "learning_rate": 4.984701994859871e-05, "loss": 1.6158, "step": 1500 }, { "epoch": 0.04887884157145476, "grad_norm": 1.364405632019043, "learning_rate": 4.983172194345858e-05, "loss": 1.5788, "step": 1600 }, { "epoch": 0.05193376916967068, "grad_norm": 0.8212212920188904, "learning_rate": 4.981642393831845e-05, "loss": 1.6158, "step": 1700 }, { "epoch": 0.0549886967678866, "grad_norm": 1.3664904832839966, "learning_rate": 4.9801125933178314e-05, "loss": 1.6436, "step": 1800 }, { "epoch": 0.058043624366102525, "grad_norm": 2.1316192150115967, "learning_rate": 4.9785827928038185e-05, "loss": 1.6091, "step": 1900 }, { "epoch": 0.061098551964318445, "grad_norm": 1.4975366592407227, "learning_rate": 4.9770529922898056e-05, "loss": 1.6547, "step": 2000 }, { "epoch": 0.06415347956253437, "grad_norm": 2.3204729557037354, "learning_rate": 4.9755384897809324e-05, "loss": 1.5825, "step": 2100 }, { "epoch": 0.06720840716075029, "grad_norm": 0.7949225306510925, "learning_rate": 4.9740086892669195e-05, "loss": 1.5964, "step": 2200 }, { "epoch": 0.0702633347589662, "grad_norm": 2.1805293560028076, "learning_rate": 4.9724788887529066e-05, "loss": 1.6642, "step": 2300 }, { "epoch": 0.07331826235718214, "grad_norm": 2.7045323848724365, "learning_rate": 4.970949088238894e-05, "loss": 1.6367, "step": 2400 }, { "epoch": 0.07637318995539806, "grad_norm": 1.8262252807617188, "learning_rate": 4.969419287724881e-05, "loss": 1.591, "step": 2500 }, { "epoch": 0.07942811755361398, "grad_norm": 2.541649341583252, "learning_rate": 4.967889487210868e-05, "loss": 1.5751, "step": 2600 }, { "epoch": 0.0824830451518299, "grad_norm": 2.23138427734375, "learning_rate": 4.966359686696855e-05, "loss": 1.5997, "step": 2700 }, { "epoch": 0.08553797275004582, "grad_norm": 1.8299692869186401, "learning_rate": 
4.9648298861828415e-05, "loss": 1.594, "step": 2800 }, { "epoch": 0.08859290034826174, "grad_norm": 1.8534067869186401, "learning_rate": 4.963300085668829e-05, "loss": 1.6269, "step": 2900 }, { "epoch": 0.09164782794647768, "grad_norm": 1.8997411727905273, "learning_rate": 4.961770285154816e-05, "loss": 1.6089, "step": 3000 }, { "epoch": 0.0947027555446936, "grad_norm": 2.0725138187408447, "learning_rate": 4.960240484640803e-05, "loss": 1.5926, "step": 3100 }, { "epoch": 0.09775768314290952, "grad_norm": 1.6708780527114868, "learning_rate": 4.95871068412679e-05, "loss": 1.6021, "step": 3200 }, { "epoch": 0.10081261074112544, "grad_norm": 1.2676891088485718, "learning_rate": 4.957180883612777e-05, "loss": 1.594, "step": 3300 }, { "epoch": 0.10386753833934136, "grad_norm": 2.105405807495117, "learning_rate": 4.955651083098764e-05, "loss": 1.58, "step": 3400 }, { "epoch": 0.10692246593755728, "grad_norm": 0.9499572515487671, "learning_rate": 4.9541212825847514e-05, "loss": 1.5693, "step": 3500 }, { "epoch": 0.1099773935357732, "grad_norm": 1.4242860078811646, "learning_rate": 4.952591482070738e-05, "loss": 1.6005, "step": 3600 }, { "epoch": 0.11303232113398913, "grad_norm": 2.2109808921813965, "learning_rate": 4.951061681556725e-05, "loss": 1.596, "step": 3700 }, { "epoch": 0.11608724873220505, "grad_norm": 1.9320544004440308, "learning_rate": 4.949531881042712e-05, "loss": 1.5385, "step": 3800 }, { "epoch": 0.11914217633042097, "grad_norm": 2.3479628562927246, "learning_rate": 4.948002080528699e-05, "loss": 1.6029, "step": 3900 }, { "epoch": 0.12219710392863689, "grad_norm": 2.0376906394958496, "learning_rate": 4.946472280014686e-05, "loss": 1.5878, "step": 4000 }, { "epoch": 0.12525203152685283, "grad_norm": 1.2168834209442139, "learning_rate": 4.944957777505814e-05, "loss": 1.557, "step": 4100 }, { "epoch": 0.12830695912506873, "grad_norm": 1.6018855571746826, "learning_rate": 4.943427976991801e-05, "loss": 1.5317, "step": 4200 }, { "epoch": 0.13136188672328467, 
"grad_norm": 1.5891162157058716, "learning_rate": 4.941898176477788e-05, "loss": 1.5615, "step": 4300 }, { "epoch": 0.13441681432150057, "grad_norm": 1.4518991708755493, "learning_rate": 4.940368375963775e-05, "loss": 1.5113, "step": 4400 }, { "epoch": 0.1374717419197165, "grad_norm": 1.9849175214767456, "learning_rate": 4.9388385754497615e-05, "loss": 1.6008, "step": 4500 }, { "epoch": 0.1405266695179324, "grad_norm": 1.123494267463684, "learning_rate": 4.9373087749357486e-05, "loss": 1.541, "step": 4600 }, { "epoch": 0.14358159711614835, "grad_norm": 2.3314383029937744, "learning_rate": 4.935778974421736e-05, "loss": 1.5266, "step": 4700 }, { "epoch": 0.14663652471436428, "grad_norm": 0.6723255515098572, "learning_rate": 4.934249173907723e-05, "loss": 1.5489, "step": 4800 }, { "epoch": 0.1496914523125802, "grad_norm": 1.7696168422698975, "learning_rate": 4.93271937339371e-05, "loss": 1.5782, "step": 4900 }, { "epoch": 0.15274637991079612, "grad_norm": 2.4960386753082275, "learning_rate": 4.931189572879697e-05, "loss": 1.5567, "step": 5000 }, { "epoch": 0.15580130750901203, "grad_norm": 0.6923491954803467, "learning_rate": 4.929659772365684e-05, "loss": 1.5766, "step": 5100 }, { "epoch": 0.15885623510722796, "grad_norm": 1.6389570236206055, "learning_rate": 4.928129971851671e-05, "loss": 1.568, "step": 5200 }, { "epoch": 0.16191116270544387, "grad_norm": 1.4275109767913818, "learning_rate": 4.926600171337658e-05, "loss": 1.5785, "step": 5300 }, { "epoch": 0.1649660903036598, "grad_norm": 0.9533630013465881, "learning_rate": 4.925070370823645e-05, "loss": 1.5425, "step": 5400 }, { "epoch": 0.16802101790187574, "grad_norm": 2.8133769035339355, "learning_rate": 4.923540570309632e-05, "loss": 1.5054, "step": 5500 }, { "epoch": 0.17107594550009164, "grad_norm": 1.8214422464370728, "learning_rate": 4.922010769795619e-05, "loss": 1.5494, "step": 5600 }, { "epoch": 0.17413087309830758, "grad_norm": 1.1751452684402466, "learning_rate": 4.920480969281606e-05, "loss": 
1.5616, "step": 5700 }, { "epoch": 0.17718580069652348, "grad_norm": 1.7518926858901978, "learning_rate": 4.9189511687675933e-05, "loss": 1.4986, "step": 5800 }, { "epoch": 0.18024072829473942, "grad_norm": 1.4943835735321045, "learning_rate": 4.9174213682535805e-05, "loss": 1.539, "step": 5900 }, { "epoch": 0.18329565589295535, "grad_norm": 1.664491891860962, "learning_rate": 4.9158915677395676e-05, "loss": 1.5681, "step": 6000 }, { "epoch": 0.18635058349117126, "grad_norm": 1.8616182804107666, "learning_rate": 4.9143770652306944e-05, "loss": 1.4818, "step": 6100 }, { "epoch": 0.1894055110893872, "grad_norm": 1.5610833168029785, "learning_rate": 4.9128472647166815e-05, "loss": 1.5125, "step": 6200 }, { "epoch": 0.1924604386876031, "grad_norm": 3.129401445388794, "learning_rate": 4.911317464202668e-05, "loss": 1.4901, "step": 6300 }, { "epoch": 0.19551536628581903, "grad_norm": 1.6080937385559082, "learning_rate": 4.909787663688655e-05, "loss": 1.4938, "step": 6400 }, { "epoch": 0.19857029388403494, "grad_norm": 1.7884577512741089, "learning_rate": 4.908257863174642e-05, "loss": 1.4891, "step": 6500 }, { "epoch": 0.20162522148225087, "grad_norm": 0.6538684964179993, "learning_rate": 4.906728062660629e-05, "loss": 1.5345, "step": 6600 }, { "epoch": 0.2046801490804668, "grad_norm": 0.9260094165802002, "learning_rate": 4.9051982621466164e-05, "loss": 1.5286, "step": 6700 }, { "epoch": 0.2077350766786827, "grad_norm": 1.738603949546814, "learning_rate": 4.9036684616326035e-05, "loss": 1.458, "step": 6800 }, { "epoch": 0.21079000427689865, "grad_norm": 0.5729855895042419, "learning_rate": 4.9021386611185906e-05, "loss": 1.4975, "step": 6900 }, { "epoch": 0.21384493187511455, "grad_norm": 2.177529811859131, "learning_rate": 4.900608860604578e-05, "loss": 1.4874, "step": 7000 }, { "epoch": 0.2168998594733305, "grad_norm": 2.3976924419403076, "learning_rate": 4.899079060090564e-05, "loss": 1.5065, "step": 7100 }, { "epoch": 0.2199547870715464, "grad_norm": 
1.6993240118026733, "learning_rate": 4.897549259576551e-05, "loss": 1.449, "step": 7200 }, { "epoch": 0.22300971466976233, "grad_norm": 1.3176172971725464, "learning_rate": 4.8960194590625384e-05, "loss": 1.4667, "step": 7300 }, { "epoch": 0.22606464226797826, "grad_norm": 0.5767053365707397, "learning_rate": 4.8944896585485255e-05, "loss": 1.5477, "step": 7400 }, { "epoch": 0.22911956986619417, "grad_norm": 2.0425851345062256, "learning_rate": 4.8929598580345126e-05, "loss": 1.4457, "step": 7500 }, { "epoch": 0.2321744974644101, "grad_norm": 2.019169807434082, "learning_rate": 4.8914300575205e-05, "loss": 1.4033, "step": 7600 }, { "epoch": 0.235229425062626, "grad_norm": 1.443758487701416, "learning_rate": 4.889900257006487e-05, "loss": 1.4599, "step": 7700 }, { "epoch": 0.23828435266084194, "grad_norm": 2.3572771549224854, "learning_rate": 4.888370456492474e-05, "loss": 1.4191, "step": 7800 }, { "epoch": 0.24133928025905785, "grad_norm": 2.7198643684387207, "learning_rate": 4.8868406559784604e-05, "loss": 1.3884, "step": 7900 }, { "epoch": 0.24439420785727378, "grad_norm": 1.7861402034759521, "learning_rate": 4.8853108554644475e-05, "loss": 1.5456, "step": 8000 }, { "epoch": 0.24744913545548972, "grad_norm": 1.8129621744155884, "learning_rate": 4.883796352955575e-05, "loss": 1.4116, "step": 8100 }, { "epoch": 0.25050406305370565, "grad_norm": 2.0511136054992676, "learning_rate": 4.8822665524415615e-05, "loss": 1.3446, "step": 8200 }, { "epoch": 0.25355899065192156, "grad_norm": 3.2969486713409424, "learning_rate": 4.8807367519275486e-05, "loss": 1.4883, "step": 8300 }, { "epoch": 0.25661391825013746, "grad_norm": 0.9347625374794006, "learning_rate": 4.879206951413536e-05, "loss": 1.4372, "step": 8400 }, { "epoch": 0.25966884584835337, "grad_norm": 4.156298637390137, "learning_rate": 4.877677150899523e-05, "loss": 1.5377, "step": 8500 }, { "epoch": 0.26272377344656933, "grad_norm": 3.0619304180145264, "learning_rate": 4.87614735038551e-05, "loss": 1.509, "step": 
8600 }, { "epoch": 0.26577870104478524, "grad_norm": 1.6035257577896118, "learning_rate": 4.874617549871497e-05, "loss": 1.4709, "step": 8700 }, { "epoch": 0.26883362864300114, "grad_norm": 2.9043540954589844, "learning_rate": 4.873087749357484e-05, "loss": 1.4119, "step": 8800 }, { "epoch": 0.2718885562412171, "grad_norm": 2.1844189167022705, "learning_rate": 4.871557948843471e-05, "loss": 1.4593, "step": 8900 }, { "epoch": 0.274943483839433, "grad_norm": 1.494769811630249, "learning_rate": 4.870028148329458e-05, "loss": 1.3733, "step": 9000 }, { "epoch": 0.2779984114376489, "grad_norm": 1.0596073865890503, "learning_rate": 4.868498347815445e-05, "loss": 1.4862, "step": 9100 }, { "epoch": 0.2810533390358648, "grad_norm": 2.1339333057403564, "learning_rate": 4.866968547301432e-05, "loss": 1.4308, "step": 9200 }, { "epoch": 0.2841082666340808, "grad_norm": 2.508408546447754, "learning_rate": 4.865438746787419e-05, "loss": 1.4522, "step": 9300 }, { "epoch": 0.2871631942322967, "grad_norm": 1.4985188245773315, "learning_rate": 4.863908946273406e-05, "loss": 1.4005, "step": 9400 }, { "epoch": 0.2902181218305126, "grad_norm": 1.525770902633667, "learning_rate": 4.862379145759393e-05, "loss": 1.4459, "step": 9500 }, { "epoch": 0.29327304942872856, "grad_norm": 2.6566789150238037, "learning_rate": 4.8608493452453804e-05, "loss": 1.4367, "step": 9600 }, { "epoch": 0.29632797702694447, "grad_norm": 2.781080722808838, "learning_rate": 4.8593195447313675e-05, "loss": 1.4287, "step": 9700 }, { "epoch": 0.2993829046251604, "grad_norm": 0.9105027914047241, "learning_rate": 4.857789744217354e-05, "loss": 1.4311, "step": 9800 }, { "epoch": 0.3024378322233763, "grad_norm": 1.959030032157898, "learning_rate": 4.856259943703341e-05, "loss": 1.4048, "step": 9900 }, { "epoch": 0.30549275982159224, "grad_norm": 3.855910301208496, "learning_rate": 4.854730143189328e-05, "loss": 1.3622, "step": 10000 }, { "epoch": 0.30854768741980815, "grad_norm": 5.930082321166992, "learning_rate": 
4.8532156406804557e-05, "loss": 1.3551, "step": 10100 }, { "epoch": 0.31160261501802405, "grad_norm": 2.937746286392212, "learning_rate": 4.851685840166443e-05, "loss": 1.3583, "step": 10200 }, { "epoch": 0.31465754261624, "grad_norm": 3.188389301300049, "learning_rate": 4.85015603965243e-05, "loss": 1.4423, "step": 10300 }, { "epoch": 0.3177124702144559, "grad_norm": 1.9529550075531006, "learning_rate": 4.848626239138417e-05, "loss": 1.4067, "step": 10400 }, { "epoch": 0.32076739781267183, "grad_norm": 4.006279945373535, "learning_rate": 4.847096438624404e-05, "loss": 1.417, "step": 10500 }, { "epoch": 0.32382232541088773, "grad_norm": 1.2116618156433105, "learning_rate": 4.845566638110391e-05, "loss": 1.4318, "step": 10600 }, { "epoch": 0.3268772530091037, "grad_norm": 1.4079272747039795, "learning_rate": 4.844036837596378e-05, "loss": 1.3835, "step": 10700 }, { "epoch": 0.3299321806073196, "grad_norm": 4.599946975708008, "learning_rate": 4.842507037082364e-05, "loss": 1.4411, "step": 10800 }, { "epoch": 0.3329871082055355, "grad_norm": 5.579395294189453, "learning_rate": 4.840977236568351e-05, "loss": 1.3978, "step": 10900 }, { "epoch": 0.33604203580375147, "grad_norm": 1.4071085453033447, "learning_rate": 4.839447436054338e-05, "loss": 1.3574, "step": 11000 }, { "epoch": 0.3390969634019674, "grad_norm": 1.6776551008224487, "learning_rate": 4.8379176355403255e-05, "loss": 1.4283, "step": 11100 }, { "epoch": 0.3421518910001833, "grad_norm": 4.477878093719482, "learning_rate": 4.8363878350263126e-05, "loss": 1.4258, "step": 11200 }, { "epoch": 0.34520681859839925, "grad_norm": 0.5587853789329529, "learning_rate": 4.8348580345123e-05, "loss": 1.4444, "step": 11300 }, { "epoch": 0.34826174619661515, "grad_norm": 4.845232963562012, "learning_rate": 4.833328233998287e-05, "loss": 1.3144, "step": 11400 }, { "epoch": 0.35131667379483106, "grad_norm": 3.292431592941284, "learning_rate": 4.831798433484274e-05, "loss": 1.3466, "step": 11500 }, { "epoch": 
0.35437160139304696, "grad_norm": 2.4652178287506104, "learning_rate": 4.8302686329702604e-05, "loss": 1.3643, "step": 11600 }, { "epoch": 0.3574265289912629, "grad_norm": 2.7705917358398438, "learning_rate": 4.8287388324562475e-05, "loss": 1.4231, "step": 11700 }, { "epoch": 0.36048145658947883, "grad_norm": 3.021454334259033, "learning_rate": 4.8272090319422346e-05, "loss": 1.4229, "step": 11800 }, { "epoch": 0.36353638418769474, "grad_norm": 3.1049673557281494, "learning_rate": 4.825679231428222e-05, "loss": 1.3907, "step": 11900 }, { "epoch": 0.3665913117859107, "grad_norm": 2.0627031326293945, "learning_rate": 4.824149430914209e-05, "loss": 1.3381, "step": 12000 }, { "epoch": 0.3696462393841266, "grad_norm": 2.4559926986694336, "learning_rate": 4.822619630400196e-05, "loss": 1.3848, "step": 12100 }, { "epoch": 0.3727011669823425, "grad_norm": 3.559675693511963, "learning_rate": 4.8211051278913234e-05, "loss": 1.3528, "step": 12200 }, { "epoch": 0.3757560945805584, "grad_norm": 2.9225029945373535, "learning_rate": 4.8195753273773105e-05, "loss": 1.359, "step": 12300 }, { "epoch": 0.3788110221787744, "grad_norm": 2.4771335124969482, "learning_rate": 4.8180455268632976e-05, "loss": 1.3831, "step": 12400 }, { "epoch": 0.3818659497769903, "grad_norm": 2.1963953971862793, "learning_rate": 4.816515726349284e-05, "loss": 1.3522, "step": 12500 }, { "epoch": 0.3849208773752062, "grad_norm": 2.0889718532562256, "learning_rate": 4.814985925835271e-05, "loss": 1.3587, "step": 12600 }, { "epoch": 0.38797580497342216, "grad_norm": 2.777860641479492, "learning_rate": 4.813456125321258e-05, "loss": 1.3519, "step": 12700 }, { "epoch": 0.39103073257163806, "grad_norm": 2.518688201904297, "learning_rate": 4.8119263248072454e-05, "loss": 1.43, "step": 12800 }, { "epoch": 0.39408566016985397, "grad_norm": 2.9270553588867188, "learning_rate": 4.8103965242932325e-05, "loss": 1.3488, "step": 12900 }, { "epoch": 0.3971405877680699, "grad_norm": 2.321699857711792, "learning_rate": 
4.8088667237792197e-05, "loss": 1.3405, "step": 13000 }, { "epoch": 0.40019551536628584, "grad_norm": 3.4953596591949463, "learning_rate": 4.807336923265207e-05, "loss": 1.2441, "step": 13100 }, { "epoch": 0.40325044296450174, "grad_norm": 2.8772597312927246, "learning_rate": 4.805807122751194e-05, "loss": 1.3981, "step": 13200 }, { "epoch": 0.40630537056271765, "grad_norm": 1.0560011863708496, "learning_rate": 4.80427732223718e-05, "loss": 1.3345, "step": 13300 }, { "epoch": 0.4093602981609336, "grad_norm": 3.9890592098236084, "learning_rate": 4.8027475217231674e-05, "loss": 1.2633, "step": 13400 }, { "epoch": 0.4124152257591495, "grad_norm": 4.9667816162109375, "learning_rate": 4.8012177212091546e-05, "loss": 1.3852, "step": 13500 }, { "epoch": 0.4154701533573654, "grad_norm": 4.832125186920166, "learning_rate": 4.799687920695142e-05, "loss": 1.3617, "step": 13600 }, { "epoch": 0.41852508095558133, "grad_norm": 1.3396093845367432, "learning_rate": 4.798158120181129e-05, "loss": 1.348, "step": 13700 }, { "epoch": 0.4215800085537973, "grad_norm": 2.829025983810425, "learning_rate": 4.796628319667116e-05, "loss": 1.3474, "step": 13800 }, { "epoch": 0.4246349361520132, "grad_norm": 4.8995280265808105, "learning_rate": 4.795098519153103e-05, "loss": 1.3866, "step": 13900 }, { "epoch": 0.4276898637502291, "grad_norm": 1.4323627948760986, "learning_rate": 4.79356871863909e-05, "loss": 1.2936, "step": 14000 }, { "epoch": 0.43074479134844507, "grad_norm": 1.9841852188110352, "learning_rate": 4.7920389181250766e-05, "loss": 1.4257, "step": 14100 }, { "epoch": 0.433799718946661, "grad_norm": 1.8389599323272705, "learning_rate": 4.790524415616204e-05, "loss": 1.3501, "step": 14200 }, { "epoch": 0.4368546465448769, "grad_norm": 2.149446487426758, "learning_rate": 4.788994615102191e-05, "loss": 1.3285, "step": 14300 }, { "epoch": 0.4399095741430928, "grad_norm": 2.1945650577545166, "learning_rate": 4.7874648145881776e-05, "loss": 1.375, "step": 14400 }, { "epoch": 
0.44296450174130875, "grad_norm": 3.0235095024108887, "learning_rate": 4.785935014074165e-05, "loss": 1.3076, "step": 14500 }, { "epoch": 0.44601942933952465, "grad_norm": 1.4999414682388306, "learning_rate": 4.784405213560152e-05, "loss": 1.4213, "step": 14600 }, { "epoch": 0.44907435693774056, "grad_norm": 2.6008243560791016, "learning_rate": 4.782875413046139e-05, "loss": 1.3755, "step": 14700 }, { "epoch": 0.4521292845359565, "grad_norm": 3.3716583251953125, "learning_rate": 4.781345612532126e-05, "loss": 1.3439, "step": 14800 }, { "epoch": 0.45518421213417243, "grad_norm": 1.567351222038269, "learning_rate": 4.779815812018113e-05, "loss": 1.2619, "step": 14900 }, { "epoch": 0.45823913973238833, "grad_norm": 4.242433547973633, "learning_rate": 4.7782860115041e-05, "loss": 1.3884, "step": 15000 }, { "epoch": 0.46129406733060424, "grad_norm": 5.024165153503418, "learning_rate": 4.7767562109900874e-05, "loss": 1.2998, "step": 15100 }, { "epoch": 0.4643489949288202, "grad_norm": 1.833509922027588, "learning_rate": 4.775226410476074e-05, "loss": 1.4013, "step": 15200 }, { "epoch": 0.4674039225270361, "grad_norm": 2.599609613418579, "learning_rate": 4.773696609962061e-05, "loss": 1.3305, "step": 15300 }, { "epoch": 0.470458850125252, "grad_norm": 1.2570773363113403, "learning_rate": 4.772166809448048e-05, "loss": 1.2685, "step": 15400 }, { "epoch": 0.473513777723468, "grad_norm": 2.675783634185791, "learning_rate": 4.770637008934035e-05, "loss": 1.2194, "step": 15500 }, { "epoch": 0.4765687053216839, "grad_norm": 5.149052143096924, "learning_rate": 4.769107208420022e-05, "loss": 1.3477, "step": 15600 }, { "epoch": 0.4796236329198998, "grad_norm": 2.4533028602600098, "learning_rate": 4.7675774079060094e-05, "loss": 1.3299, "step": 15700 }, { "epoch": 0.4826785605181157, "grad_norm": 1.2919026613235474, "learning_rate": 4.7660476073919965e-05, "loss": 1.3504, "step": 15800 }, { "epoch": 0.48573348811633166, "grad_norm": 4.265079498291016, "learning_rate": 
4.7645178068779837e-05, "loss": 1.4127, "step": 15900 }, { "epoch": 0.48878841571454756, "grad_norm": 2.5100667476654053, "learning_rate": 4.76298800636397e-05, "loss": 1.4027, "step": 16000 }, { "epoch": 0.49184334331276347, "grad_norm": 3.6750094890594482, "learning_rate": 4.761458205849957e-05, "loss": 1.2504, "step": 16100 }, { "epoch": 0.49489827091097943, "grad_norm": 1.9922250509262085, "learning_rate": 4.759943703341084e-05, "loss": 1.303, "step": 16200 }, { "epoch": 0.49795319850919534, "grad_norm": 2.3737528324127197, "learning_rate": 4.758413902827071e-05, "loss": 1.3723, "step": 16300 }, { "epoch": 0.5010081261074113, "grad_norm": 1.4733338356018066, "learning_rate": 4.756884102313058e-05, "loss": 1.2958, "step": 16400 }, { "epoch": 0.5040630537056272, "grad_norm": 5.010000705718994, "learning_rate": 4.7553543017990454e-05, "loss": 1.1981, "step": 16500 }, { "epoch": 0.5071179813038431, "grad_norm": 1.789291501045227, "learning_rate": 4.7538245012850325e-05, "loss": 1.385, "step": 16600 }, { "epoch": 0.510172908902059, "grad_norm": 4.9523234367370605, "learning_rate": 4.7522947007710196e-05, "loss": 1.2778, "step": 16700 }, { "epoch": 0.5132278365002749, "grad_norm": 1.309713363647461, "learning_rate": 4.750764900257007e-05, "loss": 1.3772, "step": 16800 }, { "epoch": 0.5162827640984908, "grad_norm": 2.1764285564422607, "learning_rate": 4.749235099742994e-05, "loss": 1.2442, "step": 16900 }, { "epoch": 0.5193376916967067, "grad_norm": 5.70428991317749, "learning_rate": 4.74770529922898e-05, "loss": 1.3491, "step": 17000 }, { "epoch": 0.5223926192949228, "grad_norm": 5.1388468742370605, "learning_rate": 4.7461754987149674e-05, "loss": 1.3646, "step": 17100 }, { "epoch": 0.5254475468931387, "grad_norm": 3.198745012283325, "learning_rate": 4.7446456982009545e-05, "loss": 1.124, "step": 17200 }, { "epoch": 0.5285024744913546, "grad_norm": 4.073473930358887, "learning_rate": 4.7431158976869416e-05, "loss": 1.3609, "step": 17300 }, { "epoch": 
0.5315574020895705, "grad_norm": 3.4154462814331055, "learning_rate": 4.741586097172929e-05, "loss": 1.2877, "step": 17400 }, { "epoch": 0.5346123296877864, "grad_norm": 2.073560953140259, "learning_rate": 4.740056296658916e-05, "loss": 1.3057, "step": 17500 }, { "epoch": 0.5376672572860023, "grad_norm": 2.3424408435821533, "learning_rate": 4.738526496144903e-05, "loss": 1.3119, "step": 17600 }, { "epoch": 0.5407221848842182, "grad_norm": 1.6415555477142334, "learning_rate": 4.73699669563089e-05, "loss": 1.307, "step": 17700 }, { "epoch": 0.5437771124824342, "grad_norm": 3.323765277862549, "learning_rate": 4.7354668951168765e-05, "loss": 1.3771, "step": 17800 }, { "epoch": 0.5468320400806501, "grad_norm": 6.020792007446289, "learning_rate": 4.7339370946028636e-05, "loss": 1.303, "step": 17900 }, { "epoch": 0.549886967678866, "grad_norm": 3.13588547706604, "learning_rate": 4.732407294088851e-05, "loss": 1.3615, "step": 18000 }, { "epoch": 0.5529418952770819, "grad_norm": 0.8763374090194702, "learning_rate": 4.730877493574838e-05, "loss": 1.2856, "step": 18100 }, { "epoch": 0.5559968228752978, "grad_norm": 4.6722025871276855, "learning_rate": 4.729362991065965e-05, "loss": 1.2759, "step": 18200 }, { "epoch": 0.5590517504735137, "grad_norm": 4.301787853240967, "learning_rate": 4.7278331905519524e-05, "loss": 1.2397, "step": 18300 }, { "epoch": 0.5621066780717296, "grad_norm": 3.807133436203003, "learning_rate": 4.7263033900379396e-05, "loss": 1.3704, "step": 18400 }, { "epoch": 0.5651616056699457, "grad_norm": 3.641347885131836, "learning_rate": 4.724773589523927e-05, "loss": 1.2844, "step": 18500 }, { "epoch": 0.5682165332681616, "grad_norm": 2.7137675285339355, "learning_rate": 4.723243789009914e-05, "loss": 1.2856, "step": 18600 }, { "epoch": 0.5712714608663775, "grad_norm": 4.199899673461914, "learning_rate": 4.7217139884959e-05, "loss": 1.2635, "step": 18700 }, { "epoch": 0.5743263884645934, "grad_norm": 3.4955711364746094, "learning_rate": 
4.7201841879818873e-05, "loss": 1.4038, "step": 18800 }, { "epoch": 0.5773813160628093, "grad_norm": 4.2103471755981445, "learning_rate": 4.7186543874678745e-05, "loss": 1.2279, "step": 18900 }, { "epoch": 0.5804362436610252, "grad_norm": 1.8225414752960205, "learning_rate": 4.7171245869538616e-05, "loss": 1.3211, "step": 19000 }, { "epoch": 0.5834911712592411, "grad_norm": 4.89621639251709, "learning_rate": 4.715594786439849e-05, "loss": 1.2343, "step": 19100 }, { "epoch": 0.5865460988574571, "grad_norm": 3.157581090927124, "learning_rate": 4.714064985925836e-05, "loss": 1.3112, "step": 19200 }, { "epoch": 0.589601026455673, "grad_norm": 4.4817423820495605, "learning_rate": 4.712535185411823e-05, "loss": 1.3675, "step": 19300 }, { "epoch": 0.5926559540538889, "grad_norm": 3.6259207725524902, "learning_rate": 4.71100538489781e-05, "loss": 1.2662, "step": 19400 }, { "epoch": 0.5957108816521048, "grad_norm": 1.9341295957565308, "learning_rate": 4.7094755843837965e-05, "loss": 1.264, "step": 19500 }, { "epoch": 0.5987658092503207, "grad_norm": 2.3397438526153564, "learning_rate": 4.7079457838697836e-05, "loss": 1.2692, "step": 19600 }, { "epoch": 0.6018207368485367, "grad_norm": 2.147289514541626, "learning_rate": 4.706415983355771e-05, "loss": 1.1633, "step": 19700 }, { "epoch": 0.6048756644467526, "grad_norm": 4.903138160705566, "learning_rate": 4.704886182841758e-05, "loss": 1.2628, "step": 19800 }, { "epoch": 0.6079305920449686, "grad_norm": 4.019737720489502, "learning_rate": 4.703356382327745e-05, "loss": 1.2283, "step": 19900 }, { "epoch": 0.6109855196431845, "grad_norm": 5.8793182373046875, "learning_rate": 4.701826581813732e-05, "loss": 1.2297, "step": 20000 }, { "epoch": 0.6140404472414004, "grad_norm": 2.11370587348938, "learning_rate": 4.700296781299719e-05, "loss": 1.3137, "step": 20100 }, { "epoch": 0.6170953748396163, "grad_norm": 2.1315717697143555, "learning_rate": 4.698782278790846e-05, "loss": 1.3261, "step": 20200 }, { "epoch": 0.6201503024378322, 
"grad_norm": 3.4208743572235107, "learning_rate": 4.697252478276833e-05, "loss": 1.3392, "step": 20300 }, { "epoch": 0.6232052300360481, "grad_norm": 2.3667633533477783, "learning_rate": 4.69572267776282e-05, "loss": 1.2584, "step": 20400 }, { "epoch": 0.626260157634264, "grad_norm": 2.951887607574463, "learning_rate": 4.6941928772488066e-05, "loss": 1.2928, "step": 20500 }, { "epoch": 0.62931508523248, "grad_norm": 3.0843470096588135, "learning_rate": 4.692663076734794e-05, "loss": 1.351, "step": 20600 }, { "epoch": 0.6323700128306959, "grad_norm": 3.92063045501709, "learning_rate": 4.691133276220781e-05, "loss": 1.3723, "step": 20700 }, { "epoch": 0.6354249404289118, "grad_norm": 6.2461676597595215, "learning_rate": 4.689603475706768e-05, "loss": 1.3467, "step": 20800 }, { "epoch": 0.6384798680271278, "grad_norm": 5.256413459777832, "learning_rate": 4.688073675192755e-05, "loss": 1.2848, "step": 20900 }, { "epoch": 0.6415347956253437, "grad_norm": 2.6756303310394287, "learning_rate": 4.686543874678742e-05, "loss": 1.2802, "step": 21000 }, { "epoch": 0.6445897232235596, "grad_norm": 2.9479217529296875, "learning_rate": 4.685014074164729e-05, "loss": 1.3156, "step": 21100 }, { "epoch": 0.6476446508217755, "grad_norm": 1.6067280769348145, "learning_rate": 4.6834842736507164e-05, "loss": 1.2648, "step": 21200 }, { "epoch": 0.6506995784199915, "grad_norm": 2.9112439155578613, "learning_rate": 4.681954473136703e-05, "loss": 1.2988, "step": 21300 }, { "epoch": 0.6537545060182074, "grad_norm": 3.3154282569885254, "learning_rate": 4.68042467262269e-05, "loss": 1.2409, "step": 21400 }, { "epoch": 0.6568094336164233, "grad_norm": 1.7731764316558838, "learning_rate": 4.678894872108677e-05, "loss": 1.2897, "step": 21500 }, { "epoch": 0.6598643612146392, "grad_norm": 2.621236801147461, "learning_rate": 4.677365071594664e-05, "loss": 1.349, "step": 21600 }, { "epoch": 0.6629192888128551, "grad_norm": 1.563030481338501, "learning_rate": 4.6758352710806513e-05, "loss": 1.3203, 
"step": 21700 }, { "epoch": 0.665974216411071, "grad_norm": 3.4372918605804443, "learning_rate": 4.6743054705666385e-05, "loss": 1.3152, "step": 21800 }, { "epoch": 0.6690291440092869, "grad_norm": 5.096141815185547, "learning_rate": 4.6727756700526256e-05, "loss": 1.2419, "step": 21900 }, { "epoch": 0.6720840716075029, "grad_norm": 2.1512577533721924, "learning_rate": 4.671245869538613e-05, "loss": 1.2946, "step": 22000 }, { "epoch": 0.6751389992057188, "grad_norm": 11.683186531066895, "learning_rate": 4.669716069024599e-05, "loss": 1.2553, "step": 22100 }, { "epoch": 0.6781939268039348, "grad_norm": 9.039573669433594, "learning_rate": 4.6682015665157266e-05, "loss": 1.2411, "step": 22200 }, { "epoch": 0.6812488544021507, "grad_norm": 3.796337842941284, "learning_rate": 4.666671766001714e-05, "loss": 1.2908, "step": 22300 }, { "epoch": 0.6843037820003666, "grad_norm": 3.969041347503662, "learning_rate": 4.6651419654877e-05, "loss": 1.2525, "step": 22400 }, { "epoch": 0.6873587095985825, "grad_norm": 5.0099077224731445, "learning_rate": 4.663612164973687e-05, "loss": 1.265, "step": 22500 }, { "epoch": 0.6904136371967985, "grad_norm": 7.791167736053467, "learning_rate": 4.6620823644596744e-05, "loss": 1.3321, "step": 22600 }, { "epoch": 0.6934685647950144, "grad_norm": 2.604672431945801, "learning_rate": 4.6605525639456615e-05, "loss": 1.2495, "step": 22700 }, { "epoch": 0.6965234923932303, "grad_norm": 3.6437833309173584, "learning_rate": 4.6590227634316486e-05, "loss": 1.3683, "step": 22800 }, { "epoch": 0.6995784199914462, "grad_norm": 1.2340764999389648, "learning_rate": 4.657492962917636e-05, "loss": 1.2403, "step": 22900 }, { "epoch": 0.7026333475896621, "grad_norm": 7.440982341766357, "learning_rate": 4.655963162403623e-05, "loss": 1.2417, "step": 23000 }, { "epoch": 0.705688275187878, "grad_norm": 1.300933599472046, "learning_rate": 4.65443336188961e-05, "loss": 1.3279, "step": 23100 }, { "epoch": 0.7087432027860939, "grad_norm": 2.8632686138153076, 
"learning_rate": 4.6529035613755964e-05, "loss": 1.3214, "step": 23200 }, { "epoch": 0.71179813038431, "grad_norm": 10.080193519592285, "learning_rate": 4.6513737608615835e-05, "loss": 1.318, "step": 23300 }, { "epoch": 0.7148530579825259, "grad_norm": 2.173156976699829, "learning_rate": 4.6498439603475706e-05, "loss": 1.2769, "step": 23400 }, { "epoch": 0.7179079855807418, "grad_norm": 4.581544399261475, "learning_rate": 4.648329457838698e-05, "loss": 1.2981, "step": 23500 }, { "epoch": 0.7209629131789577, "grad_norm": 1.1701802015304565, "learning_rate": 4.646799657324685e-05, "loss": 1.2509, "step": 23600 }, { "epoch": 0.7240178407771736, "grad_norm": 4.420626640319824, "learning_rate": 4.6452698568106723e-05, "loss": 1.2835, "step": 23700 }, { "epoch": 0.7270727683753895, "grad_norm": 7.273269176483154, "learning_rate": 4.6437400562966595e-05, "loss": 1.3267, "step": 23800 }, { "epoch": 0.7301276959736054, "grad_norm": 4.186655521392822, "learning_rate": 4.6422102557826466e-05, "loss": 1.2442, "step": 23900 }, { "epoch": 0.7331826235718214, "grad_norm": 3.82904052734375, "learning_rate": 4.640680455268634e-05, "loss": 1.195, "step": 24000 }, { "epoch": 0.7362375511700373, "grad_norm": 2.4320216178894043, "learning_rate": 4.63915065475462e-05, "loss": 1.29, "step": 24100 }, { "epoch": 0.7392924787682532, "grad_norm": 2.309053897857666, "learning_rate": 4.637620854240607e-05, "loss": 1.3795, "step": 24200 }, { "epoch": 0.7423474063664691, "grad_norm": 3.3796703815460205, "learning_rate": 4.6360910537265944e-05, "loss": 1.2625, "step": 24300 }, { "epoch": 0.745402333964685, "grad_norm": 2.1054587364196777, "learning_rate": 4.6345612532125815e-05, "loss": 1.2281, "step": 24400 }, { "epoch": 0.7484572615629009, "grad_norm": 5.734375476837158, "learning_rate": 4.6330314526985686e-05, "loss": 1.2712, "step": 24500 }, { "epoch": 0.7515121891611168, "grad_norm": 6.285786151885986, "learning_rate": 4.631501652184556e-05, "loss": 1.1773, "step": 24600 }, { "epoch": 
0.7545671167593329, "grad_norm": 4.265208721160889, "learning_rate": 4.629971851670543e-05, "loss": 1.1952, "step": 24700 }, { "epoch": 0.7576220443575488, "grad_norm": 4.139396667480469, "learning_rate": 4.62844205115653e-05, "loss": 1.2747, "step": 24800 }, { "epoch": 0.7606769719557647, "grad_norm": 3.3605008125305176, "learning_rate": 4.6269122506425164e-05, "loss": 1.2392, "step": 24900 }, { "epoch": 0.7637318995539806, "grad_norm": 3.1468260288238525, "learning_rate": 4.6253824501285035e-05, "loss": 1.2669, "step": 25000 }, { "epoch": 0.7667868271521965, "grad_norm": 1.4195454120635986, "learning_rate": 4.6238526496144906e-05, "loss": 1.3578, "step": 25100 }, { "epoch": 0.7698417547504124, "grad_norm": 3.4323678016662598, "learning_rate": 4.622322849100478e-05, "loss": 1.3418, "step": 25200 }, { "epoch": 0.7728966823486283, "grad_norm": 8.107824325561523, "learning_rate": 4.620793048586465e-05, "loss": 1.1864, "step": 25300 }, { "epoch": 0.7759516099468443, "grad_norm": 1.496487021446228, "learning_rate": 4.619263248072452e-05, "loss": 1.2452, "step": 25400 }, { "epoch": 0.7790065375450602, "grad_norm": 4.837622165679932, "learning_rate": 4.617733447558439e-05, "loss": 1.2801, "step": 25500 }, { "epoch": 0.7820614651432761, "grad_norm": 5.973275184631348, "learning_rate": 4.616203647044426e-05, "loss": 1.2408, "step": 25600 }, { "epoch": 0.785116392741492, "grad_norm": 4.390771865844727, "learning_rate": 4.6146738465304126e-05, "loss": 1.1513, "step": 25700 }, { "epoch": 0.7881713203397079, "grad_norm": 1.7596487998962402, "learning_rate": 4.6131440460164e-05, "loss": 1.2074, "step": 25800 }, { "epoch": 0.7912262479379238, "grad_norm": 1.3779590129852295, "learning_rate": 4.611614245502387e-05, "loss": 1.1349, "step": 25900 }, { "epoch": 0.7942811755361397, "grad_norm": 9.768011093139648, "learning_rate": 4.610084444988374e-05, "loss": 1.2668, "step": 26000 }, { "epoch": 0.7973361031343558, "grad_norm": 2.795086622238159, "learning_rate": 
4.608554644474361e-05, "loss": 1.4519, "step": 26100 }, { "epoch": 0.8003910307325717, "grad_norm": 2.03102970123291, "learning_rate": 4.607024843960348e-05, "loss": 1.3429, "step": 26200 }, { "epoch": 0.8034459583307876, "grad_norm": 3.355180501937866, "learning_rate": 4.605495043446335e-05, "loss": 1.346, "step": 26300 }, { "epoch": 0.8065008859290035, "grad_norm": 4.949088096618652, "learning_rate": 4.6039652429323224e-05, "loss": 1.297, "step": 26400 }, { "epoch": 0.8095558135272194, "grad_norm": 3.3454296588897705, "learning_rate": 4.602435442418309e-05, "loss": 1.3045, "step": 26500 }, { "epoch": 0.8126107411254353, "grad_norm": 3.7700753211975098, "learning_rate": 4.600905641904295e-05, "loss": 1.2386, "step": 26600 }, { "epoch": 0.8156656687236512, "grad_norm": 3.9577269554138184, "learning_rate": 4.5993758413902824e-05, "loss": 1.336, "step": 26700 }, { "epoch": 0.8187205963218672, "grad_norm": 5.364854335784912, "learning_rate": 4.5978460408762695e-05, "loss": 1.3863, "step": 26800 }, { "epoch": 0.8217755239200831, "grad_norm": 3.6088366508483887, "learning_rate": 4.5963162403622567e-05, "loss": 1.2049, "step": 26900 }, { "epoch": 0.824830451518299, "grad_norm": 2.9241647720336914, "learning_rate": 4.594786439848244e-05, "loss": 1.3627, "step": 27000 }, { "epoch": 0.8278853791165149, "grad_norm": 4.087007999420166, "learning_rate": 4.593256639334231e-05, "loss": 1.2335, "step": 27100 }, { "epoch": 0.8309403067147308, "grad_norm": 2.52764630317688, "learning_rate": 4.591726838820218e-05, "loss": 1.288, "step": 27200 }, { "epoch": 0.8339952343129468, "grad_norm": 1.945397138595581, "learning_rate": 4.590197038306205e-05, "loss": 1.2157, "step": 27300 }, { "epoch": 0.8370501619111627, "grad_norm": 3.3560779094696045, "learning_rate": 4.5886672377921916e-05, "loss": 1.2333, "step": 27400 }, { "epoch": 0.8401050895093787, "grad_norm": 2.7868645191192627, "learning_rate": 4.587152735283319e-05, "loss": 1.1488, "step": 27500 }, { "epoch": 0.8431600171075946, 
"grad_norm": 2.6062686443328857, "learning_rate": 4.585622934769306e-05, "loss": 1.3793, "step": 27600 }, { "epoch": 0.8462149447058105, "grad_norm": 5.000110626220703, "learning_rate": 4.5841084322604336e-05, "loss": 1.3183, "step": 27700 }, { "epoch": 0.8492698723040264, "grad_norm": 3.7855002880096436, "learning_rate": 4.58257863174642e-05, "loss": 1.3456, "step": 27800 }, { "epoch": 0.8523247999022423, "grad_norm": 1.3873869180679321, "learning_rate": 4.581048831232407e-05, "loss": 1.3148, "step": 27900 }, { "epoch": 0.8553797275004582, "grad_norm": 2.0780625343322754, "learning_rate": 4.579519030718394e-05, "loss": 1.3301, "step": 28000 }, { "epoch": 0.8584346550986741, "grad_norm": 4.090784072875977, "learning_rate": 4.5779892302043814e-05, "loss": 1.2458, "step": 28100 }, { "epoch": 0.8614895826968901, "grad_norm": 3.2605838775634766, "learning_rate": 4.5764594296903685e-05, "loss": 1.2539, "step": 28200 }, { "epoch": 0.864544510295106, "grad_norm": 2.4564528465270996, "learning_rate": 4.5749296291763556e-05, "loss": 1.224, "step": 28300 }, { "epoch": 0.867599437893322, "grad_norm": 3.5012292861938477, "learning_rate": 4.573399828662343e-05, "loss": 1.1227, "step": 28400 }, { "epoch": 0.8706543654915379, "grad_norm": 2.125563621520996, "learning_rate": 4.57187002814833e-05, "loss": 1.1865, "step": 28500 }, { "epoch": 0.8737092930897538, "grad_norm": 7.136216640472412, "learning_rate": 4.570340227634316e-05, "loss": 1.157, "step": 28600 }, { "epoch": 0.8767642206879697, "grad_norm": 7.11937141418457, "learning_rate": 4.5688104271203034e-05, "loss": 1.2754, "step": 28700 }, { "epoch": 0.8798191482861856, "grad_norm": 0.7430057525634766, "learning_rate": 4.5672806266062905e-05, "loss": 1.2952, "step": 28800 }, { "epoch": 0.8828740758844016, "grad_norm": 4.634474277496338, "learning_rate": 4.5657508260922777e-05, "loss": 1.323, "step": 28900 }, { "epoch": 0.8859290034826175, "grad_norm": 3.6207408905029297, "learning_rate": 4.564221025578265e-05, "loss": 1.2202, 
"step": 29000 }, { "epoch": 0.8889839310808334, "grad_norm": 6.108366966247559, "learning_rate": 4.562691225064252e-05, "loss": 1.2362, "step": 29100 }, { "epoch": 0.8920388586790493, "grad_norm": 3.439833879470825, "learning_rate": 4.561161424550239e-05, "loss": 1.2378, "step": 29200 }, { "epoch": 0.8950937862772652, "grad_norm": 4.669163227081299, "learning_rate": 4.559631624036226e-05, "loss": 1.1483, "step": 29300 }, { "epoch": 0.8981487138754811, "grad_norm": 3.7762789726257324, "learning_rate": 4.5581018235222126e-05, "loss": 1.2534, "step": 29400 }, { "epoch": 0.901203641473697, "grad_norm": 5.76541805267334, "learning_rate": 4.5565720230082e-05, "loss": 1.2709, "step": 29500 }, { "epoch": 0.904258569071913, "grad_norm": 1.8639878034591675, "learning_rate": 4.555042222494187e-05, "loss": 1.1896, "step": 29600 }, { "epoch": 0.907313496670129, "grad_norm": 10.215450286865234, "learning_rate": 4.553512421980174e-05, "loss": 1.3192, "step": 29700 }, { "epoch": 0.9103684242683449, "grad_norm": 4.441849231719971, "learning_rate": 4.5519979194713014e-05, "loss": 1.2901, "step": 29800 }, { "epoch": 0.9134233518665608, "grad_norm": 3.3630282878875732, "learning_rate": 4.5504681189572885e-05, "loss": 1.2784, "step": 29900 }, { "epoch": 0.9164782794647767, "grad_norm": 5.191607475280762, "learning_rate": 4.5489383184432756e-05, "loss": 1.1283, "step": 30000 }, { "epoch": 0.9195332070629926, "grad_norm": 3.6215574741363525, "learning_rate": 4.547408517929263e-05, "loss": 1.1619, "step": 30100 }, { "epoch": 0.9225881346612085, "grad_norm": 5.8025946617126465, "learning_rate": 4.54587871741525e-05, "loss": 1.2857, "step": 30200 }, { "epoch": 0.9256430622594245, "grad_norm": 5.739350318908691, "learning_rate": 4.544348916901236e-05, "loss": 1.3402, "step": 30300 }, { "epoch": 0.9286979898576404, "grad_norm": 1.923235297203064, "learning_rate": 4.5428191163872234e-05, "loss": 1.3791, "step": 30400 }, { "epoch": 0.9317529174558563, "grad_norm": 2.0984885692596436, 
"learning_rate": 4.5412893158732105e-05, "loss": 1.3596, "step": 30500 }, { "epoch": 0.9348078450540722, "grad_norm": 11.962567329406738, "learning_rate": 4.5397595153591976e-05, "loss": 1.2984, "step": 30600 }, { "epoch": 0.9378627726522881, "grad_norm": 3.2890284061431885, "learning_rate": 4.538229714845185e-05, "loss": 1.2089, "step": 30700 }, { "epoch": 0.940917700250504, "grad_norm": 3.9245996475219727, "learning_rate": 4.536699914331172e-05, "loss": 1.3328, "step": 30800 }, { "epoch": 0.9439726278487199, "grad_norm": 2.281853675842285, "learning_rate": 4.535170113817159e-05, "loss": 1.2884, "step": 30900 }, { "epoch": 0.947027555446936, "grad_norm": 3.139033079147339, "learning_rate": 4.533640313303146e-05, "loss": 1.1133, "step": 31000 }, { "epoch": 0.9500824830451519, "grad_norm": 1.584032654762268, "learning_rate": 4.5321105127891325e-05, "loss": 1.1666, "step": 31100 }, { "epoch": 0.9531374106433678, "grad_norm": 3.9088025093078613, "learning_rate": 4.530580712275119e-05, "loss": 1.273, "step": 31200 }, { "epoch": 0.9561923382415837, "grad_norm": 4.956596374511719, "learning_rate": 4.529050911761106e-05, "loss": 1.3739, "step": 31300 }, { "epoch": 0.9592472658397996, "grad_norm": 8.017325401306152, "learning_rate": 4.527521111247093e-05, "loss": 1.2341, "step": 31400 }, { "epoch": 0.9623021934380155, "grad_norm": 3.9635438919067383, "learning_rate": 4.52599131073308e-05, "loss": 1.1666, "step": 31500 }, { "epoch": 0.9653571210362314, "grad_norm": 2.0401432514190674, "learning_rate": 4.5244615102190674e-05, "loss": 1.2702, "step": 31600 }, { "epoch": 0.9684120486344474, "grad_norm": 2.1671855449676514, "learning_rate": 4.5229317097050545e-05, "loss": 1.144, "step": 31700 }, { "epoch": 0.9714669762326633, "grad_norm": 5.701295375823975, "learning_rate": 4.5214019091910417e-05, "loss": 1.1877, "step": 31800 }, { "epoch": 0.9745219038308792, "grad_norm": 3.89962100982666, "learning_rate": 4.519872108677029e-05, "loss": 1.2391, "step": 31900 }, { "epoch": 
0.9775768314290951, "grad_norm": 13.442463874816895, "learning_rate": 4.518342308163015e-05, "loss": 1.2846, "step": 32000 }, { "epoch": 0.980631759027311, "grad_norm": 6.973484039306641, "learning_rate": 4.516827805654143e-05, "loss": 1.2133, "step": 32100 }, { "epoch": 0.9836866866255269, "grad_norm": 3.8862478733062744, "learning_rate": 4.51529800514013e-05, "loss": 1.1195, "step": 32200 }, { "epoch": 0.9867416142237428, "grad_norm": 5.151338577270508, "learning_rate": 4.513768204626117e-05, "loss": 1.2547, "step": 32300 }, { "epoch": 0.9897965418219589, "grad_norm": 4.248752593994141, "learning_rate": 4.512238404112104e-05, "loss": 1.2451, "step": 32400 }, { "epoch": 0.9928514694201748, "grad_norm": 4.167698383331299, "learning_rate": 4.510708603598091e-05, "loss": 1.1808, "step": 32500 }, { "epoch": 0.9959063970183907, "grad_norm": 1.6347144842147827, "learning_rate": 4.509178803084078e-05, "loss": 1.3021, "step": 32600 }, { "epoch": 0.9989613246166066, "grad_norm": 3.7852065563201904, "learning_rate": 4.5076490025700654e-05, "loss": 1.1772, "step": 32700 }, { "epoch": 1.0, "eval_accuracy": 0.6393352477546282, "eval_loss": 0.969184935092926, "eval_runtime": 1839.346, "eval_samples_per_second": 17.797, "eval_steps_per_second": 4.449, "step": 32734 }, { "epoch": 1.0020162522148226, "grad_norm": 2.740159749984741, "learning_rate": 4.5061192020560525e-05, "loss": 1.3645, "step": 32800 }, { "epoch": 1.0050711798130385, "grad_norm": 2.5094220638275146, "learning_rate": 4.504589401542039e-05, "loss": 1.1684, "step": 32900 }, { "epoch": 1.0081261074112544, "grad_norm": 7.7797417640686035, "learning_rate": 4.503059601028026e-05, "loss": 1.2266, "step": 33000 }, { "epoch": 1.0111810350094703, "grad_norm": 2.7064406871795654, "learning_rate": 4.501529800514013e-05, "loss": 1.1893, "step": 33100 }, { "epoch": 1.0142359626076862, "grad_norm": 8.84860610961914, "learning_rate": 4.5e-05, "loss": 1.2385, "step": 33200 }, { "epoch": 1.0172908902059021, "grad_norm": 
7.67369270324707, "learning_rate": 4.4984701994859874e-05, "loss": 1.2131, "step": 33300 }, { "epoch": 1.020345817804118, "grad_norm": 4.2192769050598145, "learning_rate": 4.4969403989719745e-05, "loss": 1.2668, "step": 33400 }, { "epoch": 1.023400745402334, "grad_norm": 4.694185733795166, "learning_rate": 4.4954105984579616e-05, "loss": 1.277, "step": 33500 }, { "epoch": 1.0264556730005499, "grad_norm": 4.47376823425293, "learning_rate": 4.493880797943949e-05, "loss": 1.1455, "step": 33600 }, { "epoch": 1.0295106005987658, "grad_norm": 1.780508279800415, "learning_rate": 4.492350997429935e-05, "loss": 1.194, "step": 33700 }, { "epoch": 1.0325655281969817, "grad_norm": 2.220668315887451, "learning_rate": 4.490821196915922e-05, "loss": 1.3191, "step": 33800 }, { "epoch": 1.0356204557951976, "grad_norm": 2.0353493690490723, "learning_rate": 4.4892913964019094e-05, "loss": 1.248, "step": 33900 }, { "epoch": 1.0386753833934135, "grad_norm": 3.851609230041504, "learning_rate": 4.4877615958878965e-05, "loss": 1.2369, "step": 34000 }, { "epoch": 1.0417303109916296, "grad_norm": 2.8817436695098877, "learning_rate": 4.4862317953738836e-05, "loss": 1.2093, "step": 34100 }, { "epoch": 1.0447852385898455, "grad_norm": 3.6299502849578857, "learning_rate": 4.484701994859871e-05, "loss": 1.3564, "step": 34200 }, { "epoch": 1.0478401661880614, "grad_norm": 3.872225284576416, "learning_rate": 4.483172194345858e-05, "loss": 1.2894, "step": 34300 }, { "epoch": 1.0508950937862773, "grad_norm": 1.910213828086853, "learning_rate": 4.481642393831845e-05, "loss": 1.2529, "step": 34400 }, { "epoch": 1.0539500213844932, "grad_norm": 7.973636150360107, "learning_rate": 4.4801125933178314e-05, "loss": 1.1647, "step": 34500 }, { "epoch": 1.0570049489827091, "grad_norm": 9.639257431030273, "learning_rate": 4.4785827928038185e-05, "loss": 1.2896, "step": 34600 }, { "epoch": 1.060059876580925, "grad_norm": 4.413392543792725, "learning_rate": 4.477052992289806e-05, "loss": 1.205, "step": 34700 }, 
{ "epoch": 1.063114804179141, "grad_norm": 2.079163074493408, "learning_rate": 4.4755384897809325e-05, "loss": 1.2079, "step": 34800 }, { "epoch": 1.0661697317773569, "grad_norm": 3.3086769580841064, "learning_rate": 4.4740086892669196e-05, "loss": 1.1257, "step": 34900 }, { "epoch": 1.0692246593755728, "grad_norm": 2.8088862895965576, "learning_rate": 4.472478888752907e-05, "loss": 1.2156, "step": 35000 }, { "epoch": 1.0722795869737887, "grad_norm": 2.2325263023376465, "learning_rate": 4.470949088238894e-05, "loss": 1.2968, "step": 35100 }, { "epoch": 1.0753345145720046, "grad_norm": 1.6548900604248047, "learning_rate": 4.469419287724881e-05, "loss": 1.1761, "step": 35200 }, { "epoch": 1.0783894421702205, "grad_norm": 6.921013355255127, "learning_rate": 4.467889487210868e-05, "loss": 1.1929, "step": 35300 }, { "epoch": 1.0814443697684364, "grad_norm": 3.7694363594055176, "learning_rate": 4.466359686696855e-05, "loss": 1.3127, "step": 35400 }, { "epoch": 1.0844992973666523, "grad_norm": 5.201353073120117, "learning_rate": 4.4648298861828416e-05, "loss": 1.153, "step": 35500 }, { "epoch": 1.0875542249648684, "grad_norm": 2.9227755069732666, "learning_rate": 4.463300085668829e-05, "loss": 1.1579, "step": 35600 }, { "epoch": 1.0906091525630843, "grad_norm": 7.101742267608643, "learning_rate": 4.461770285154816e-05, "loss": 1.2353, "step": 35700 }, { "epoch": 1.0936640801613002, "grad_norm": 8.491111755371094, "learning_rate": 4.460240484640803e-05, "loss": 1.2764, "step": 35800 }, { "epoch": 1.0967190077595161, "grad_norm": 6.6550703048706055, "learning_rate": 4.45871068412679e-05, "loss": 1.1896, "step": 35900 }, { "epoch": 1.099773935357732, "grad_norm": 5.08082914352417, "learning_rate": 4.457180883612777e-05, "loss": 1.3892, "step": 36000 }, { "epoch": 1.102828862955948, "grad_norm": 2.673793077468872, "learning_rate": 4.455651083098764e-05, "loss": 1.2312, "step": 36100 }, { "epoch": 1.1058837905541639, "grad_norm": 6.248423099517822, "learning_rate": 
4.4541212825847514e-05, "loss": 1.2901, "step": 36200 }, { "epoch": 1.1089387181523798, "grad_norm": 4.006623268127441, "learning_rate": 4.452591482070738e-05, "loss": 1.3631, "step": 36300 }, { "epoch": 1.1119936457505957, "grad_norm": 2.636411666870117, "learning_rate": 4.451061681556725e-05, "loss": 1.3197, "step": 36400 }, { "epoch": 1.1150485733488116, "grad_norm": 2.9009954929351807, "learning_rate": 4.449531881042712e-05, "loss": 1.3526, "step": 36500 }, { "epoch": 1.1181035009470275, "grad_norm": 6.70359992980957, "learning_rate": 4.448002080528699e-05, "loss": 1.1885, "step": 36600 }, { "epoch": 1.1211584285452434, "grad_norm": 12.23004150390625, "learning_rate": 4.446472280014686e-05, "loss": 1.1935, "step": 36700 }, { "epoch": 1.1242133561434593, "grad_norm": 2.4072132110595703, "learning_rate": 4.4449424795006734e-05, "loss": 1.1927, "step": 36800 }, { "epoch": 1.1272682837416754, "grad_norm": 5.668684959411621, "learning_rate": 4.4434126789866605e-05, "loss": 1.2155, "step": 36900 }, { "epoch": 1.1303232113398913, "grad_norm": 8.243475914001465, "learning_rate": 4.4418828784726476e-05, "loss": 1.3016, "step": 37000 }, { "epoch": 1.1333781389381072, "grad_norm": 6.644149303436279, "learning_rate": 4.440353077958634e-05, "loss": 1.2682, "step": 37100 }, { "epoch": 1.1364330665363231, "grad_norm": 1.5577857494354248, "learning_rate": 4.438823277444621e-05, "loss": 1.225, "step": 37200 }, { "epoch": 1.139487994134539, "grad_norm": 6.358497619628906, "learning_rate": 4.437293476930608e-05, "loss": 1.2043, "step": 37300 }, { "epoch": 1.142542921732755, "grad_norm": 4.491862773895264, "learning_rate": 4.4357636764165954e-05, "loss": 1.2241, "step": 37400 }, { "epoch": 1.1455978493309709, "grad_norm": 11.254999160766602, "learning_rate": 4.4342338759025826e-05, "loss": 1.3232, "step": 37500 }, { "epoch": 1.1486527769291868, "grad_norm": 6.52307653427124, "learning_rate": 4.43270407538857e-05, "loss": 1.1893, "step": 37600 }, { "epoch": 1.1517077045274027, 
"grad_norm": 1.5769741535186768, "learning_rate": 4.431174274874557e-05, "loss": 1.2663, "step": 37700 }, { "epoch": 1.1547626321256186, "grad_norm": 11.211283683776855, "learning_rate": 4.429644474360544e-05, "loss": 1.1015, "step": 37800 }, { "epoch": 1.1578175597238345, "grad_norm": 4.9694671630859375, "learning_rate": 4.42811467384653e-05, "loss": 1.1474, "step": 37900 }, { "epoch": 1.1608724873220504, "grad_norm": 4.092493057250977, "learning_rate": 4.426600171337658e-05, "loss": 1.4885, "step": 38000 }, { "epoch": 1.1639274149202663, "grad_norm": 7.638351917266846, "learning_rate": 4.425070370823645e-05, "loss": 1.3411, "step": 38100 }, { "epoch": 1.1669823425184824, "grad_norm": 1.5930650234222412, "learning_rate": 4.4235405703096314e-05, "loss": 1.0829, "step": 38200 }, { "epoch": 1.1700372701166981, "grad_norm": 2.2669153213500977, "learning_rate": 4.4220107697956185e-05, "loss": 1.2747, "step": 38300 }, { "epoch": 1.1730921977149142, "grad_norm": 9.886653900146484, "learning_rate": 4.4204809692816056e-05, "loss": 1.1561, "step": 38400 }, { "epoch": 1.1761471253131301, "grad_norm": 1.6329950094223022, "learning_rate": 4.418951168767593e-05, "loss": 1.1889, "step": 38500 }, { "epoch": 1.179202052911346, "grad_norm": 9.446663856506348, "learning_rate": 4.41742136825358e-05, "loss": 1.2176, "step": 38600 }, { "epoch": 1.182256980509562, "grad_norm": 3.5122339725494385, "learning_rate": 4.415891567739567e-05, "loss": 1.3452, "step": 38700 }, { "epoch": 1.1853119081077779, "grad_norm": 4.202631950378418, "learning_rate": 4.414361767225554e-05, "loss": 1.1617, "step": 38800 }, { "epoch": 1.1883668357059938, "grad_norm": 4.116245746612549, "learning_rate": 4.412831966711541e-05, "loss": 1.3976, "step": 38900 }, { "epoch": 1.1914217633042097, "grad_norm": 3.6785387992858887, "learning_rate": 4.4113021661975276e-05, "loss": 1.2608, "step": 39000 }, { "epoch": 1.1944766909024256, "grad_norm": 4.720080375671387, "learning_rate": 4.409772365683515e-05, "loss": 1.2868, 
"step": 39100 }, { "epoch": 1.1975316185006415, "grad_norm": 8.487894058227539, "learning_rate": 4.408242565169502e-05, "loss": 1.2178, "step": 39200 }, { "epoch": 1.2005865460988574, "grad_norm": 12.185461044311523, "learning_rate": 4.406712764655489e-05, "loss": 1.1529, "step": 39300 }, { "epoch": 1.2036414736970733, "grad_norm": 4.981146335601807, "learning_rate": 4.405182964141476e-05, "loss": 1.2508, "step": 39400 }, { "epoch": 1.2066964012952892, "grad_norm": 2.295912027359009, "learning_rate": 4.403653163627463e-05, "loss": 1.1667, "step": 39500 }, { "epoch": 1.2097513288935051, "grad_norm": 4.060471057891846, "learning_rate": 4.40212336311345e-05, "loss": 1.2019, "step": 39600 }, { "epoch": 1.2128062564917212, "grad_norm": 2.7551541328430176, "learning_rate": 4.4005935625994374e-05, "loss": 1.2756, "step": 39700 }, { "epoch": 1.2158611840899372, "grad_norm": 2.792501926422119, "learning_rate": 4.399063762085424e-05, "loss": 1.3358, "step": 39800 }, { "epoch": 1.218916111688153, "grad_norm": 16.728858947753906, "learning_rate": 4.397533961571411e-05, "loss": 1.2537, "step": 39900 }, { "epoch": 1.221971039286369, "grad_norm": 12.136491775512695, "learning_rate": 4.3960194590625384e-05, "loss": 1.3623, "step": 40000 }, { "epoch": 1.2250259668845849, "grad_norm": 3.166687488555908, "learning_rate": 4.3944896585485256e-05, "loss": 1.2557, "step": 40100 }, { "epoch": 1.2280808944828008, "grad_norm": 3.6806106567382812, "learning_rate": 4.392959858034513e-05, "loss": 1.3083, "step": 40200 }, { "epoch": 1.2311358220810167, "grad_norm": 2.7585878372192383, "learning_rate": 4.3914300575205e-05, "loss": 1.2948, "step": 40300 }, { "epoch": 1.2341907496792326, "grad_norm": 8.795299530029297, "learning_rate": 4.389900257006487e-05, "loss": 1.2998, "step": 40400 }, { "epoch": 1.2372456772774485, "grad_norm": 5.048892498016357, "learning_rate": 4.388370456492474e-05, "loss": 1.2627, "step": 40500 }, { "epoch": 1.2403006048756644, "grad_norm": 1.6317594051361084, 
"learning_rate": 4.386840655978461e-05, "loss": 1.344, "step": 40600 }, { "epoch": 1.2433555324738803, "grad_norm": 4.000895977020264, "learning_rate": 4.3853108554644476e-05, "loss": 1.1493, "step": 40700 }, { "epoch": 1.2464104600720962, "grad_norm": 4.536451816558838, "learning_rate": 4.383781054950435e-05, "loss": 1.185, "step": 40800 }, { "epoch": 1.2494653876703121, "grad_norm": 1.5634673833847046, "learning_rate": 4.382251254436422e-05, "loss": 1.1928, "step": 40900 }, { "epoch": 1.2525203152685283, "grad_norm": 3.4632925987243652, "learning_rate": 4.380721453922409e-05, "loss": 1.1444, "step": 41000 }, { "epoch": 1.255575242866744, "grad_norm": 1.5671478509902954, "learning_rate": 4.379191653408396e-05, "loss": 1.1633, "step": 41100 }, { "epoch": 1.25863017046496, "grad_norm": 4.259028434753418, "learning_rate": 4.377661852894383e-05, "loss": 1.3727, "step": 41200 }, { "epoch": 1.261685098063176, "grad_norm": 4.04819393157959, "learning_rate": 4.37613205238037e-05, "loss": 1.2986, "step": 41300 }, { "epoch": 1.2647400256613919, "grad_norm": 5.768520355224609, "learning_rate": 4.3746022518663574e-05, "loss": 1.2835, "step": 41400 }, { "epoch": 1.2677949532596078, "grad_norm": 2.447608470916748, "learning_rate": 4.373072451352344e-05, "loss": 1.1759, "step": 41500 }, { "epoch": 1.2708498808578237, "grad_norm": 9.211963653564453, "learning_rate": 4.371542650838331e-05, "loss": 1.3477, "step": 41600 }, { "epoch": 1.2739048084560396, "grad_norm": 4.363831043243408, "learning_rate": 4.370012850324318e-05, "loss": 1.3011, "step": 41700 }, { "epoch": 1.2769597360542555, "grad_norm": 5.887541770935059, "learning_rate": 4.368483049810305e-05, "loss": 1.2085, "step": 41800 }, { "epoch": 1.2800146636524714, "grad_norm": 4.435642719268799, "learning_rate": 4.366953249296292e-05, "loss": 1.2673, "step": 41900 }, { "epoch": 1.2830695912506873, "grad_norm": 5.838521480560303, "learning_rate": 4.3654234487822794e-05, "loss": 1.366, "step": 42000 }, { "epoch": 
1.2861245188489032, "grad_norm": 3.4145562648773193, "learning_rate": 4.3638936482682665e-05, "loss": 1.1811, "step": 42100 }, { "epoch": 1.2891794464471191, "grad_norm": 6.8241868019104, "learning_rate": 4.362379145759393e-05, "loss": 1.3298, "step": 42200 }, { "epoch": 1.2922343740453353, "grad_norm": 11.387208938598633, "learning_rate": 4.3608493452453804e-05, "loss": 1.2655, "step": 42300 }, { "epoch": 1.295289301643551, "grad_norm": 2.6375372409820557, "learning_rate": 4.3593195447313676e-05, "loss": 1.2064, "step": 42400 }, { "epoch": 1.298344229241767, "grad_norm": 5.372217178344727, "learning_rate": 4.357789744217354e-05, "loss": 1.3267, "step": 42500 }, { "epoch": 1.301399156839983, "grad_norm": 4.916769027709961, "learning_rate": 4.356259943703341e-05, "loss": 1.1147, "step": 42600 }, { "epoch": 1.3044540844381989, "grad_norm": 4.316017150878906, "learning_rate": 4.354730143189328e-05, "loss": 1.3126, "step": 42700 }, { "epoch": 1.3075090120364148, "grad_norm": 2.563248634338379, "learning_rate": 4.353200342675315e-05, "loss": 1.2966, "step": 42800 }, { "epoch": 1.3105639396346307, "grad_norm": 3.107074022293091, "learning_rate": 4.3516705421613025e-05, "loss": 1.3133, "step": 42900 }, { "epoch": 1.3136188672328466, "grad_norm": 6.040319919586182, "learning_rate": 4.3501407416472896e-05, "loss": 1.2535, "step": 43000 }, { "epoch": 1.3166737948310625, "grad_norm": 3.597527503967285, "learning_rate": 4.348610941133277e-05, "loss": 1.2451, "step": 43100 }, { "epoch": 1.3197287224292784, "grad_norm": 4.143153190612793, "learning_rate": 4.347081140619264e-05, "loss": 1.2788, "step": 43200 }, { "epoch": 1.3227836500274943, "grad_norm": 3.319704055786133, "learning_rate": 4.34555134010525e-05, "loss": 1.1359, "step": 43300 }, { "epoch": 1.3258385776257102, "grad_norm": 5.486532211303711, "learning_rate": 4.3440215395912374e-05, "loss": 1.1321, "step": 43400 }, { "epoch": 1.3288935052239261, "grad_norm": 6.724696636199951, "learning_rate": 4.3424917390772245e-05, 
"loss": 1.0973, "step": 43500 }, { "epoch": 1.331948432822142, "grad_norm": 5.1426591873168945, "learning_rate": 4.3409619385632116e-05, "loss": 1.2437, "step": 43600 }, { "epoch": 1.335003360420358, "grad_norm": 8.841686248779297, "learning_rate": 4.339432138049199e-05, "loss": 1.3292, "step": 43700 }, { "epoch": 1.338058288018574, "grad_norm": 3.7120001316070557, "learning_rate": 4.337902337535186e-05, "loss": 1.1654, "step": 43800 }, { "epoch": 1.3411132156167898, "grad_norm": 1.776436686515808, "learning_rate": 4.336372537021173e-05, "loss": 1.2399, "step": 43900 }, { "epoch": 1.3441681432150059, "grad_norm": 3.7421560287475586, "learning_rate": 4.33484273650716e-05, "loss": 1.3145, "step": 44000 }, { "epoch": 1.3472230708132218, "grad_norm": 14.425456047058105, "learning_rate": 4.3333129359931465e-05, "loss": 1.2523, "step": 44100 }, { "epoch": 1.3502779984114377, "grad_norm": 5.034737586975098, "learning_rate": 4.331798433484274e-05, "loss": 1.2599, "step": 44200 }, { "epoch": 1.3533329260096536, "grad_norm": 9.89389705657959, "learning_rate": 4.330268632970261e-05, "loss": 1.2971, "step": 44300 }, { "epoch": 1.3563878536078695, "grad_norm": 2.4960923194885254, "learning_rate": 4.3287388324562475e-05, "loss": 1.2746, "step": 44400 }, { "epoch": 1.3594427812060854, "grad_norm": 12.708929061889648, "learning_rate": 4.3272090319422346e-05, "loss": 1.1215, "step": 44500 }, { "epoch": 1.3624977088043013, "grad_norm": 5.918141841888428, "learning_rate": 4.325679231428222e-05, "loss": 1.155, "step": 44600 }, { "epoch": 1.3655526364025172, "grad_norm": 1.8433784246444702, "learning_rate": 4.324149430914209e-05, "loss": 1.4129, "step": 44700 }, { "epoch": 1.3686075640007331, "grad_norm": 9.525556564331055, "learning_rate": 4.322619630400196e-05, "loss": 1.2244, "step": 44800 }, { "epoch": 1.371662491598949, "grad_norm": 3.9486777782440186, "learning_rate": 4.321089829886183e-05, "loss": 1.2334, "step": 44900 }, { "epoch": 1.374717419197165, "grad_norm": 
2.2689762115478516, "learning_rate": 4.31956002937217e-05, "loss": 1.0697, "step": 45000 }, { "epoch": 1.377772346795381, "grad_norm": 7.874233245849609, "learning_rate": 4.318030228858157e-05, "loss": 1.2112, "step": 45100 }, { "epoch": 1.3808272743935968, "grad_norm": 1.4350064992904663, "learning_rate": 4.316500428344144e-05, "loss": 1.0985, "step": 45200 }, { "epoch": 1.3838822019918129, "grad_norm": 3.0830788612365723, "learning_rate": 4.314970627830131e-05, "loss": 1.1946, "step": 45300 }, { "epoch": 1.3869371295900288, "grad_norm": 2.0088775157928467, "learning_rate": 4.313440827316118e-05, "loss": 1.3359, "step": 45400 }, { "epoch": 1.3899920571882447, "grad_norm": 3.8786025047302246, "learning_rate": 4.311911026802105e-05, "loss": 1.2748, "step": 45500 }, { "epoch": 1.3930469847864606, "grad_norm": 6.242239952087402, "learning_rate": 4.310381226288092e-05, "loss": 1.4031, "step": 45600 }, { "epoch": 1.3961019123846765, "grad_norm": 2.5156424045562744, "learning_rate": 4.3088514257740793e-05, "loss": 1.289, "step": 45700 }, { "epoch": 1.3991568399828924, "grad_norm": 4.84622049331665, "learning_rate": 4.3073216252600665e-05, "loss": 1.1921, "step": 45800 }, { "epoch": 1.4022117675811083, "grad_norm": 4.694828033447266, "learning_rate": 4.3057918247460536e-05, "loss": 1.1959, "step": 45900 }, { "epoch": 1.4052666951793242, "grad_norm": 3.817330837249756, "learning_rate": 4.30426202423204e-05, "loss": 1.3833, "step": 46000 }, { "epoch": 1.4083216227775401, "grad_norm": 4.423964500427246, "learning_rate": 4.302732223718027e-05, "loss": 1.2469, "step": 46100 }, { "epoch": 1.411376550375756, "grad_norm": 6.940568447113037, "learning_rate": 4.301202423204014e-05, "loss": 1.0547, "step": 46200 }, { "epoch": 1.414431477973972, "grad_norm": 12.37020206451416, "learning_rate": 4.2996726226900014e-05, "loss": 1.2522, "step": 46300 }, { "epoch": 1.4174864055721879, "grad_norm": 2.3224236965179443, "learning_rate": 4.298158120181129e-05, "loss": 1.1729, "step": 46400 }, 
{ "epoch": 1.4205413331704038, "grad_norm": 4.079832553863525, "learning_rate": 4.296628319667116e-05, "loss": 1.1279, "step": 46500 }, { "epoch": 1.42359626076862, "grad_norm": 1.1195186376571655, "learning_rate": 4.295098519153103e-05, "loss": 1.4004, "step": 46600 }, { "epoch": 1.4266511883668356, "grad_norm": 4.244978904724121, "learning_rate": 4.29356871863909e-05, "loss": 1.1016, "step": 46700 }, { "epoch": 1.4297061159650517, "grad_norm": 7.724494934082031, "learning_rate": 4.292038918125077e-05, "loss": 1.2749, "step": 46800 }, { "epoch": 1.4327610435632676, "grad_norm": 4.730603218078613, "learning_rate": 4.290509117611064e-05, "loss": 1.1651, "step": 46900 }, { "epoch": 1.4358159711614835, "grad_norm": 2.9875614643096924, "learning_rate": 4.28897931709705e-05, "loss": 1.2168, "step": 47000 }, { "epoch": 1.4388708987596994, "grad_norm": 3.3418843746185303, "learning_rate": 4.287449516583037e-05, "loss": 1.1436, "step": 47100 }, { "epoch": 1.4419258263579153, "grad_norm": 6.813102722167969, "learning_rate": 4.2859197160690244e-05, "loss": 1.1113, "step": 47200 }, { "epoch": 1.4449807539561312, "grad_norm": 6.928761959075928, "learning_rate": 4.2843899155550115e-05, "loss": 1.2056, "step": 47300 }, { "epoch": 1.4480356815543471, "grad_norm": 3.8548502922058105, "learning_rate": 4.2828601150409986e-05, "loss": 1.2025, "step": 47400 }, { "epoch": 1.451090609152563, "grad_norm": 1.8238892555236816, "learning_rate": 4.281330314526986e-05, "loss": 1.3061, "step": 47500 }, { "epoch": 1.454145536750779, "grad_norm": 4.400331974029541, "learning_rate": 4.279800514012973e-05, "loss": 1.2991, "step": 47600 }, { "epoch": 1.4572004643489949, "grad_norm": 6.429203510284424, "learning_rate": 4.27827071349896e-05, "loss": 1.3095, "step": 47700 }, { "epoch": 1.4602553919472108, "grad_norm": 4.6192803382873535, "learning_rate": 4.2767409129849464e-05, "loss": 1.3917, "step": 47800 }, { "epoch": 1.463310319545427, "grad_norm": 1.5252143144607544, "learning_rate": 
4.2752111124709335e-05, "loss": 1.1591, "step": 47900 }, { "epoch": 1.4663652471436426, "grad_norm": 6.269423484802246, "learning_rate": 4.2736813119569206e-05, "loss": 1.1887, "step": 48000 }, { "epoch": 1.4694201747418587, "grad_norm": 13.407902717590332, "learning_rate": 4.272151511442908e-05, "loss": 1.178, "step": 48100 }, { "epoch": 1.4724751023400746, "grad_norm": 6.713379859924316, "learning_rate": 4.270621710928895e-05, "loss": 1.1894, "step": 48200 }, { "epoch": 1.4755300299382905, "grad_norm": 9.634526252746582, "learning_rate": 4.269091910414882e-05, "loss": 1.3624, "step": 48300 }, { "epoch": 1.4785849575365064, "grad_norm": 1.1485018730163574, "learning_rate": 4.267562109900869e-05, "loss": 1.0988, "step": 48400 }, { "epoch": 1.4816398851347223, "grad_norm": 6.0236358642578125, "learning_rate": 4.266032309386856e-05, "loss": 1.16, "step": 48500 }, { "epoch": 1.4846948127329382, "grad_norm": 2.204787254333496, "learning_rate": 4.264517806877984e-05, "loss": 1.22, "step": 48600 }, { "epoch": 1.4877497403311541, "grad_norm": 8.660630226135254, "learning_rate": 4.26298800636397e-05, "loss": 1.3341, "step": 48700 }, { "epoch": 1.49080466792937, "grad_norm": 18.680225372314453, "learning_rate": 4.261458205849957e-05, "loss": 1.1561, "step": 48800 }, { "epoch": 1.493859595527586, "grad_norm": 10.239827156066895, "learning_rate": 4.2599284053359444e-05, "loss": 1.1995, "step": 48900 }, { "epoch": 1.4969145231258019, "grad_norm": 12.529239654541016, "learning_rate": 4.2583986048219315e-05, "loss": 1.3687, "step": 49000 }, { "epoch": 1.4999694507240178, "grad_norm": 5.2959465980529785, "learning_rate": 4.2568688043079186e-05, "loss": 1.2584, "step": 49100 }, { "epoch": 1.503024378322234, "grad_norm": 4.77400541305542, "learning_rate": 4.255339003793906e-05, "loss": 1.2276, "step": 49200 }, { "epoch": 1.5060793059204496, "grad_norm": 2.8349411487579346, "learning_rate": 4.253809203279893e-05, "loss": 1.1254, "step": 49300 }, { "epoch": 1.5091342335186657, 
"grad_norm": 4.453884601593018, "learning_rate": 4.25227940276588e-05, "loss": 1.2116, "step": 49400 }, { "epoch": 1.5121891611168814, "grad_norm": 5.747006416320801, "learning_rate": 4.2507496022518664e-05, "loss": 1.0758, "step": 49500 }, { "epoch": 1.5152440887150975, "grad_norm": 3.7872657775878906, "learning_rate": 4.2492198017378535e-05, "loss": 1.2889, "step": 49600 }, { "epoch": 1.5182990163133132, "grad_norm": 5.094516277313232, "learning_rate": 4.2476900012238406e-05, "loss": 1.2575, "step": 49700 }, { "epoch": 1.5213539439115293, "grad_norm": 8.902609825134277, "learning_rate": 4.246160200709828e-05, "loss": 1.1852, "step": 49800 }, { "epoch": 1.5244088715097452, "grad_norm": 4.5981292724609375, "learning_rate": 4.244630400195815e-05, "loss": 1.105, "step": 49900 }, { "epoch": 1.5274637991079612, "grad_norm": 8.625105857849121, "learning_rate": 4.243100599681802e-05, "loss": 1.171, "step": 50000 }, { "epoch": 1.530518726706177, "grad_norm": 4.452911853790283, "learning_rate": 4.241570799167789e-05, "loss": 1.205, "step": 50100 }, { "epoch": 1.533573654304393, "grad_norm": 6.153209686279297, "learning_rate": 4.240040998653776e-05, "loss": 1.1071, "step": 50200 }, { "epoch": 1.5366285819026089, "grad_norm": 3.6518421173095703, "learning_rate": 4.2385111981397626e-05, "loss": 1.4151, "step": 50300 }, { "epoch": 1.5396835095008248, "grad_norm": 3.2835469245910645, "learning_rate": 4.23698139762575e-05, "loss": 1.2222, "step": 50400 }, { "epoch": 1.542738437099041, "grad_norm": 4.837490558624268, "learning_rate": 4.235451597111737e-05, "loss": 1.2641, "step": 50500 }, { "epoch": 1.5457933646972566, "grad_norm": 3.214068651199341, "learning_rate": 4.233937094602864e-05, "loss": 1.3104, "step": 50600 }, { "epoch": 1.5488482922954727, "grad_norm": 3.558753728866577, "learning_rate": 4.232407294088851e-05, "loss": 1.3417, "step": 50700 }, { "epoch": 1.5519032198936884, "grad_norm": 3.0552566051483154, "learning_rate": 4.230877493574838e-05, "loss": 1.2345, 
"step": 50800 }, { "epoch": 1.5549581474919045, "grad_norm": 5.351703643798828, "learning_rate": 4.229347693060825e-05, "loss": 1.0696, "step": 50900 }, { "epoch": 1.5580130750901202, "grad_norm": 3.6090290546417236, "learning_rate": 4.227817892546812e-05, "loss": 1.0948, "step": 51000 }, { "epoch": 1.5610680026883363, "grad_norm": 3.92417049407959, "learning_rate": 4.226288092032799e-05, "loss": 1.1325, "step": 51100 }, { "epoch": 1.5641229302865522, "grad_norm": 5.536214351654053, "learning_rate": 4.2247582915187864e-05, "loss": 1.1196, "step": 51200 }, { "epoch": 1.5671778578847682, "grad_norm": 1.7182681560516357, "learning_rate": 4.2232284910047735e-05, "loss": 1.3105, "step": 51300 }, { "epoch": 1.570232785482984, "grad_norm": 2.2255499362945557, "learning_rate": 4.22169869049076e-05, "loss": 1.0988, "step": 51400 }, { "epoch": 1.5732877130812, "grad_norm": 2.182570695877075, "learning_rate": 4.220168889976747e-05, "loss": 1.0547, "step": 51500 }, { "epoch": 1.5763426406794159, "grad_norm": 4.099241733551025, "learning_rate": 4.218639089462734e-05, "loss": 1.2795, "step": 51600 }, { "epoch": 1.5793975682776318, "grad_norm": 2.609814167022705, "learning_rate": 4.217109288948721e-05, "loss": 1.132, "step": 51700 }, { "epoch": 1.582452495875848, "grad_norm": 4.413924217224121, "learning_rate": 4.2155794884347084e-05, "loss": 1.1425, "step": 51800 }, { "epoch": 1.5855074234740636, "grad_norm": 1.853229284286499, "learning_rate": 4.2140496879206955e-05, "loss": 1.2918, "step": 51900 }, { "epoch": 1.5885623510722797, "grad_norm": 1.5005695819854736, "learning_rate": 4.2125198874066826e-05, "loss": 1.1795, "step": 52000 }, { "epoch": 1.5916172786704954, "grad_norm": 2.4563257694244385, "learning_rate": 4.21099008689267e-05, "loss": 1.2424, "step": 52100 }, { "epoch": 1.5946722062687115, "grad_norm": 1.5169684886932373, "learning_rate": 4.209460286378656e-05, "loss": 1.2374, "step": 52200 }, { "epoch": 1.5977271338669272, "grad_norm": 3.0069658756256104, 
"learning_rate": 4.207930485864643e-05, "loss": 1.2134, "step": 52300 }, { "epoch": 1.6007820614651433, "grad_norm": 1.1836366653442383, "learning_rate": 4.2064006853506304e-05, "loss": 1.0342, "step": 52400 }, { "epoch": 1.6038369890633593, "grad_norm": 5.595178604125977, "learning_rate": 4.2048708848366175e-05, "loss": 1.1342, "step": 52500 }, { "epoch": 1.6068919166615752, "grad_norm": 5.822509288787842, "learning_rate": 4.203356382327744e-05, "loss": 1.5188, "step": 52600 }, { "epoch": 1.609946844259791, "grad_norm": 3.0418248176574707, "learning_rate": 4.2018265818137314e-05, "loss": 1.0407, "step": 52700 }, { "epoch": 1.613001771858007, "grad_norm": 1.6683357954025269, "learning_rate": 4.2002967812997185e-05, "loss": 1.1343, "step": 52800 }, { "epoch": 1.6160566994562229, "grad_norm": 5.703723907470703, "learning_rate": 4.1987669807857057e-05, "loss": 1.1517, "step": 52900 }, { "epoch": 1.6191116270544388, "grad_norm": 3.9760212898254395, "learning_rate": 4.197237180271693e-05, "loss": 1.2402, "step": 53000 }, { "epoch": 1.6221665546526547, "grad_norm": 5.224232196807861, "learning_rate": 4.19570737975768e-05, "loss": 1.089, "step": 53100 }, { "epoch": 1.6252214822508706, "grad_norm": 8.600234985351562, "learning_rate": 4.194177579243666e-05, "loss": 1.2579, "step": 53200 }, { "epoch": 1.6282764098490867, "grad_norm": 2.693863868713379, "learning_rate": 4.1926477787296534e-05, "loss": 1.2177, "step": 53300 }, { "epoch": 1.6313313374473024, "grad_norm": 2.958672046661377, "learning_rate": 4.1911179782156406e-05, "loss": 1.1448, "step": 53400 }, { "epoch": 1.6343862650455185, "grad_norm": 4.920342445373535, "learning_rate": 4.189588177701628e-05, "loss": 1.2276, "step": 53500 }, { "epoch": 1.6374411926437342, "grad_norm": 3.0722827911376953, "learning_rate": 4.188058377187615e-05, "loss": 1.1841, "step": 53600 }, { "epoch": 1.6404961202419504, "grad_norm": 4.592638969421387, "learning_rate": 4.186528576673602e-05, "loss": 1.3231, "step": 53700 }, { "epoch": 
1.643551047840166, "grad_norm": 11.629206657409668, "learning_rate": 4.184998776159589e-05, "loss": 1.2177, "step": 53800 }, { "epoch": 1.6466059754383822, "grad_norm": 7.090597152709961, "learning_rate": 4.183468975645576e-05, "loss": 1.17, "step": 53900 }, { "epoch": 1.649660903036598, "grad_norm": 2.535602331161499, "learning_rate": 4.1819391751315626e-05, "loss": 1.1368, "step": 54000 }, { "epoch": 1.652715830634814, "grad_norm": 3.7535698413848877, "learning_rate": 4.18040937461755e-05, "loss": 1.241, "step": 54100 }, { "epoch": 1.6557707582330299, "grad_norm": 2.1686646938323975, "learning_rate": 4.178879574103537e-05, "loss": 1.2229, "step": 54200 }, { "epoch": 1.6588256858312458, "grad_norm": 5.6875433921813965, "learning_rate": 4.177349773589524e-05, "loss": 1.0793, "step": 54300 }, { "epoch": 1.6618806134294617, "grad_norm": 8.435276985168457, "learning_rate": 4.175819973075511e-05, "loss": 1.317, "step": 54400 }, { "epoch": 1.6649355410276776, "grad_norm": 4.2010369300842285, "learning_rate": 4.174290172561498e-05, "loss": 1.1259, "step": 54500 }, { "epoch": 1.6679904686258937, "grad_norm": 3.4158153533935547, "learning_rate": 4.172760372047485e-05, "loss": 1.2646, "step": 54600 }, { "epoch": 1.6710453962241094, "grad_norm": 12.006670951843262, "learning_rate": 4.1712305715334724e-05, "loss": 1.1726, "step": 54700 }, { "epoch": 1.6741003238223255, "grad_norm": 3.64186954498291, "learning_rate": 4.1697160690246e-05, "loss": 1.513, "step": 54800 }, { "epoch": 1.6771552514205412, "grad_norm": 6.152316093444824, "learning_rate": 4.168186268510586e-05, "loss": 1.1317, "step": 54900 }, { "epoch": 1.6802101790187574, "grad_norm": 5.903543472290039, "learning_rate": 4.1666564679965734e-05, "loss": 1.2468, "step": 55000 }, { "epoch": 1.683265106616973, "grad_norm": 8.211106300354004, "learning_rate": 4.1651266674825605e-05, "loss": 1.261, "step": 55100 }, { "epoch": 1.6863200342151892, "grad_norm": 2.6759305000305176, "learning_rate": 4.1635968669685476e-05, 
"loss": 1.2852, "step": 55200 }, { "epoch": 1.689374961813405, "grad_norm": 11.397104263305664, "learning_rate": 4.162067066454535e-05, "loss": 1.0922, "step": 55300 }, { "epoch": 1.692429889411621, "grad_norm": 7.850461959838867, "learning_rate": 4.160537265940522e-05, "loss": 1.1855, "step": 55400 }, { "epoch": 1.6954848170098369, "grad_norm": 9.271197319030762, "learning_rate": 4.159007465426509e-05, "loss": 1.1907, "step": 55500 }, { "epoch": 1.6985397446080528, "grad_norm": 2.4433703422546387, "learning_rate": 4.157477664912496e-05, "loss": 1.1701, "step": 55600 }, { "epoch": 1.7015946722062687, "grad_norm": 1.9493122100830078, "learning_rate": 4.1559478643984825e-05, "loss": 1.2341, "step": 55700 }, { "epoch": 1.7046495998044846, "grad_norm": 1.9953112602233887, "learning_rate": 4.1544180638844697e-05, "loss": 1.2181, "step": 55800 }, { "epoch": 1.7077045274027005, "grad_norm": 4.831192493438721, "learning_rate": 4.152888263370457e-05, "loss": 1.0811, "step": 55900 }, { "epoch": 1.7107594550009164, "grad_norm": 4.006892204284668, "learning_rate": 4.151358462856444e-05, "loss": 1.1862, "step": 56000 }, { "epoch": 1.7138143825991325, "grad_norm": 3.466545343399048, "learning_rate": 4.149828662342431e-05, "loss": 1.1184, "step": 56100 }, { "epoch": 1.7168693101973482, "grad_norm": 2.2860095500946045, "learning_rate": 4.148298861828418e-05, "loss": 1.1433, "step": 56200 }, { "epoch": 1.7199242377955644, "grad_norm": 2.8950862884521484, "learning_rate": 4.146769061314405e-05, "loss": 1.1777, "step": 56300 }, { "epoch": 1.72297916539378, "grad_norm": 4.594806671142578, "learning_rate": 4.1452392608003923e-05, "loss": 1.2518, "step": 56400 }, { "epoch": 1.7260340929919962, "grad_norm": 9.209367752075195, "learning_rate": 4.143709460286379e-05, "loss": 1.0943, "step": 56500 }, { "epoch": 1.7290890205902119, "grad_norm": 7.0374908447265625, "learning_rate": 4.142179659772366e-05, "loss": 1.0603, "step": 56600 }, { "epoch": 1.732143948188428, "grad_norm": 
5.589990615844727, "learning_rate": 4.140649859258353e-05, "loss": 1.1151, "step": 56700 }, { "epoch": 1.735198875786644, "grad_norm": 2.3889482021331787, "learning_rate": 4.13912005874434e-05, "loss": 1.2266, "step": 56800 }, { "epoch": 1.7382538033848598, "grad_norm": 8.789185523986816, "learning_rate": 4.137605556235467e-05, "loss": 1.1758, "step": 56900 }, { "epoch": 1.7413087309830757, "grad_norm": 2.146101474761963, "learning_rate": 4.136075755721454e-05, "loss": 1.0922, "step": 57000 }, { "epoch": 1.7443636585812916, "grad_norm": 5.941165924072266, "learning_rate": 4.134545955207441e-05, "loss": 1.3349, "step": 57100 }, { "epoch": 1.7474185861795075, "grad_norm": 1.3134785890579224, "learning_rate": 4.133016154693428e-05, "loss": 1.1338, "step": 57200 }, { "epoch": 1.7504735137777234, "grad_norm": 3.1560745239257812, "learning_rate": 4.1314863541794154e-05, "loss": 1.2561, "step": 57300 }, { "epoch": 1.7535284413759396, "grad_norm": 1.139880895614624, "learning_rate": 4.1299565536654025e-05, "loss": 1.0852, "step": 57400 }, { "epoch": 1.7565833689741552, "grad_norm": 1.4170360565185547, "learning_rate": 4.128426753151389e-05, "loss": 1.342, "step": 57500 }, { "epoch": 1.7596382965723714, "grad_norm": 7.173251628875732, "learning_rate": 4.126896952637376e-05, "loss": 1.1708, "step": 57600 }, { "epoch": 1.762693224170587, "grad_norm": 3.3711791038513184, "learning_rate": 4.125367152123363e-05, "loss": 1.3737, "step": 57700 }, { "epoch": 1.7657481517688032, "grad_norm": 3.1640279293060303, "learning_rate": 4.12383735160935e-05, "loss": 1.2542, "step": 57800 }, { "epoch": 1.7688030793670189, "grad_norm": 5.858263969421387, "learning_rate": 4.1223075510953374e-05, "loss": 1.048, "step": 57900 }, { "epoch": 1.771858006965235, "grad_norm": 4.2827911376953125, "learning_rate": 4.1207777505813245e-05, "loss": 1.0881, "step": 58000 }, { "epoch": 1.774912934563451, "grad_norm": 1.3887640237808228, "learning_rate": 4.1192479500673116e-05, "loss": 1.2263, "step": 58100 
}, { "epoch": 1.7779678621616668, "grad_norm": 3.8297247886657715, "learning_rate": 4.117718149553299e-05, "loss": 1.1554, "step": 58200 }, { "epoch": 1.7810227897598827, "grad_norm": 8.833450317382812, "learning_rate": 4.116188349039285e-05, "loss": 1.1723, "step": 58300 }, { "epoch": 1.7840777173580986, "grad_norm": 6.00846529006958, "learning_rate": 4.114658548525272e-05, "loss": 1.121, "step": 58400 }, { "epoch": 1.7871326449563145, "grad_norm": 7.4083733558654785, "learning_rate": 4.1131287480112594e-05, "loss": 1.231, "step": 58500 }, { "epoch": 1.7901875725545304, "grad_norm": 9.703535079956055, "learning_rate": 4.1115989474972465e-05, "loss": 1.3855, "step": 58600 }, { "epoch": 1.7932425001527463, "grad_norm": 2.700291395187378, "learning_rate": 4.1100691469832337e-05, "loss": 1.213, "step": 58700 }, { "epoch": 1.7962974277509622, "grad_norm": 9.795341491699219, "learning_rate": 4.108539346469221e-05, "loss": 1.3951, "step": 58800 }, { "epoch": 1.7993523553491784, "grad_norm": 4.961639881134033, "learning_rate": 4.107009545955208e-05, "loss": 1.0949, "step": 58900 }, { "epoch": 1.802407282947394, "grad_norm": 14.932546615600586, "learning_rate": 4.105495043446335e-05, "loss": 1.225, "step": 59000 }, { "epoch": 1.8054622105456102, "grad_norm": 0.001140201580710709, "learning_rate": 4.103965242932322e-05, "loss": 1.2837, "step": 59100 }, { "epoch": 1.8085171381438259, "grad_norm": 3.3460593223571777, "learning_rate": 4.102435442418309e-05, "loss": 1.1232, "step": 59200 }, { "epoch": 1.811572065742042, "grad_norm": 9.262133598327637, "learning_rate": 4.100905641904296e-05, "loss": 1.1341, "step": 59300 }, { "epoch": 1.8146269933402577, "grad_norm": 3.290485382080078, "learning_rate": 4.0993758413902825e-05, "loss": 1.1736, "step": 59400 }, { "epoch": 1.8176819209384738, "grad_norm": 8.72140121459961, "learning_rate": 4.0978460408762696e-05, "loss": 1.305, "step": 59500 }, { "epoch": 1.8207368485366897, "grad_norm": 2.2909820079803467, "learning_rate": 
4.096316240362257e-05, "loss": 1.1022, "step": 59600 }, { "epoch": 1.8237917761349056, "grad_norm": 3.055588960647583, "learning_rate": 4.094786439848244e-05, "loss": 1.148, "step": 59700 }, { "epoch": 1.8268467037331215, "grad_norm": 3.7513368129730225, "learning_rate": 4.093256639334231e-05, "loss": 1.4451, "step": 59800 }, { "epoch": 1.8299016313313374, "grad_norm": 4.179325580596924, "learning_rate": 4.091726838820218e-05, "loss": 1.3197, "step": 59900 }, { "epoch": 1.8329565589295533, "grad_norm": 5.003783702850342, "learning_rate": 4.090197038306205e-05, "loss": 1.1448, "step": 60000 }, { "epoch": 1.8360114865277692, "grad_norm": 3.986809492111206, "learning_rate": 4.088667237792192e-05, "loss": 1.2398, "step": 60100 }, { "epoch": 1.8390664141259854, "grad_norm": 5.309907913208008, "learning_rate": 4.087137437278179e-05, "loss": 1.3249, "step": 60200 }, { "epoch": 1.842121341724201, "grad_norm": 1.3858393430709839, "learning_rate": 4.085607636764166e-05, "loss": 1.2089, "step": 60300 }, { "epoch": 1.8451762693224172, "grad_norm": 3.5477840900421143, "learning_rate": 4.084077836250153e-05, "loss": 1.1067, "step": 60400 }, { "epoch": 1.8482311969206329, "grad_norm": 3.756406545639038, "learning_rate": 4.08254803573614e-05, "loss": 1.3526, "step": 60500 }, { "epoch": 1.851286124518849, "grad_norm": 7.084662437438965, "learning_rate": 4.081018235222127e-05, "loss": 1.2392, "step": 60600 }, { "epoch": 1.8543410521170647, "grad_norm": 5.864764213562012, "learning_rate": 4.079488434708114e-05, "loss": 1.4818, "step": 60700 }, { "epoch": 1.8573959797152808, "grad_norm": 1.9365818500518799, "learning_rate": 4.0779586341941014e-05, "loss": 1.1989, "step": 60800 }, { "epoch": 1.8604509073134967, "grad_norm": 3.3976101875305176, "learning_rate": 4.0764288336800885e-05, "loss": 1.1127, "step": 60900 }, { "epoch": 1.8635058349117126, "grad_norm": 4.278215408325195, "learning_rate": 4.074899033166075e-05, "loss": 1.0595, "step": 61000 }, { "epoch": 1.8665607625099285, 
"grad_norm": 5.625969409942627, "learning_rate": 4.0733845306572024e-05, "loss": 1.3236, "step": 61100 }, { "epoch": 1.8696156901081444, "grad_norm": 5.3394365310668945, "learning_rate": 4.0718547301431896e-05, "loss": 1.2003, "step": 61200 }, { "epoch": 1.8726706177063603, "grad_norm": 6.125877380371094, "learning_rate": 4.0703402276343164e-05, "loss": 1.4561, "step": 61300 }, { "epoch": 1.8757255453045762, "grad_norm": 6.432266712188721, "learning_rate": 4.0688104271203035e-05, "loss": 1.273, "step": 61400 }, { "epoch": 1.8787804729027922, "grad_norm": 1.3589688539505005, "learning_rate": 4.0672806266062906e-05, "loss": 1.1759, "step": 61500 }, { "epoch": 1.881835400501008, "grad_norm": 22.972280502319336, "learning_rate": 4.065750826092278e-05, "loss": 1.2454, "step": 61600 }, { "epoch": 1.8848903280992242, "grad_norm": 4.607998371124268, "learning_rate": 4.064221025578265e-05, "loss": 1.351, "step": 61700 }, { "epoch": 1.8879452556974399, "grad_norm": 7.128314018249512, "learning_rate": 4.062691225064252e-05, "loss": 1.1201, "step": 61800 }, { "epoch": 1.891000183295656, "grad_norm": 6.283421993255615, "learning_rate": 4.061161424550239e-05, "loss": 1.3758, "step": 61900 }, { "epoch": 1.8940551108938717, "grad_norm": 2.3519177436828613, "learning_rate": 4.059631624036226e-05, "loss": 1.1033, "step": 62000 }, { "epoch": 1.8971100384920878, "grad_norm": 5.391026973724365, "learning_rate": 4.0581018235222126e-05, "loss": 1.1464, "step": 62100 }, { "epoch": 1.9001649660903035, "grad_norm": 2.404320240020752, "learning_rate": 4.0565720230082e-05, "loss": 1.1599, "step": 62200 }, { "epoch": 1.9032198936885196, "grad_norm": 1.6671347618103027, "learning_rate": 4.055042222494187e-05, "loss": 1.1476, "step": 62300 }, { "epoch": 1.9062748212867355, "grad_norm": 1.6540522575378418, "learning_rate": 4.053512421980174e-05, "loss": 1.1896, "step": 62400 }, { "epoch": 1.9093297488849514, "grad_norm": 17.489120483398438, "learning_rate": 4.051982621466161e-05, "loss": 1.0109, 
"step": 62500 }, { "epoch": 1.9123846764831673, "grad_norm": 8.429814338684082, "learning_rate": 4.050452820952148e-05, "loss": 1.0935, "step": 62600 }, { "epoch": 1.9154396040813833, "grad_norm": 6.822543144226074, "learning_rate": 4.048923020438135e-05, "loss": 1.1555, "step": 62700 }, { "epoch": 1.9184945316795992, "grad_norm": 3.066483497619629, "learning_rate": 4.0473932199241224e-05, "loss": 1.255, "step": 62800 }, { "epoch": 1.921549459277815, "grad_norm": 3.349703550338745, "learning_rate": 4.045863419410109e-05, "loss": 1.4048, "step": 62900 }, { "epoch": 1.9246043868760312, "grad_norm": 4.069706916809082, "learning_rate": 4.044333618896096e-05, "loss": 1.1313, "step": 63000 }, { "epoch": 1.9276593144742469, "grad_norm": 3.599609613418579, "learning_rate": 4.042803818382083e-05, "loss": 1.4575, "step": 63100 }, { "epoch": 1.930714242072463, "grad_norm": 4.190948486328125, "learning_rate": 4.04127401786807e-05, "loss": 1.2446, "step": 63200 }, { "epoch": 1.9337691696706787, "grad_norm": 4.354323863983154, "learning_rate": 4.039744217354057e-05, "loss": 1.1491, "step": 63300 }, { "epoch": 1.9368240972688948, "grad_norm": 5.348269939422607, "learning_rate": 4.0382144168400444e-05, "loss": 1.3107, "step": 63400 }, { "epoch": 1.9398790248671105, "grad_norm": 3.1174705028533936, "learning_rate": 4.0366846163260315e-05, "loss": 1.1938, "step": 63500 }, { "epoch": 1.9429339524653266, "grad_norm": 12.221755981445312, "learning_rate": 4.0351548158120187e-05, "loss": 1.1975, "step": 63600 }, { "epoch": 1.9459888800635425, "grad_norm": 7.461677551269531, "learning_rate": 4.033625015298005e-05, "loss": 1.0214, "step": 63700 }, { "epoch": 1.9490438076617584, "grad_norm": 4.798915863037109, "learning_rate": 4.032095214783992e-05, "loss": 1.3375, "step": 63800 }, { "epoch": 1.9520987352599743, "grad_norm": 3.232374906539917, "learning_rate": 4.030565414269979e-05, "loss": 1.1388, "step": 63900 }, { "epoch": 1.9551536628581903, "grad_norm": 6.82913875579834, 
"learning_rate": 4.0290356137559664e-05, "loss": 1.1365, "step": 64000 }, { "epoch": 1.9582085904564062, "grad_norm": 18.124427795410156, "learning_rate": 4.0275058132419536e-05, "loss": 1.196, "step": 64100 }, { "epoch": 1.961263518054622, "grad_norm": 2.9495561122894287, "learning_rate": 4.025976012727941e-05, "loss": 1.2184, "step": 64200 }, { "epoch": 1.964318445652838, "grad_norm": 5.870420455932617, "learning_rate": 4.024446212213928e-05, "loss": 1.1024, "step": 64300 }, { "epoch": 1.9673733732510539, "grad_norm": 36.10947036743164, "learning_rate": 4.022916411699915e-05, "loss": 1.5031, "step": 64400 }, { "epoch": 1.97042830084927, "grad_norm": 6.070558071136475, "learning_rate": 4.0213866111859013e-05, "loss": 1.3974, "step": 64500 }, { "epoch": 1.9734832284474857, "grad_norm": 2.5208094120025635, "learning_rate": 4.0198568106718885e-05, "loss": 1.1695, "step": 64600 }, { "epoch": 1.9765381560457018, "grad_norm": 4.907090187072754, "learning_rate": 4.0183270101578756e-05, "loss": 1.1204, "step": 64700 }, { "epoch": 1.9795930836439175, "grad_norm": 3.0557048320770264, "learning_rate": 4.016797209643863e-05, "loss": 1.2246, "step": 64800 }, { "epoch": 1.9826480112421336, "grad_norm": 9.957263946533203, "learning_rate": 4.01526740912985e-05, "loss": 1.0912, "step": 64900 }, { "epoch": 1.9857029388403493, "grad_norm": 12.20952320098877, "learning_rate": 4.013737608615837e-05, "loss": 1.1763, "step": 65000 }, { "epoch": 1.9887578664385654, "grad_norm": 2.2833447456359863, "learning_rate": 4.012207808101824e-05, "loss": 1.2259, "step": 65100 }, { "epoch": 1.9918127940367814, "grad_norm": 3.358288049697876, "learning_rate": 4.010678007587811e-05, "loss": 1.2257, "step": 65200 }, { "epoch": 1.9948677216349973, "grad_norm": 2.8749561309814453, "learning_rate": 4.0091482070737976e-05, "loss": 1.2282, "step": 65300 }, { "epoch": 1.9979226492332132, "grad_norm": 3.331350803375244, "learning_rate": 4.007633704564925e-05, "loss": 1.1915, "step": 65400 }, { "epoch": 2.0, 
"eval_accuracy": 0.676513716624916, "eval_loss": 0.857006847858429, "eval_runtime": 1782.201, "eval_samples_per_second": 18.367, "eval_steps_per_second": 4.592, "step": 65468 }, { "epoch": 2.000977576831429, "grad_norm": 5.58222770690918, "learning_rate": 4.006103904050912e-05, "loss": 1.1468, "step": 65500 }, { "epoch": 2.004032504429645, "grad_norm": 3.9189741611480713, "learning_rate": 4.0045741035368986e-05, "loss": 1.044, "step": 65600 }, { "epoch": 2.007087432027861, "grad_norm": 5.215442180633545, "learning_rate": 4.003044303022886e-05, "loss": 1.1866, "step": 65700 }, { "epoch": 2.010142359626077, "grad_norm": 5.244242191314697, "learning_rate": 4.001514502508873e-05, "loss": 1.2026, "step": 65800 }, { "epoch": 2.0131972872242927, "grad_norm": 10.0203218460083, "learning_rate": 3.99998470199486e-05, "loss": 1.2438, "step": 65900 }, { "epoch": 2.016252214822509, "grad_norm": 24.34562873840332, "learning_rate": 3.998454901480847e-05, "loss": 1.2173, "step": 66000 }, { "epoch": 2.0193071424207245, "grad_norm": 13.229280471801758, "learning_rate": 3.996925100966834e-05, "loss": 1.2143, "step": 66100 }, { "epoch": 2.0223620700189406, "grad_norm": 5.116329669952393, "learning_rate": 3.995395300452821e-05, "loss": 1.2472, "step": 66200 }, { "epoch": 2.0254169976171563, "grad_norm": 2.4487433433532715, "learning_rate": 3.9938654999388084e-05, "loss": 1.3071, "step": 66300 }, { "epoch": 2.0284719252153725, "grad_norm": 1.7104898691177368, "learning_rate": 3.992335699424795e-05, "loss": 1.1499, "step": 66400 }, { "epoch": 2.031526852813588, "grad_norm": 1.6790919303894043, "learning_rate": 3.990805898910782e-05, "loss": 1.0116, "step": 66500 }, { "epoch": 2.0345817804118043, "grad_norm": 3.6588943004608154, "learning_rate": 3.989276098396769e-05, "loss": 1.2898, "step": 66600 }, { "epoch": 2.03763670801002, "grad_norm": 9.815014839172363, "learning_rate": 3.987746297882756e-05, "loss": 1.1312, "step": 66700 }, { "epoch": 2.040691635608236, "grad_norm": 
3.727752208709717, "learning_rate": 3.986216497368743e-05, "loss": 1.1964, "step": 66800 }, { "epoch": 2.043746563206452, "grad_norm": 4.570804595947266, "learning_rate": 3.9846866968547304e-05, "loss": 1.1733, "step": 66900 }, { "epoch": 2.046801490804668, "grad_norm": 5.729971408843994, "learning_rate": 3.9831568963407176e-05, "loss": 1.1444, "step": 67000 }, { "epoch": 2.049856418402884, "grad_norm": 3.439488649368286, "learning_rate": 3.981627095826705e-05, "loss": 1.1501, "step": 67100 }, { "epoch": 2.0529113460010997, "grad_norm": 8.150869369506836, "learning_rate": 3.980097295312691e-05, "loss": 1.1009, "step": 67200 }, { "epoch": 2.055966273599316, "grad_norm": 5.003384590148926, "learning_rate": 3.978567494798678e-05, "loss": 1.1368, "step": 67300 }, { "epoch": 2.0590212011975315, "grad_norm": null, "learning_rate": 3.977052992289805e-05, "loss": 1.0697, "step": 67400 }, { "epoch": 2.0620761287957476, "grad_norm": 2.6935789585113525, "learning_rate": 3.975523191775792e-05, "loss": 1.2134, "step": 67500 }, { "epoch": 2.0651310563939633, "grad_norm": 4.748971462249756, "learning_rate": 3.973993391261779e-05, "loss": 1.2622, "step": 67600 }, { "epoch": 2.0681859839921795, "grad_norm": 8.399434089660645, "learning_rate": 3.9724635907477664e-05, "loss": 1.1459, "step": 67700 }, { "epoch": 2.071240911590395, "grad_norm": 2.4680871963500977, "learning_rate": 3.9709337902337535e-05, "loss": 1.0392, "step": 67800 }, { "epoch": 2.0742958391886113, "grad_norm": 6.203686714172363, "learning_rate": 3.9694039897197406e-05, "loss": 1.0936, "step": 67900 }, { "epoch": 2.077350766786827, "grad_norm": 2.707479476928711, "learning_rate": 3.967874189205728e-05, "loss": 1.282, "step": 68000 }, { "epoch": 2.080405694385043, "grad_norm": 6.8733720779418945, "learning_rate": 3.966344388691715e-05, "loss": 1.2343, "step": 68100 }, { "epoch": 2.083460621983259, "grad_norm": 0.5146437883377075, "learning_rate": 3.964814588177701e-05, "loss": 1.0634, "step": 68200 }, { "epoch": 
2.086515549581475, "grad_norm": 2.7069003582000732, "learning_rate": 3.9632847876636884e-05, "loss": 1.1075, "step": 68300 }, { "epoch": 2.089570477179691, "grad_norm": 4.112438678741455, "learning_rate": 3.9617549871496755e-05, "loss": 1.1765, "step": 68400 }, { "epoch": 2.0926254047779067, "grad_norm": 5.514732360839844, "learning_rate": 3.9602251866356626e-05, "loss": 1.1638, "step": 68500 }, { "epoch": 2.095680332376123, "grad_norm": 9.44277286529541, "learning_rate": 3.95871068412679e-05, "loss": 1.1738, "step": 68600 }, { "epoch": 2.0987352599743385, "grad_norm": 4.027563571929932, "learning_rate": 3.957180883612777e-05, "loss": 1.2366, "step": 68700 }, { "epoch": 2.1017901875725546, "grad_norm": 2.5387253761291504, "learning_rate": 3.955651083098764e-05, "loss": 1.3013, "step": 68800 }, { "epoch": 2.1048451151707703, "grad_norm": 5.709287643432617, "learning_rate": 3.9541212825847514e-05, "loss": 1.2576, "step": 68900 }, { "epoch": 2.1079000427689865, "grad_norm": 3.3813297748565674, "learning_rate": 3.9525914820707386e-05, "loss": 1.2087, "step": 69000 }, { "epoch": 2.110954970367202, "grad_norm": 8.29941177368164, "learning_rate": 3.951061681556725e-05, "loss": 1.1075, "step": 69100 }, { "epoch": 2.1140098979654183, "grad_norm": 4.356234073638916, "learning_rate": 3.949531881042712e-05, "loss": 1.0665, "step": 69200 }, { "epoch": 2.117064825563634, "grad_norm": 2.651749849319458, "learning_rate": 3.948002080528699e-05, "loss": 1.1263, "step": 69300 }, { "epoch": 2.12011975316185, "grad_norm": 12.057615280151367, "learning_rate": 3.9464722800146863e-05, "loss": 1.1807, "step": 69400 }, { "epoch": 2.1231746807600658, "grad_norm": 3.4325954914093018, "learning_rate": 3.9449424795006735e-05, "loss": 1.1849, "step": 69500 }, { "epoch": 2.126229608358282, "grad_norm": 1.0763434171676636, "learning_rate": 3.9434126789866606e-05, "loss": 1.0558, "step": 69600 }, { "epoch": 2.129284535956498, "grad_norm": 10.428380012512207, "learning_rate": 3.941882878472648e-05, 
"loss": 1.2732, "step": 69700 }, { "epoch": 2.1323394635547137, "grad_norm": 1.0638651847839355, "learning_rate": 3.940353077958635e-05, "loss": 1.1089, "step": 69800 }, { "epoch": 2.13539439115293, "grad_norm": 3.8725669384002686, "learning_rate": 3.938823277444621e-05, "loss": 1.3897, "step": 69900 }, { "epoch": 2.1384493187511455, "grad_norm": 2.9482781887054443, "learning_rate": 3.9372934769306084e-05, "loss": 1.2474, "step": 70000 }, { "epoch": 2.1415042463493617, "grad_norm": 2.1782474517822266, "learning_rate": 3.9357636764165955e-05, "loss": 1.1152, "step": 70100 }, { "epoch": 2.1445591739475773, "grad_norm": 2.9231390953063965, "learning_rate": 3.9342338759025826e-05, "loss": 1.198, "step": 70200 }, { "epoch": 2.1476141015457935, "grad_norm": 6.574098587036133, "learning_rate": 3.93270407538857e-05, "loss": 1.0875, "step": 70300 }, { "epoch": 2.150669029144009, "grad_norm": 10.772805213928223, "learning_rate": 3.931174274874557e-05, "loss": 1.2619, "step": 70400 }, { "epoch": 2.1537239567422253, "grad_norm": 2.2910847663879395, "learning_rate": 3.929644474360544e-05, "loss": 1.217, "step": 70500 }, { "epoch": 2.156778884340441, "grad_norm": 3.5544190406799316, "learning_rate": 3.928114673846531e-05, "loss": 1.1816, "step": 70600 }, { "epoch": 2.159833811938657, "grad_norm": 11.924046516418457, "learning_rate": 3.9265848733325175e-05, "loss": 1.1116, "step": 70700 }, { "epoch": 2.1628887395368728, "grad_norm": 4.180816173553467, "learning_rate": 3.9250550728185046e-05, "loss": 1.0373, "step": 70800 }, { "epoch": 2.165943667135089, "grad_norm": 7.700500011444092, "learning_rate": 3.923525272304492e-05, "loss": 1.2157, "step": 70900 }, { "epoch": 2.1689985947333046, "grad_norm": 3.0716445446014404, "learning_rate": 3.921995471790479e-05, "loss": 1.1892, "step": 71000 }, { "epoch": 2.1720535223315207, "grad_norm": 6.259620189666748, "learning_rate": 3.920465671276466e-05, "loss": 1.1653, "step": 71100 }, { "epoch": 2.175108449929737, "grad_norm": 
6.640773773193359, "learning_rate": 3.918935870762453e-05, "loss": 1.1338, "step": 71200 }, { "epoch": 2.1781633775279525, "grad_norm": 7.011377811431885, "learning_rate": 3.91740607024844e-05, "loss": 1.1523, "step": 71300 }, { "epoch": 2.1812183051261687, "grad_norm": 2.073110580444336, "learning_rate": 3.915876269734427e-05, "loss": 1.1036, "step": 71400 }, { "epoch": 2.1842732327243843, "grad_norm": 1.5513907670974731, "learning_rate": 3.914346469220414e-05, "loss": 1.1772, "step": 71500 }, { "epoch": 2.1873281603226005, "grad_norm": 4.17803430557251, "learning_rate": 3.912816668706401e-05, "loss": 1.1437, "step": 71600 }, { "epoch": 2.190383087920816, "grad_norm": 2.0113983154296875, "learning_rate": 3.911286868192388e-05, "loss": 1.0487, "step": 71700 }, { "epoch": 2.1934380155190323, "grad_norm": 0.9932565093040466, "learning_rate": 3.909757067678375e-05, "loss": 1.1172, "step": 71800 }, { "epoch": 2.196492943117248, "grad_norm": 1.959645390510559, "learning_rate": 3.908227267164362e-05, "loss": 1.1405, "step": 71900 }, { "epoch": 2.199547870715464, "grad_norm": 4.294275760650635, "learning_rate": 3.906697466650349e-05, "loss": 1.4798, "step": 72000 }, { "epoch": 2.2026027983136798, "grad_norm": 2.311213493347168, "learning_rate": 3.9051676661363364e-05, "loss": 1.1973, "step": 72100 }, { "epoch": 2.205657725911896, "grad_norm": 10.680132865905762, "learning_rate": 3.9036378656223236e-05, "loss": 1.0047, "step": 72200 }, { "epoch": 2.208712653510112, "grad_norm": 5.8984904289245605, "learning_rate": 3.90210806510831e-05, "loss": 1.3464, "step": 72300 }, { "epoch": 2.2117675811083277, "grad_norm": 1.8181579113006592, "learning_rate": 3.900578264594297e-05, "loss": 1.1493, "step": 72400 }, { "epoch": 2.214822508706544, "grad_norm": 2.351229190826416, "learning_rate": 3.899048464080284e-05, "loss": 1.0957, "step": 72500 }, { "epoch": 2.2178774363047595, "grad_norm": 3.8195061683654785, "learning_rate": 3.897518663566271e-05, "loss": 1.2018, "step": 72600 }, { 
"epoch": 2.2209323639029757, "grad_norm": 3.727231740951538, "learning_rate": 3.896004161057398e-05, "loss": 1.3295, "step": 72700 }, { "epoch": 2.2239872915011913, "grad_norm": 3.8231828212738037, "learning_rate": 3.894474360543385e-05, "loss": 1.0968, "step": 72800 }, { "epoch": 2.2270422190994075, "grad_norm": 6.009609699249268, "learning_rate": 3.8929445600293724e-05, "loss": 1.0188, "step": 72900 }, { "epoch": 2.230097146697623, "grad_norm": 17.05879783630371, "learning_rate": 3.8914147595153595e-05, "loss": 1.2487, "step": 73000 }, { "epoch": 2.2331520742958393, "grad_norm": 5.385801792144775, "learning_rate": 3.8898849590013466e-05, "loss": 1.1919, "step": 73100 }, { "epoch": 2.236207001894055, "grad_norm": 5.083590030670166, "learning_rate": 3.888355158487334e-05, "loss": 1.048, "step": 73200 }, { "epoch": 2.239261929492271, "grad_norm": 5.49558687210083, "learning_rate": 3.88682535797332e-05, "loss": 1.3023, "step": 73300 }, { "epoch": 2.242316857090487, "grad_norm": 5.167341709136963, "learning_rate": 3.885295557459307e-05, "loss": 1.242, "step": 73400 }, { "epoch": 2.245371784688703, "grad_norm": 7.800171852111816, "learning_rate": 3.8837657569452944e-05, "loss": 0.942, "step": 73500 }, { "epoch": 2.2484267122869186, "grad_norm": 4.706507682800293, "learning_rate": 3.8822359564312815e-05, "loss": 1.0041, "step": 73600 }, { "epoch": 2.2514816398851347, "grad_norm": 12.750622749328613, "learning_rate": 3.8807061559172686e-05, "loss": 1.0336, "step": 73700 }, { "epoch": 2.254536567483351, "grad_norm": 8.06450366973877, "learning_rate": 3.879176355403256e-05, "loss": 1.1878, "step": 73800 }, { "epoch": 2.2575914950815665, "grad_norm": 4.1706953048706055, "learning_rate": 3.877646554889243e-05, "loss": 1.2378, "step": 73900 }, { "epoch": 2.2606464226797827, "grad_norm": 9.237957954406738, "learning_rate": 3.87611675437523e-05, "loss": 1.316, "step": 74000 }, { "epoch": 2.2637013502779983, "grad_norm": 15.578306198120117, "learning_rate": 
3.8745869538612164e-05, "loss": 1.1625, "step": 74100 }, { "epoch": 2.2667562778762145, "grad_norm": 4.735302925109863, "learning_rate": 3.8730571533472035e-05, "loss": 1.2111, "step": 74200 }, { "epoch": 2.26981120547443, "grad_norm": 3.8716487884521484, "learning_rate": 3.8715273528331906e-05, "loss": 1.2688, "step": 74300 }, { "epoch": 2.2728661330726463, "grad_norm": 1.822843074798584, "learning_rate": 3.869997552319178e-05, "loss": 1.182, "step": 74400 }, { "epoch": 2.275921060670862, "grad_norm": 4.336980819702148, "learning_rate": 3.868467751805165e-05, "loss": 1.1425, "step": 74500 }, { "epoch": 2.278975988269078, "grad_norm": 10.102789878845215, "learning_rate": 3.866937951291152e-05, "loss": 1.2088, "step": 74600 }, { "epoch": 2.282030915867294, "grad_norm": 1.985364317893982, "learning_rate": 3.865408150777139e-05, "loss": 1.1665, "step": 74700 }, { "epoch": 2.28508584346551, "grad_norm": 11.066972732543945, "learning_rate": 3.863893648268266e-05, "loss": 1.1125, "step": 74800 }, { "epoch": 2.2881407710637256, "grad_norm": 3.418748617172241, "learning_rate": 3.862363847754253e-05, "loss": 1.245, "step": 74900 }, { "epoch": 2.2911956986619417, "grad_norm": 3.346759557723999, "learning_rate": 3.86083404724024e-05, "loss": 1.025, "step": 75000 }, { "epoch": 2.2942506262601574, "grad_norm": 5.483870983123779, "learning_rate": 3.859304246726227e-05, "loss": 1.1594, "step": 75100 }, { "epoch": 2.2973055538583735, "grad_norm": 2.363171339035034, "learning_rate": 3.857774446212214e-05, "loss": 1.2255, "step": 75200 }, { "epoch": 2.3003604814565897, "grad_norm": 3.270732879638672, "learning_rate": 3.856244645698201e-05, "loss": 1.1561, "step": 75300 }, { "epoch": 2.3034154090548054, "grad_norm": 6.600897789001465, "learning_rate": 3.854714845184188e-05, "loss": 1.4451, "step": 75400 }, { "epoch": 2.3064703366530215, "grad_norm": 9.933273315429688, "learning_rate": 3.853185044670175e-05, "loss": 1.1166, "step": 75500 }, { "epoch": 2.309525264251237, "grad_norm": 
9.13111400604248, "learning_rate": 3.851655244156162e-05, "loss": 1.2099, "step": 75600 }, { "epoch": 2.3125801918494533, "grad_norm": 0.8172520995140076, "learning_rate": 3.850125443642149e-05, "loss": 1.4097, "step": 75700 }, { "epoch": 2.315635119447669, "grad_norm": 2.1834285259246826, "learning_rate": 3.8485956431281364e-05, "loss": 1.1568, "step": 75800 }, { "epoch": 2.318690047045885, "grad_norm": 3.5525593757629395, "learning_rate": 3.8470658426141235e-05, "loss": 1.3119, "step": 75900 }, { "epoch": 2.321744974644101, "grad_norm": 5.114786624908447, "learning_rate": 3.84553604210011e-05, "loss": 1.0931, "step": 76000 }, { "epoch": 2.324799902242317, "grad_norm": 3.612886428833008, "learning_rate": 3.844006241586097e-05, "loss": 1.1166, "step": 76100 }, { "epoch": 2.3278548298405326, "grad_norm": 2.317949056625366, "learning_rate": 3.842476441072084e-05, "loss": 1.2106, "step": 76200 }, { "epoch": 2.3309097574387487, "grad_norm": 7.012603282928467, "learning_rate": 3.840946640558071e-05, "loss": 1.1442, "step": 76300 }, { "epoch": 2.333964685036965, "grad_norm": 4.5827717781066895, "learning_rate": 3.8394168400440584e-05, "loss": 1.161, "step": 76400 }, { "epoch": 2.3370196126351805, "grad_norm": 8.932015419006348, "learning_rate": 3.8378870395300455e-05, "loss": 0.9779, "step": 76500 }, { "epoch": 2.3400745402333962, "grad_norm": 4.946058750152588, "learning_rate": 3.8363572390160326e-05, "loss": 1.1326, "step": 76600 }, { "epoch": 2.3431294678316124, "grad_norm": 3.1590678691864014, "learning_rate": 3.83482743850202e-05, "loss": 1.328, "step": 76700 }, { "epoch": 2.3461843954298285, "grad_norm": 6.242021083831787, "learning_rate": 3.833297637988006e-05, "loss": 1.0906, "step": 76800 }, { "epoch": 2.349239323028044, "grad_norm": 6.75691556930542, "learning_rate": 3.8317831354791336e-05, "loss": 1.4751, "step": 76900 }, { "epoch": 2.3522942506262603, "grad_norm": 8.400306701660156, "learning_rate": 3.830253334965121e-05, "loss": 1.1472, "step": 77000 }, { 
"epoch": 2.355349178224476, "grad_norm": 14.731451034545898, "learning_rate": 3.828723534451108e-05, "loss": 1.2924, "step": 77100 }, { "epoch": 2.358404105822692, "grad_norm": 5.320265293121338, "learning_rate": 3.827193733937095e-05, "loss": 1.2347, "step": 77200 }, { "epoch": 2.361459033420908, "grad_norm": 2.3398170471191406, "learning_rate": 3.825663933423082e-05, "loss": 1.354, "step": 77300 }, { "epoch": 2.364513961019124, "grad_norm": 4.871092319488525, "learning_rate": 3.824134132909069e-05, "loss": 1.3481, "step": 77400 }, { "epoch": 2.3675688886173396, "grad_norm": 5.175891876220703, "learning_rate": 3.8226043323950563e-05, "loss": 1.2666, "step": 77500 }, { "epoch": 2.3706238162155557, "grad_norm": 4.1197686195373535, "learning_rate": 3.8210745318810435e-05, "loss": 1.1468, "step": 77600 }, { "epoch": 2.3736787438137714, "grad_norm": 7.0223164558410645, "learning_rate": 3.81954473136703e-05, "loss": 1.2952, "step": 77700 }, { "epoch": 2.3767336714119875, "grad_norm": 6.587507724761963, "learning_rate": 3.818014930853017e-05, "loss": 1.1041, "step": 77800 }, { "epoch": 2.3797885990102037, "grad_norm": 3.74049711227417, "learning_rate": 3.816485130339004e-05, "loss": 1.2098, "step": 77900 }, { "epoch": 2.3828435266084194, "grad_norm": 16.616947174072266, "learning_rate": 3.814955329824991e-05, "loss": 1.2535, "step": 78000 }, { "epoch": 2.3858984542066355, "grad_norm": 2.7336437702178955, "learning_rate": 3.8134255293109784e-05, "loss": 1.1514, "step": 78100 }, { "epoch": 2.388953381804851, "grad_norm": 4.99415397644043, "learning_rate": 3.8118957287969655e-05, "loss": 1.2196, "step": 78200 }, { "epoch": 2.3920083094030673, "grad_norm": 3.396183729171753, "learning_rate": 3.8103659282829526e-05, "loss": 1.1826, "step": 78300 }, { "epoch": 2.395063237001283, "grad_norm": 4.325381278991699, "learning_rate": 3.80883612776894e-05, "loss": 1.1332, "step": 78400 }, { "epoch": 2.398118164599499, "grad_norm": 4.070424556732178, "learning_rate": 
3.807306327254926e-05, "loss": 1.3016, "step": 78500 }, { "epoch": 2.401173092197715, "grad_norm": 4.667051792144775, "learning_rate": 3.8057765267409126e-05, "loss": 1.1709, "step": 78600 }, { "epoch": 2.404228019795931, "grad_norm": 10.673643112182617, "learning_rate": 3.8042467262269e-05, "loss": 1.2628, "step": 78700 }, { "epoch": 2.4072829473941466, "grad_norm": 5.150026321411133, "learning_rate": 3.802716925712887e-05, "loss": 1.3025, "step": 78800 }, { "epoch": 2.4103378749923627, "grad_norm": 10.943438529968262, "learning_rate": 3.801187125198874e-05, "loss": 1.228, "step": 78900 }, { "epoch": 2.4133928025905784, "grad_norm": 5.951319217681885, "learning_rate": 3.7996726226900014e-05, "loss": 1.1702, "step": 79000 }, { "epoch": 2.4164477301887946, "grad_norm": 8.593432426452637, "learning_rate": 3.7981428221759885e-05, "loss": 1.3426, "step": 79100 }, { "epoch": 2.4195026577870102, "grad_norm": 4.801974773406982, "learning_rate": 3.7966130216619756e-05, "loss": 1.0271, "step": 79200 }, { "epoch": 2.4225575853852264, "grad_norm": 4.933838367462158, "learning_rate": 3.795083221147963e-05, "loss": 1.0458, "step": 79300 }, { "epoch": 2.4256125129834425, "grad_norm": 2.4156911373138428, "learning_rate": 3.79355342063395e-05, "loss": 1.2557, "step": 79400 }, { "epoch": 2.428667440581658, "grad_norm": 18.329809188842773, "learning_rate": 3.792023620119936e-05, "loss": 1.0828, "step": 79500 }, { "epoch": 2.4317223681798743, "grad_norm": 3.4031643867492676, "learning_rate": 3.7904938196059234e-05, "loss": 1.1404, "step": 79600 }, { "epoch": 2.43477729577809, "grad_norm": 29.743398666381836, "learning_rate": 3.7889640190919105e-05, "loss": 1.1868, "step": 79700 }, { "epoch": 2.437832223376306, "grad_norm": 3.8394060134887695, "learning_rate": 3.7874342185778976e-05, "loss": 1.1176, "step": 79800 }, { "epoch": 2.440887150974522, "grad_norm": 3.4328103065490723, "learning_rate": 3.785904418063885e-05, "loss": 1.3779, "step": 79900 }, { "epoch": 2.443942078572738, 
"grad_norm": 1.7140569686889648, "learning_rate": 3.784374617549872e-05, "loss": 1.0813, "step": 80000 }, { "epoch": 2.4469970061709536, "grad_norm": 11.085058212280273, "learning_rate": 3.782844817035859e-05, "loss": 1.1175, "step": 80100 }, { "epoch": 2.4500519337691697, "grad_norm": 7.249815464019775, "learning_rate": 3.781315016521846e-05, "loss": 1.1873, "step": 80200 }, { "epoch": 2.4531068613673854, "grad_norm": 10.735973358154297, "learning_rate": 3.7797852160078325e-05, "loss": 1.2008, "step": 80300 }, { "epoch": 2.4561617889656016, "grad_norm": 6.401257514953613, "learning_rate": 3.77825541549382e-05, "loss": 1.2233, "step": 80400 }, { "epoch": 2.4592167165638177, "grad_norm": 1.9271202087402344, "learning_rate": 3.776725614979807e-05, "loss": 1.2601, "step": 80500 }, { "epoch": 2.4622716441620334, "grad_norm": 2.7985684871673584, "learning_rate": 3.775195814465794e-05, "loss": 1.1866, "step": 80600 }, { "epoch": 2.465326571760249, "grad_norm": 5.0379509925842285, "learning_rate": 3.773666013951781e-05, "loss": 1.1215, "step": 80700 }, { "epoch": 2.468381499358465, "grad_norm": 19.3048152923584, "learning_rate": 3.772136213437768e-05, "loss": 1.1121, "step": 80800 }, { "epoch": 2.4714364269566813, "grad_norm": 5.526917457580566, "learning_rate": 3.770606412923755e-05, "loss": 1.023, "step": 80900 }, { "epoch": 2.474491354554897, "grad_norm": 3.810134172439575, "learning_rate": 3.7690766124097424e-05, "loss": 1.1027, "step": 81000 }, { "epoch": 2.477546282153113, "grad_norm": 4.790931224822998, "learning_rate": 3.767562109900869e-05, "loss": 1.229, "step": 81100 }, { "epoch": 2.480601209751329, "grad_norm": 4.748075008392334, "learning_rate": 3.766032309386856e-05, "loss": 1.1788, "step": 81200 }, { "epoch": 2.483656137349545, "grad_norm": 7.152939796447754, "learning_rate": 3.7645025088728434e-05, "loss": 0.9895, "step": 81300 }, { "epoch": 2.4867110649477606, "grad_norm": 9.12071418762207, "learning_rate": 3.76297270835883e-05, "loss": 1.4014, "step": 
81400 }, { "epoch": 2.4897659925459767, "grad_norm": 2.651780605316162, "learning_rate": 3.761442907844817e-05, "loss": 1.2332, "step": 81500 }, { "epoch": 2.4928209201441924, "grad_norm": 1.2754027843475342, "learning_rate": 3.759913107330804e-05, "loss": 1.2788, "step": 81600 }, { "epoch": 2.4958758477424086, "grad_norm": 4.905016899108887, "learning_rate": 3.758383306816791e-05, "loss": 1.1859, "step": 81700 }, { "epoch": 2.4989307753406242, "grad_norm": 6.369762897491455, "learning_rate": 3.756853506302778e-05, "loss": 1.2471, "step": 81800 }, { "epoch": 2.5019857029388404, "grad_norm": 0.8389211297035217, "learning_rate": 3.7553237057887654e-05, "loss": 1.128, "step": 81900 }, { "epoch": 2.5050406305370565, "grad_norm": 6.334069728851318, "learning_rate": 3.7537939052747525e-05, "loss": 1.4241, "step": 82000 }, { "epoch": 2.508095558135272, "grad_norm": 4.841616153717041, "learning_rate": 3.7522641047607396e-05, "loss": 1.0423, "step": 82100 }, { "epoch": 2.511150485733488, "grad_norm": 6.821234226226807, "learning_rate": 3.750734304246726e-05, "loss": 1.1206, "step": 82200 }, { "epoch": 2.514205413331704, "grad_norm": 3.092060089111328, "learning_rate": 3.749204503732713e-05, "loss": 1.0781, "step": 82300 }, { "epoch": 2.51726034092992, "grad_norm": 4.442360877990723, "learning_rate": 3.7476747032187e-05, "loss": 1.3774, "step": 82400 }, { "epoch": 2.520315268528136, "grad_norm": 5.051080226898193, "learning_rate": 3.7461449027046874e-05, "loss": 1.2533, "step": 82500 }, { "epoch": 2.523370196126352, "grad_norm": 2.790591239929199, "learning_rate": 3.7446151021906745e-05, "loss": 1.2949, "step": 82600 }, { "epoch": 2.5264251237245676, "grad_norm": 4.006244659423828, "learning_rate": 3.7430853016766617e-05, "loss": 1.1801, "step": 82700 }, { "epoch": 2.5294800513227838, "grad_norm": 4.384695053100586, "learning_rate": 3.741555501162649e-05, "loss": 1.1284, "step": 82800 }, { "epoch": 2.5325349789209994, "grad_norm": 6.040104866027832, "learning_rate": 
3.740025700648636e-05, "loss": 1.3011, "step": 82900 }, { "epoch": 2.5355899065192156, "grad_norm": 14.280811309814453, "learning_rate": 3.738495900134622e-05, "loss": 1.1675, "step": 83000 }, { "epoch": 2.5386448341174317, "grad_norm": 5.165515899658203, "learning_rate": 3.73698139762575e-05, "loss": 1.0729, "step": 83100 }, { "epoch": 2.5416997617156474, "grad_norm": 16.37873077392578, "learning_rate": 3.735451597111736e-05, "loss": 1.1082, "step": 83200 }, { "epoch": 2.544754689313863, "grad_norm": 3.9798662662506104, "learning_rate": 3.7339217965977233e-05, "loss": 1.1677, "step": 83300 }, { "epoch": 2.547809616912079, "grad_norm": 8.16595458984375, "learning_rate": 3.7323919960837105e-05, "loss": 1.1792, "step": 83400 }, { "epoch": 2.5508645445102953, "grad_norm": 1.22407865524292, "learning_rate": 3.7308621955696976e-05, "loss": 1.1477, "step": 83500 }, { "epoch": 2.553919472108511, "grad_norm": 5.23829984664917, "learning_rate": 3.729332395055685e-05, "loss": 1.1713, "step": 83600 }, { "epoch": 2.5569743997067267, "grad_norm": 2.0325045585632324, "learning_rate": 3.727802594541672e-05, "loss": 1.2988, "step": 83700 }, { "epoch": 2.560029327304943, "grad_norm": 5.054375648498535, "learning_rate": 3.726272794027659e-05, "loss": 1.2534, "step": 83800 }, { "epoch": 2.563084254903159, "grad_norm": 5.4261155128479, "learning_rate": 3.724742993513646e-05, "loss": 1.1863, "step": 83900 }, { "epoch": 2.5661391825013746, "grad_norm": 4.739152908325195, "learning_rate": 3.7232131929996325e-05, "loss": 1.1728, "step": 84000 }, { "epoch": 2.5691941100995908, "grad_norm": 4.027005672454834, "learning_rate": 3.7216833924856196e-05, "loss": 1.1391, "step": 84100 }, { "epoch": 2.5722490376978064, "grad_norm": 2.6760764122009277, "learning_rate": 3.720153591971607e-05, "loss": 1.208, "step": 84200 }, { "epoch": 2.5753039652960226, "grad_norm": 7.8956217765808105, "learning_rate": 3.718623791457594e-05, "loss": 1.1109, "step": 84300 }, { "epoch": 2.5783588928942383, 
"grad_norm": 5.012131690979004, "learning_rate": 3.717093990943581e-05, "loss": 1.2318, "step": 84400 }, { "epoch": 2.5814138204924544, "grad_norm": 7.069117069244385, "learning_rate": 3.715564190429568e-05, "loss": 1.44, "step": 84500 }, { "epoch": 2.5844687480906705, "grad_norm": 7.637176990509033, "learning_rate": 3.714034389915555e-05, "loss": 1.18, "step": 84600 }, { "epoch": 2.587523675688886, "grad_norm": 5.741633415222168, "learning_rate": 3.712504589401542e-05, "loss": 1.2804, "step": 84700 }, { "epoch": 2.590578603287102, "grad_norm": 7.103206157684326, "learning_rate": 3.710974788887529e-05, "loss": 1.3692, "step": 84800 }, { "epoch": 2.593633530885318, "grad_norm": 4.079885959625244, "learning_rate": 3.709444988373516e-05, "loss": 1.276, "step": 84900 }, { "epoch": 2.596688458483534, "grad_norm": 5.2246479988098145, "learning_rate": 3.707915187859503e-05, "loss": 1.2933, "step": 85000 }, { "epoch": 2.59974338608175, "grad_norm": 4.833429336547852, "learning_rate": 3.70638538734549e-05, "loss": 1.3188, "step": 85100 }, { "epoch": 2.602798313679966, "grad_norm": 11.477587699890137, "learning_rate": 3.7048708848366176e-05, "loss": 1.1811, "step": 85200 }, { "epoch": 2.6058532412781816, "grad_norm": 5.056561470031738, "learning_rate": 3.703341084322605e-05, "loss": 1.124, "step": 85300 }, { "epoch": 2.6089081688763978, "grad_norm": 2.5134527683258057, "learning_rate": 3.701811283808592e-05, "loss": 1.1202, "step": 85400 }, { "epoch": 2.6119630964746134, "grad_norm": 2.793060779571533, "learning_rate": 3.700281483294579e-05, "loss": 1.1034, "step": 85500 }, { "epoch": 2.6150180240728296, "grad_norm": 4.571871757507324, "learning_rate": 3.698751682780566e-05, "loss": 1.0702, "step": 85600 }, { "epoch": 2.6180729516710453, "grad_norm": 3.5763347148895264, "learning_rate": 3.6972218822665525e-05, "loss": 1.2301, "step": 85700 }, { "epoch": 2.6211278792692614, "grad_norm": 11.870597839355469, "learning_rate": 3.6956920817525396e-05, "loss": 1.3386, "step": 85800 
}, { "epoch": 2.624182806867477, "grad_norm": 7.040190696716309, "learning_rate": 3.694162281238527e-05, "loss": 1.1226, "step": 85900 }, { "epoch": 2.627237734465693, "grad_norm": 3.1670444011688232, "learning_rate": 3.692632480724514e-05, "loss": 1.0523, "step": 86000 }, { "epoch": 2.6302926620639093, "grad_norm": 0.7648332715034485, "learning_rate": 3.691102680210501e-05, "loss": 1.1571, "step": 86100 }, { "epoch": 2.633347589662125, "grad_norm": 27.462358474731445, "learning_rate": 3.689572879696488e-05, "loss": 1.3515, "step": 86200 }, { "epoch": 2.6364025172603407, "grad_norm": 3.599266529083252, "learning_rate": 3.688043079182475e-05, "loss": 1.2769, "step": 86300 }, { "epoch": 2.639457444858557, "grad_norm": 4.287918567657471, "learning_rate": 3.686513278668462e-05, "loss": 1.1693, "step": 86400 }, { "epoch": 2.642512372456773, "grad_norm": 10.636252403259277, "learning_rate": 3.684983478154449e-05, "loss": 1.2402, "step": 86500 }, { "epoch": 2.6455673000549886, "grad_norm": 6.561535358428955, "learning_rate": 3.683453677640436e-05, "loss": 1.1243, "step": 86600 }, { "epoch": 2.6486222276532048, "grad_norm": 3.99881649017334, "learning_rate": 3.681923877126423e-05, "loss": 1.1089, "step": 86700 }, { "epoch": 2.6516771552514204, "grad_norm": 5.040919780731201, "learning_rate": 3.68039407661241e-05, "loss": 1.1632, "step": 86800 }, { "epoch": 2.6547320828496366, "grad_norm": 1.5201897621154785, "learning_rate": 3.678864276098397e-05, "loss": 1.1074, "step": 86900 }, { "epoch": 2.6577870104478523, "grad_norm": 3.217228412628174, "learning_rate": 3.677334475584384e-05, "loss": 1.1696, "step": 87000 }, { "epoch": 2.6608419380460684, "grad_norm": 2.4591336250305176, "learning_rate": 3.6758046750703714e-05, "loss": 1.3259, "step": 87100 }, { "epoch": 2.663896865644284, "grad_norm": 15.164278030395508, "learning_rate": 3.674290172561498e-05, "loss": 1.1429, "step": 87200 }, { "epoch": 2.6669517932425, "grad_norm": 5.4548845291137695, "learning_rate": 
3.672760372047485e-05, "loss": 1.4199, "step": 87300 }, { "epoch": 2.670006720840716, "grad_norm": 7.886258602142334, "learning_rate": 3.6712305715334724e-05, "loss": 1.1499, "step": 87400 }, { "epoch": 2.673061648438932, "grad_norm": 7.226693630218506, "learning_rate": 3.669700771019459e-05, "loss": 1.1624, "step": 87500 }, { "epoch": 2.676116576037148, "grad_norm": 9.446024894714355, "learning_rate": 3.668170970505446e-05, "loss": 1.1044, "step": 87600 }, { "epoch": 2.679171503635364, "grad_norm": 2.4468486309051514, "learning_rate": 3.666641169991433e-05, "loss": 1.1352, "step": 87700 }, { "epoch": 2.6822264312335795, "grad_norm": 3.7844583988189697, "learning_rate": 3.66511136947742e-05, "loss": 1.3393, "step": 87800 }, { "epoch": 2.6852813588317956, "grad_norm": 3.6775496006011963, "learning_rate": 3.663581568963407e-05, "loss": 1.1504, "step": 87900 }, { "epoch": 2.6883362864300118, "grad_norm": 6.129753112792969, "learning_rate": 3.6620517684493944e-05, "loss": 1.0526, "step": 88000 }, { "epoch": 2.6913912140282275, "grad_norm": 11.557202339172363, "learning_rate": 3.6605219679353816e-05, "loss": 1.3494, "step": 88100 }, { "epoch": 2.6944461416264436, "grad_norm": 9.309710502624512, "learning_rate": 3.658992167421369e-05, "loss": 1.2957, "step": 88200 }, { "epoch": 2.6975010692246593, "grad_norm": 4.096008777618408, "learning_rate": 3.657462366907355e-05, "loss": 1.3352, "step": 88300 }, { "epoch": 2.7005559968228754, "grad_norm": 8.638252258300781, "learning_rate": 3.655932566393342e-05, "loss": 1.1125, "step": 88400 }, { "epoch": 2.703610924421091, "grad_norm": 3.886626958847046, "learning_rate": 3.654402765879329e-05, "loss": 0.9983, "step": 88500 }, { "epoch": 2.706665852019307, "grad_norm": 0.3538905382156372, "learning_rate": 3.6528729653653165e-05, "loss": 1.304, "step": 88600 }, { "epoch": 2.7097207796175233, "grad_norm": 5.2617926597595215, "learning_rate": 3.6513431648513036e-05, "loss": 1.1223, "step": 88700 }, { "epoch": 2.712775707215739, 
"grad_norm": 15.284337997436523, "learning_rate": 3.649813364337291e-05, "loss": 1.1987, "step": 88800 }, { "epoch": 2.7158306348139547, "grad_norm": 5.358856201171875, "learning_rate": 3.648283563823278e-05, "loss": 1.3171, "step": 88900 }, { "epoch": 2.718885562412171, "grad_norm": 3.3485710620880127, "learning_rate": 3.646753763309265e-05, "loss": 1.19, "step": 89000 }, { "epoch": 2.721940490010387, "grad_norm": 3.345658302307129, "learning_rate": 3.6452239627952514e-05, "loss": 1.2676, "step": 89100 }, { "epoch": 2.7249954176086026, "grad_norm": 9.497928619384766, "learning_rate": 3.6436941622812385e-05, "loss": 1.1102, "step": 89200 }, { "epoch": 2.7280503452068183, "grad_norm": 7.673614978790283, "learning_rate": 3.6421643617672256e-05, "loss": 0.9989, "step": 89300 }, { "epoch": 2.7311052728050345, "grad_norm": 3.095411777496338, "learning_rate": 3.6406498592583524e-05, "loss": 1.1854, "step": 89400 }, { "epoch": 2.7341602004032506, "grad_norm": 11.466713905334473, "learning_rate": 3.6391200587443395e-05, "loss": 1.5457, "step": 89500 }, { "epoch": 2.7372151280014663, "grad_norm": 11.617925643920898, "learning_rate": 3.6375902582303266e-05, "loss": 1.115, "step": 89600 }, { "epoch": 2.7402700555996824, "grad_norm": 7.993494987487793, "learning_rate": 3.636060457716314e-05, "loss": 1.3089, "step": 89700 }, { "epoch": 2.743324983197898, "grad_norm": 4.979888916015625, "learning_rate": 3.634530657202301e-05, "loss": 1.142, "step": 89800 }, { "epoch": 2.746379910796114, "grad_norm": 6.180172920227051, "learning_rate": 3.633000856688288e-05, "loss": 1.311, "step": 89900 }, { "epoch": 2.74943483839433, "grad_norm": 2.8136937618255615, "learning_rate": 3.631471056174275e-05, "loss": 1.1723, "step": 90000 }, { "epoch": 2.752489765992546, "grad_norm": 12.260469436645508, "learning_rate": 3.629941255660262e-05, "loss": 1.4877, "step": 90100 }, { "epoch": 2.755544693590762, "grad_norm": 8.104580879211426, "learning_rate": 3.6284114551462486e-05, "loss": 1.0212, "step": 
90200 }, { "epoch": 2.758599621188978, "grad_norm": 4.810283660888672, "learning_rate": 3.626881654632236e-05, "loss": 1.1489, "step": 90300 }, { "epoch": 2.7616545487871935, "grad_norm": 4.126216888427734, "learning_rate": 3.625351854118223e-05, "loss": 1.107, "step": 90400 }, { "epoch": 2.7647094763854096, "grad_norm": 4.295004367828369, "learning_rate": 3.62382205360421e-05, "loss": 1.0719, "step": 90500 }, { "epoch": 2.7677644039836258, "grad_norm": 7.011680603027344, "learning_rate": 3.622292253090197e-05, "loss": 1.2707, "step": 90600 }, { "epoch": 2.7708193315818415, "grad_norm": 2.7568280696868896, "learning_rate": 3.620762452576184e-05, "loss": 1.1977, "step": 90700 }, { "epoch": 2.7738742591800576, "grad_norm": 4.980713367462158, "learning_rate": 3.619232652062171e-05, "loss": 1.1931, "step": 90800 }, { "epoch": 2.7769291867782733, "grad_norm": 4.1475067138671875, "learning_rate": 3.6177028515481584e-05, "loss": 1.4218, "step": 90900 }, { "epoch": 2.7799841143764894, "grad_norm": 17.66010856628418, "learning_rate": 3.616173051034145e-05, "loss": 1.0312, "step": 91000 }, { "epoch": 2.783039041974705, "grad_norm": 11.250906944274902, "learning_rate": 3.614643250520132e-05, "loss": 1.4046, "step": 91100 }, { "epoch": 2.786093969572921, "grad_norm": 3.021059513092041, "learning_rate": 3.613113450006119e-05, "loss": 1.2493, "step": 91200 }, { "epoch": 2.789148897171137, "grad_norm": 3.5739548206329346, "learning_rate": 3.611583649492106e-05, "loss": 1.0393, "step": 91300 }, { "epoch": 2.792203824769353, "grad_norm": 3.9998271465301514, "learning_rate": 3.6100538489780933e-05, "loss": 1.088, "step": 91400 }, { "epoch": 2.7952587523675687, "grad_norm": 7.070734977722168, "learning_rate": 3.608539346469221e-05, "loss": 1.1436, "step": 91500 }, { "epoch": 2.798313679965785, "grad_norm": 4.2065839767456055, "learning_rate": 3.607009545955208e-05, "loss": 1.0456, "step": 91600 }, { "epoch": 2.801368607564001, "grad_norm": 8.773880958557129, "learning_rate": 
3.605479745441195e-05, "loss": 1.184, "step": 91700 }, { "epoch": 2.8044235351622167, "grad_norm": 6.073965549468994, "learning_rate": 3.603949944927182e-05, "loss": 1.1699, "step": 91800 }, { "epoch": 2.8074784627604323, "grad_norm": 4.126655101776123, "learning_rate": 3.6024201444131686e-05, "loss": 1.052, "step": 91900 }, { "epoch": 2.8105333903586485, "grad_norm": 2.200648784637451, "learning_rate": 3.600890343899156e-05, "loss": 1.3698, "step": 92000 }, { "epoch": 2.8135883179568646, "grad_norm": 5.055072784423828, "learning_rate": 3.599360543385143e-05, "loss": 1.0363, "step": 92100 }, { "epoch": 2.8166432455550803, "grad_norm": 3.9162421226501465, "learning_rate": 3.59783074287113e-05, "loss": 1.3419, "step": 92200 }, { "epoch": 2.8196981731532964, "grad_norm": 4.10654878616333, "learning_rate": 3.596300942357117e-05, "loss": 1.2493, "step": 92300 }, { "epoch": 2.822753100751512, "grad_norm": 5.665062427520752, "learning_rate": 3.594771141843104e-05, "loss": 0.998, "step": 92400 }, { "epoch": 2.825808028349728, "grad_norm": 4.1935577392578125, "learning_rate": 3.593241341329091e-05, "loss": 1.1017, "step": 92500 }, { "epoch": 2.828862955947944, "grad_norm": 11.51850414276123, "learning_rate": 3.5917115408150784e-05, "loss": 1.2248, "step": 92600 }, { "epoch": 2.83191788354616, "grad_norm": 11.625690460205078, "learning_rate": 3.590181740301065e-05, "loss": 1.1488, "step": 92700 }, { "epoch": 2.8349728111443757, "grad_norm": 3.1991724967956543, "learning_rate": 3.588651939787052e-05, "loss": 1.1318, "step": 92800 }, { "epoch": 2.838027738742592, "grad_norm": 1.6752854585647583, "learning_rate": 3.587122139273039e-05, "loss": 1.0442, "step": 92900 }, { "epoch": 2.8410826663408075, "grad_norm": 5.884594917297363, "learning_rate": 3.585592338759026e-05, "loss": 1.0908, "step": 93000 }, { "epoch": 2.8441375939390237, "grad_norm": 10.092119216918945, "learning_rate": 3.584062538245013e-05, "loss": 1.1826, "step": 93100 }, { "epoch": 2.84719252153724, "grad_norm": 
2.494101047515869, "learning_rate": 3.5825327377310004e-05, "loss": 1.2812, "step": 93200 }, { "epoch": 2.8502474491354555, "grad_norm": 4.2990641593933105, "learning_rate": 3.5810029372169875e-05, "loss": 1.1301, "step": 93300 }, { "epoch": 2.853302376733671, "grad_norm": 1.3677186965942383, "learning_rate": 3.5794731367029747e-05, "loss": 1.3642, "step": 93400 }, { "epoch": 2.8563573043318873, "grad_norm": 2.856877565383911, "learning_rate": 3.5779586341941015e-05, "loss": 1.1834, "step": 93500 }, { "epoch": 2.8594122319301034, "grad_norm": 5.638931751251221, "learning_rate": 3.5764288336800886e-05, "loss": 1.1668, "step": 93600 }, { "epoch": 2.862467159528319, "grad_norm": 3.2125723361968994, "learning_rate": 3.574899033166075e-05, "loss": 1.1466, "step": 93700 }, { "epoch": 2.8655220871265352, "grad_norm": 4.384485721588135, "learning_rate": 3.573369232652062e-05, "loss": 1.4209, "step": 93800 }, { "epoch": 2.868577014724751, "grad_norm": 4.23681640625, "learning_rate": 3.571839432138049e-05, "loss": 1.0677, "step": 93900 }, { "epoch": 2.871631942322967, "grad_norm": 5.866428375244141, "learning_rate": 3.5703096316240364e-05, "loss": 1.1571, "step": 94000 }, { "epoch": 2.8746868699211827, "grad_norm": 5.092837333679199, "learning_rate": 3.5687798311100235e-05, "loss": 1.3121, "step": 94100 }, { "epoch": 2.877741797519399, "grad_norm": 1.3914189338684082, "learning_rate": 3.56726532860115e-05, "loss": 1.1406, "step": 94200 }, { "epoch": 2.880796725117615, "grad_norm": 1.6459842920303345, "learning_rate": 3.5657355280871374e-05, "loss": 1.0341, "step": 94300 }, { "epoch": 2.8838516527158307, "grad_norm": 6.084038734436035, "learning_rate": 3.5642057275731245e-05, "loss": 1.0185, "step": 94400 }, { "epoch": 2.8869065803140463, "grad_norm": 2.1041250228881836, "learning_rate": 3.5626759270591116e-05, "loss": 1.13, "step": 94500 }, { "epoch": 2.8899615079122625, "grad_norm": 0.017624640837311745, "learning_rate": 3.561146126545099e-05, "loss": 1.1235, "step": 94600 
}, { "epoch": 2.8930164355104786, "grad_norm": 2.9495761394500732, "learning_rate": 3.559616326031086e-05, "loss": 1.2972, "step": 94700 }, { "epoch": 2.8960713631086943, "grad_norm": 0.7582499384880066, "learning_rate": 3.558086525517072e-05, "loss": 1.2099, "step": 94800 }, { "epoch": 2.89912629070691, "grad_norm": 5.334167003631592, "learning_rate": 3.5565567250030594e-05, "loss": 1.2623, "step": 94900 }, { "epoch": 2.902181218305126, "grad_norm": 17.52392578125, "learning_rate": 3.5550269244890465e-05, "loss": 1.2304, "step": 95000 }, { "epoch": 2.9052361459033422, "grad_norm": 2.240028142929077, "learning_rate": 3.5534971239750336e-05, "loss": 1.1974, "step": 95100 }, { "epoch": 2.908291073501558, "grad_norm": 3.9233059883117676, "learning_rate": 3.551967323461021e-05, "loss": 1.0685, "step": 95200 }, { "epoch": 2.911346001099774, "grad_norm": 3.115082263946533, "learning_rate": 3.550437522947008e-05, "loss": 0.9877, "step": 95300 }, { "epoch": 2.9144009286979897, "grad_norm": 6.149535655975342, "learning_rate": 3.548907722432995e-05, "loss": 1.1253, "step": 95400 }, { "epoch": 2.917455856296206, "grad_norm": 6.213714122772217, "learning_rate": 3.547377921918982e-05, "loss": 1.256, "step": 95500 }, { "epoch": 2.9205107838944215, "grad_norm": 10.935718536376953, "learning_rate": 3.5458481214049685e-05, "loss": 1.1948, "step": 95600 }, { "epoch": 2.9235657114926377, "grad_norm": 3.0346519947052, "learning_rate": 3.5443183208909556e-05, "loss": 1.1022, "step": 95700 }, { "epoch": 2.926620639090854, "grad_norm": 8.563993453979492, "learning_rate": 3.542788520376943e-05, "loss": 1.0593, "step": 95800 }, { "epoch": 2.9296755666890695, "grad_norm": 1.6278303861618042, "learning_rate": 3.54125871986293e-05, "loss": 1.2639, "step": 95900 }, { "epoch": 2.932730494287285, "grad_norm": 1.3326663970947266, "learning_rate": 3.539728919348917e-05, "loss": 1.1684, "step": 96000 }, { "epoch": 2.9357854218855013, "grad_norm": 8.367751121520996, "learning_rate": 
3.538199118834904e-05, "loss": 1.2524, "step": 96100 }, { "epoch": 2.9388403494837174, "grad_norm": 7.878076553344727, "learning_rate": 3.536669318320891e-05, "loss": 1.0507, "step": 96200 }, { "epoch": 2.941895277081933, "grad_norm": 3.120497703552246, "learning_rate": 3.5351395178068783e-05, "loss": 1.0112, "step": 96300 }, { "epoch": 2.9449502046801492, "grad_norm": 9.512770652770996, "learning_rate": 3.533609717292865e-05, "loss": 1.2574, "step": 96400 }, { "epoch": 2.948005132278365, "grad_norm": 10.365994453430176, "learning_rate": 3.532079916778852e-05, "loss": 1.0112, "step": 96500 }, { "epoch": 2.951060059876581, "grad_norm": 13.449652671813965, "learning_rate": 3.530550116264839e-05, "loss": 1.3926, "step": 96600 }, { "epoch": 2.9541149874747967, "grad_norm": 8.135039329528809, "learning_rate": 3.529020315750826e-05, "loss": 1.1585, "step": 96700 }, { "epoch": 2.957169915073013, "grad_norm": 4.279914855957031, "learning_rate": 3.527490515236813e-05, "loss": 1.1064, "step": 96800 }, { "epoch": 2.9602248426712285, "grad_norm": 2.8328933715820312, "learning_rate": 3.5259607147228004e-05, "loss": 1.0075, "step": 96900 }, { "epoch": 2.9632797702694447, "grad_norm": 5.973811149597168, "learning_rate": 3.5244309142087875e-05, "loss": 1.0137, "step": 97000 }, { "epoch": 2.9663346978676604, "grad_norm": 5.440450668334961, "learning_rate": 3.5229011136947746e-05, "loss": 0.983, "step": 97100 }, { "epoch": 2.9693896254658765, "grad_norm": 7.617842197418213, "learning_rate": 3.521371313180761e-05, "loss": 1.096, "step": 97200 }, { "epoch": 2.9724445530640926, "grad_norm": 2.359002113342285, "learning_rate": 3.519841512666748e-05, "loss": 1.425, "step": 97300 }, { "epoch": 2.9754994806623083, "grad_norm": 7.6081156730651855, "learning_rate": 3.518311712152735e-05, "loss": 1.3298, "step": 97400 }, { "epoch": 2.978554408260524, "grad_norm": 14.962251663208008, "learning_rate": 3.5167819116387224e-05, "loss": 1.3354, "step": 97500 }, { "epoch": 2.98160933585874, 
"grad_norm": 6.850179672241211, "learning_rate": 3.5152521111247095e-05, "loss": 1.2817, "step": 97600 }, { "epoch": 2.9846642634569562, "grad_norm": 4.044615745544434, "learning_rate": 3.5137223106106966e-05, "loss": 1.1176, "step": 97700 }, { "epoch": 2.987719191055172, "grad_norm": 7.540977478027344, "learning_rate": 3.512192510096684e-05, "loss": 1.2165, "step": 97800 }, { "epoch": 2.990774118653388, "grad_norm": 10.70987319946289, "learning_rate": 3.510662709582671e-05, "loss": 1.0701, "step": 97900 }, { "epoch": 2.9938290462516037, "grad_norm": 2.2019271850585938, "learning_rate": 3.509148207073798e-05, "loss": 1.3646, "step": 98000 }, { "epoch": 2.99688397384982, "grad_norm": 5.027231693267822, "learning_rate": 3.507618406559785e-05, "loss": 1.3256, "step": 98100 }, { "epoch": 2.9999389014480355, "grad_norm": 11.271944999694824, "learning_rate": 3.506088606045772e-05, "loss": 1.198, "step": 98200 }, { "epoch": 3.0, "eval_accuracy": 0.7097207796175231, "eval_loss": 0.7809557318687439, "eval_runtime": 1782.749, "eval_samples_per_second": 18.362, "eval_steps_per_second": 4.591, "step": 98202 }, { "epoch": 3.0029938290462517, "grad_norm": 4.658991813659668, "learning_rate": 3.504558805531759e-05, "loss": 1.1387, "step": 98300 }, { "epoch": 3.0060487566444674, "grad_norm": 9.121160507202148, "learning_rate": 3.503029005017746e-05, "loss": 1.2615, "step": 98400 }, { "epoch": 3.0091036842426835, "grad_norm": 10.340275764465332, "learning_rate": 3.501499204503733e-05, "loss": 0.9685, "step": 98500 }, { "epoch": 3.012158611840899, "grad_norm": 6.134807109832764, "learning_rate": 3.49996940398972e-05, "loss": 1.3476, "step": 98600 }, { "epoch": 3.0152135394391153, "grad_norm": 6.84115743637085, "learning_rate": 3.4984396034757074e-05, "loss": 1.1311, "step": 98700 }, { "epoch": 3.0182684670373314, "grad_norm": 9.961685180664062, "learning_rate": 3.4969098029616946e-05, "loss": 1.0183, "step": 98800 }, { "epoch": 3.021323394635547, "grad_norm": 10.408890724182129, 
"learning_rate": 3.495380002447681e-05, "loss": 1.1134, "step": 98900 }, { "epoch": 3.0243783222337632, "grad_norm": 5.6719183921813965, "learning_rate": 3.4938502019336674e-05, "loss": 1.2485, "step": 99000 }, { "epoch": 3.027433249831979, "grad_norm": 4.5017523765563965, "learning_rate": 3.4923204014196546e-05, "loss": 1.0285, "step": 99100 }, { "epoch": 3.030488177430195, "grad_norm": 3.855346918106079, "learning_rate": 3.490790600905642e-05, "loss": 1.2724, "step": 99200 }, { "epoch": 3.0335431050284107, "grad_norm": 7.594958782196045, "learning_rate": 3.489260800391629e-05, "loss": 1.0123, "step": 99300 }, { "epoch": 3.036598032626627, "grad_norm": 5.651984214782715, "learning_rate": 3.487730999877616e-05, "loss": 1.0763, "step": 99400 }, { "epoch": 3.0396529602248425, "grad_norm": 3.7878353595733643, "learning_rate": 3.486201199363603e-05, "loss": 1.1441, "step": 99500 }, { "epoch": 3.0427078878230587, "grad_norm": 3.542327404022217, "learning_rate": 3.48467139884959e-05, "loss": 1.0631, "step": 99600 }, { "epoch": 3.0457628154212744, "grad_norm": 2.071479082107544, "learning_rate": 3.483141598335577e-05, "loss": 1.1079, "step": 99700 }, { "epoch": 3.0488177430194905, "grad_norm": 7.825259208679199, "learning_rate": 3.481611797821564e-05, "loss": 1.1047, "step": 99800 }, { "epoch": 3.051872670617706, "grad_norm": 6.0007452964782715, "learning_rate": 3.480081997307551e-05, "loss": 1.3561, "step": 99900 }, { "epoch": 3.0549275982159223, "grad_norm": 3.6153903007507324, "learning_rate": 3.478552196793538e-05, "loss": 1.1561, "step": 100000 }, { "epoch": 3.057982525814138, "grad_norm": 5.3658552169799805, "learning_rate": 3.477022396279525e-05, "loss": 1.1673, "step": 100100 }, { "epoch": 3.061037453412354, "grad_norm": 6.298893928527832, "learning_rate": 3.475492595765512e-05, "loss": 1.0913, "step": 100200 }, { "epoch": 3.0640923810105702, "grad_norm": 3.745986223220825, "learning_rate": 3.473962795251499e-05, "loss": 1.2041, "step": 100300 }, { "epoch": 
3.067147308608786, "grad_norm": 2.570671558380127, "learning_rate": 3.4724329947374864e-05, "loss": 1.2944, "step": 100400 }, { "epoch": 3.070202236207002, "grad_norm": 2.7574243545532227, "learning_rate": 3.4709031942234735e-05, "loss": 1.2773, "step": 100500 }, { "epoch": 3.0732571638052177, "grad_norm": 2.7146689891815186, "learning_rate": 3.46937339370946e-05, "loss": 1.0322, "step": 100600 }, { "epoch": 3.076312091403434, "grad_norm": 5.170768737792969, "learning_rate": 3.467843593195447e-05, "loss": 1.084, "step": 100700 }, { "epoch": 3.0793670190016496, "grad_norm": 6.875863552093506, "learning_rate": 3.466313792681434e-05, "loss": 1.2769, "step": 100800 }, { "epoch": 3.0824219465998657, "grad_norm": 10.701215744018555, "learning_rate": 3.464783992167421e-05, "loss": 1.3305, "step": 100900 }, { "epoch": 3.0854768741980814, "grad_norm": 5.790326118469238, "learning_rate": 3.4632541916534084e-05, "loss": 0.9886, "step": 101000 }, { "epoch": 3.0885318017962975, "grad_norm": 10.692790031433105, "learning_rate": 3.4617243911393955e-05, "loss": 1.0596, "step": 101100 }, { "epoch": 3.091586729394513, "grad_norm": 0.11457054316997528, "learning_rate": 3.4601945906253826e-05, "loss": 1.0932, "step": 101200 }, { "epoch": 3.0946416569927293, "grad_norm": 2.845723867416382, "learning_rate": 3.45866479011137e-05, "loss": 1.1524, "step": 101300 }, { "epoch": 3.097696584590945, "grad_norm": 8.70290470123291, "learning_rate": 3.457134989597356e-05, "loss": 1.2215, "step": 101400 }, { "epoch": 3.100751512189161, "grad_norm": 3.0512425899505615, "learning_rate": 3.455605189083343e-05, "loss": 1.051, "step": 101500 }, { "epoch": 3.1038064397873772, "grad_norm": 4.125731945037842, "learning_rate": 3.4540753885693304e-05, "loss": 1.3042, "step": 101600 }, { "epoch": 3.106861367385593, "grad_norm": 10.031002044677734, "learning_rate": 3.4525455880553175e-05, "loss": 1.1348, "step": 101700 }, { "epoch": 3.109916294983809, "grad_norm": 7.4683637619018555, "learning_rate": 
3.4510157875413046e-05, "loss": 1.2125, "step": 101800 }, { "epoch": 3.1129712225820247, "grad_norm": 3.547574520111084, "learning_rate": 3.449485987027292e-05, "loss": 1.1046, "step": 101900 }, { "epoch": 3.116026150180241, "grad_norm": 17.192188262939453, "learning_rate": 3.447971484518419e-05, "loss": 1.2065, "step": 102000 }, { "epoch": 3.1190810777784566, "grad_norm": 2.1484580039978027, "learning_rate": 3.4464416840044063e-05, "loss": 1.235, "step": 102100 }, { "epoch": 3.1221360053766727, "grad_norm": 2.382115602493286, "learning_rate": 3.4449118834903935e-05, "loss": 1.0214, "step": 102200 }, { "epoch": 3.1251909329748884, "grad_norm": 9.626750946044922, "learning_rate": 3.44338208297638e-05, "loss": 1.2341, "step": 102300 }, { "epoch": 3.1282458605731045, "grad_norm": 7.635776042938232, "learning_rate": 3.441852282462367e-05, "loss": 1.3048, "step": 102400 }, { "epoch": 3.13130078817132, "grad_norm": 1.5449594259262085, "learning_rate": 3.440322481948354e-05, "loss": 1.1685, "step": 102500 }, { "epoch": 3.1343557157695363, "grad_norm": 2.670468807220459, "learning_rate": 3.438792681434341e-05, "loss": 1.0141, "step": 102600 }, { "epoch": 3.137410643367752, "grad_norm": 7.990098476409912, "learning_rate": 3.4372628809203284e-05, "loss": 1.1673, "step": 102700 }, { "epoch": 3.140465570965968, "grad_norm": 11.402932167053223, "learning_rate": 3.435748378411455e-05, "loss": 1.2708, "step": 102800 }, { "epoch": 3.1435204985641843, "grad_norm": 10.994685173034668, "learning_rate": 3.434218577897442e-05, "loss": 1.2813, "step": 102900 }, { "epoch": 3.1465754261624, "grad_norm": 8.80547046661377, "learning_rate": 3.4326887773834294e-05, "loss": 1.2177, "step": 103000 }, { "epoch": 3.149630353760616, "grad_norm": 8.42204475402832, "learning_rate": 3.4311589768694165e-05, "loss": 1.0678, "step": 103100 }, { "epoch": 3.1526852813588317, "grad_norm": 4.392109394073486, "learning_rate": 3.4296291763554036e-05, "loss": 1.0808, "step": 103200 }, { "epoch": 
3.155740208957048, "grad_norm": 9.018641471862793, "learning_rate": 3.428099375841391e-05, "loss": 1.1999, "step": 103300 }, { "epoch": 3.1587951365552636, "grad_norm": 7.8954854011535645, "learning_rate": 3.426569575327377e-05, "loss": 0.9657, "step": 103400 }, { "epoch": 3.1618500641534797, "grad_norm": 1.4429759979248047, "learning_rate": 3.425039774813364e-05, "loss": 1.0844, "step": 103500 }, { "epoch": 3.1649049917516954, "grad_norm": 5.75102424621582, "learning_rate": 3.4235099742993514e-05, "loss": 1.2737, "step": 103600 }, { "epoch": 3.1679599193499115, "grad_norm": 6.8921122550964355, "learning_rate": 3.4219801737853385e-05, "loss": 1.0959, "step": 103700 }, { "epoch": 3.171014846948127, "grad_norm": 3.6340596675872803, "learning_rate": 3.4204503732713256e-05, "loss": 1.0954, "step": 103800 }, { "epoch": 3.1740697745463433, "grad_norm": 19.560989379882812, "learning_rate": 3.418920572757313e-05, "loss": 1.0977, "step": 103900 }, { "epoch": 3.177124702144559, "grad_norm": 3.10819673538208, "learning_rate": 3.4173907722433e-05, "loss": 1.0771, "step": 104000 }, { "epoch": 3.180179629742775, "grad_norm": 21.455175399780273, "learning_rate": 3.415860971729287e-05, "loss": 1.0025, "step": 104100 }, { "epoch": 3.183234557340991, "grad_norm": 2.593312978744507, "learning_rate": 3.4143311712152734e-05, "loss": 1.0311, "step": 104200 }, { "epoch": 3.186289484939207, "grad_norm": 9.986220359802246, "learning_rate": 3.4128013707012605e-05, "loss": 1.0512, "step": 104300 }, { "epoch": 3.189344412537423, "grad_norm": 5.130756378173828, "learning_rate": 3.4112715701872477e-05, "loss": 0.8577, "step": 104400 }, { "epoch": 3.1923993401356388, "grad_norm": 2.575563669204712, "learning_rate": 3.409741769673235e-05, "loss": 1.2148, "step": 104500 }, { "epoch": 3.195454267733855, "grad_norm": 10.687899589538574, "learning_rate": 3.408211969159222e-05, "loss": 1.1314, "step": 104600 }, { "epoch": 3.1985091953320706, "grad_norm": 19.845109939575195, "learning_rate": 
3.406682168645209e-05, "loss": 1.1975, "step": 104700 }, { "epoch": 3.2015641229302867, "grad_norm": 5.279234409332275, "learning_rate": 3.405167666136336e-05, "loss": 1.1001, "step": 104800 }, { "epoch": 3.2046190505285024, "grad_norm": 1.6543632745742798, "learning_rate": 3.403637865622323e-05, "loss": 1.2652, "step": 104900 }, { "epoch": 3.2076739781267185, "grad_norm": 3.378157377243042, "learning_rate": 3.40210806510831e-05, "loss": 1.11, "step": 105000 }, { "epoch": 3.210728905724934, "grad_norm": 6.591047763824463, "learning_rate": 3.400578264594297e-05, "loss": 1.201, "step": 105100 }, { "epoch": 3.2137838333231503, "grad_norm": 2.332491636276245, "learning_rate": 3.3990484640802836e-05, "loss": 1.0388, "step": 105200 }, { "epoch": 3.216838760921366, "grad_norm": 1.7545006275177002, "learning_rate": 3.397518663566271e-05, "loss": 1.1818, "step": 105300 }, { "epoch": 3.219893688519582, "grad_norm": 6.403434753417969, "learning_rate": 3.395988863052258e-05, "loss": 1.3045, "step": 105400 }, { "epoch": 3.222948616117798, "grad_norm": 2.699486255645752, "learning_rate": 3.394459062538245e-05, "loss": 1.0548, "step": 105500 }, { "epoch": 3.226003543716014, "grad_norm": 3.450688600540161, "learning_rate": 3.392929262024232e-05, "loss": 1.2869, "step": 105600 }, { "epoch": 3.2290584713142296, "grad_norm": 8.332239151000977, "learning_rate": 3.391399461510219e-05, "loss": 1.2839, "step": 105700 }, { "epoch": 3.2321133989124458, "grad_norm": 5.950933933258057, "learning_rate": 3.389869660996206e-05, "loss": 1.1488, "step": 105800 }, { "epoch": 3.235168326510662, "grad_norm": 4.980724811553955, "learning_rate": 3.3883398604821934e-05, "loss": 1.2878, "step": 105900 }, { "epoch": 3.2382232541088776, "grad_norm": 6.563425540924072, "learning_rate": 3.38681005996818e-05, "loss": 1.1434, "step": 106000 }, { "epoch": 3.2412781817070937, "grad_norm": 6.405630588531494, "learning_rate": 3.385280259454167e-05, "loss": 1.2905, "step": 106100 }, { "epoch": 3.2443331093053094, 
"grad_norm": 5.564048767089844, "learning_rate": 3.383750458940154e-05, "loss": 1.3337, "step": 106200 }, { "epoch": 3.2473880369035255, "grad_norm": 12.21498966217041, "learning_rate": 3.382220658426141e-05, "loss": 0.9791, "step": 106300 }, { "epoch": 3.250442964501741, "grad_norm": 4.009174823760986, "learning_rate": 3.380690857912128e-05, "loss": 1.1921, "step": 106400 }, { "epoch": 3.2534978920999573, "grad_norm": 6.204054832458496, "learning_rate": 3.3791610573981154e-05, "loss": 1.061, "step": 106500 }, { "epoch": 3.256552819698173, "grad_norm": 1.8835408687591553, "learning_rate": 3.3776312568841025e-05, "loss": 1.1738, "step": 106600 }, { "epoch": 3.259607747296389, "grad_norm": 3.0553841590881348, "learning_rate": 3.3761014563700896e-05, "loss": 1.1092, "step": 106700 }, { "epoch": 3.262662674894605, "grad_norm": 2.027163028717041, "learning_rate": 3.374571655856076e-05, "loss": 1.1073, "step": 106800 }, { "epoch": 3.265717602492821, "grad_norm": 6.837578773498535, "learning_rate": 3.373041855342063e-05, "loss": 1.0253, "step": 106900 }, { "epoch": 3.268772530091037, "grad_norm": 5.914719104766846, "learning_rate": 3.371527352833191e-05, "loss": 1.3284, "step": 107000 }, { "epoch": 3.2718274576892528, "grad_norm": 2.149627447128296, "learning_rate": 3.369997552319178e-05, "loss": 1.0899, "step": 107100 }, { "epoch": 3.2748823852874684, "grad_norm": 5.665188312530518, "learning_rate": 3.368467751805165e-05, "loss": 1.1932, "step": 107200 }, { "epoch": 3.2779373128856846, "grad_norm": 3.7738704681396484, "learning_rate": 3.366937951291152e-05, "loss": 1.1043, "step": 107300 }, { "epoch": 3.2809922404839007, "grad_norm": 8.023479461669922, "learning_rate": 3.365408150777139e-05, "loss": 1.062, "step": 107400 }, { "epoch": 3.2840471680821164, "grad_norm": 13.017199516296387, "learning_rate": 3.363878350263126e-05, "loss": 1.275, "step": 107500 }, { "epoch": 3.2871020956803325, "grad_norm": 9.704121589660645, "learning_rate": 3.3623485497491134e-05, "loss": 
1.2566, "step": 107600 }, { "epoch": 3.290157023278548, "grad_norm": 1.2066890001296997, "learning_rate": 3.3608187492351e-05, "loss": 1.1705, "step": 107700 }, { "epoch": 3.2932119508767643, "grad_norm": 7.091336250305176, "learning_rate": 3.359288948721087e-05, "loss": 1.1722, "step": 107800 }, { "epoch": 3.29626687847498, "grad_norm": 5.988069534301758, "learning_rate": 3.357759148207074e-05, "loss": 1.2878, "step": 107900 }, { "epoch": 3.299321806073196, "grad_norm": 8.537280082702637, "learning_rate": 3.356229347693061e-05, "loss": 1.1086, "step": 108000 }, { "epoch": 3.302376733671412, "grad_norm": 10.873692512512207, "learning_rate": 3.354699547179048e-05, "loss": 1.0681, "step": 108100 }, { "epoch": 3.305431661269628, "grad_norm": 6.819619178771973, "learning_rate": 3.3531697466650354e-05, "loss": 1.1786, "step": 108200 }, { "epoch": 3.3084865888678436, "grad_norm": 3.012190103530884, "learning_rate": 3.3516399461510225e-05, "loss": 1.1915, "step": 108300 }, { "epoch": 3.3115415164660598, "grad_norm": 1.4920376539230347, "learning_rate": 3.3501101456370096e-05, "loss": 1.0751, "step": 108400 }, { "epoch": 3.314596444064276, "grad_norm": 0.5798110365867615, "learning_rate": 3.348580345122996e-05, "loss": 1.0887, "step": 108500 }, { "epoch": 3.3176513716624916, "grad_norm": 3.4853806495666504, "learning_rate": 3.347050544608983e-05, "loss": 1.1365, "step": 108600 }, { "epoch": 3.3207062992607077, "grad_norm": 6.759474754333496, "learning_rate": 3.34552074409497e-05, "loss": 1.2419, "step": 108700 }, { "epoch": 3.3237612268589234, "grad_norm": 2.99652099609375, "learning_rate": 3.3439909435809574e-05, "loss": 1.4198, "step": 108800 }, { "epoch": 3.3268161544571395, "grad_norm": 9.81747055053711, "learning_rate": 3.3424611430669445e-05, "loss": 1.2768, "step": 108900 }, { "epoch": 3.329871082055355, "grad_norm": 6.646118640899658, "learning_rate": 3.340946640558071e-05, "loss": 1.4317, "step": 109000 }, { "epoch": 3.3329260096535713, "grad_norm": 
2.144011974334717, "learning_rate": 3.3394168400440584e-05, "loss": 1.2185, "step": 109100 }, { "epoch": 3.335980937251787, "grad_norm": 3.1215665340423584, "learning_rate": 3.3378870395300455e-05, "loss": 1.1337, "step": 109200 }, { "epoch": 3.339035864850003, "grad_norm": 8.02299976348877, "learning_rate": 3.3363572390160327e-05, "loss": 1.1333, "step": 109300 }, { "epoch": 3.342090792448219, "grad_norm": 11.480727195739746, "learning_rate": 3.33482743850202e-05, "loss": 1.0622, "step": 109400 }, { "epoch": 3.345145720046435, "grad_norm": 0.7370632290840149, "learning_rate": 3.333297637988006e-05, "loss": 1.1977, "step": 109500 }, { "epoch": 3.3482006476446506, "grad_norm": 8.795260429382324, "learning_rate": 3.331767837473993e-05, "loss": 1.058, "step": 109600 }, { "epoch": 3.3512555752428668, "grad_norm": 3.5496344566345215, "learning_rate": 3.3302380369599804e-05, "loss": 1.2247, "step": 109700 }, { "epoch": 3.3543105028410825, "grad_norm": 2.8768632411956787, "learning_rate": 3.3287082364459676e-05, "loss": 1.1976, "step": 109800 }, { "epoch": 3.3573654304392986, "grad_norm": 0.17894093692302704, "learning_rate": 3.327178435931955e-05, "loss": 1.0641, "step": 109900 }, { "epoch": 3.3604203580375147, "grad_norm": 4.932908535003662, "learning_rate": 3.325648635417942e-05, "loss": 1.1351, "step": 110000 }, { "epoch": 3.3634752856357304, "grad_norm": 4.198153972625732, "learning_rate": 3.324118834903929e-05, "loss": 1.1921, "step": 110100 }, { "epoch": 3.3665302132339465, "grad_norm": 4.184053897857666, "learning_rate": 3.322589034389916e-05, "loss": 1.1354, "step": 110200 }, { "epoch": 3.369585140832162, "grad_norm": 6.549489974975586, "learning_rate": 3.3210592338759025e-05, "loss": 1.3167, "step": 110300 }, { "epoch": 3.3726400684303783, "grad_norm": 10.888463973999023, "learning_rate": 3.3195294333618896e-05, "loss": 1.542, "step": 110400 }, { "epoch": 3.375694996028594, "grad_norm": 3.289114475250244, "learning_rate": 3.317999632847877e-05, "loss": 1.2994, 
"step": 110500 }, { "epoch": 3.37874992362681, "grad_norm": 21.504901885986328, "learning_rate": 3.316469832333864e-05, "loss": 1.4213, "step": 110600 }, { "epoch": 3.381804851225026, "grad_norm": 15.273396492004395, "learning_rate": 3.314940031819851e-05, "loss": 1.2337, "step": 110700 }, { "epoch": 3.384859778823242, "grad_norm": 9.156777381896973, "learning_rate": 3.313410231305838e-05, "loss": 1.1234, "step": 110800 }, { "epoch": 3.3879147064214576, "grad_norm": 12.19512939453125, "learning_rate": 3.311880430791825e-05, "loss": 1.275, "step": 110900 }, { "epoch": 3.3909696340196738, "grad_norm": 5.635876655578613, "learning_rate": 3.310350630277812e-05, "loss": 1.1857, "step": 111000 }, { "epoch": 3.39402456161789, "grad_norm": 4.696619510650635, "learning_rate": 3.308820829763799e-05, "loss": 1.1384, "step": 111100 }, { "epoch": 3.3970794892161056, "grad_norm": 4.503530979156494, "learning_rate": 3.307291029249786e-05, "loss": 1.0952, "step": 111200 }, { "epoch": 3.4001344168143213, "grad_norm": 3.3427395820617676, "learning_rate": 3.305761228735773e-05, "loss": 1.1545, "step": 111300 }, { "epoch": 3.4031893444125374, "grad_norm": 10.8120698928833, "learning_rate": 3.30423142822176e-05, "loss": 1.0966, "step": 111400 }, { "epoch": 3.4062442720107535, "grad_norm": 3.6469085216522217, "learning_rate": 3.302701627707747e-05, "loss": 1.1971, "step": 111500 }, { "epoch": 3.409299199608969, "grad_norm": 3.417675256729126, "learning_rate": 3.301171827193734e-05, "loss": 0.9999, "step": 111600 }, { "epoch": 3.4123541272071853, "grad_norm": 5.76259183883667, "learning_rate": 3.2996420266797214e-05, "loss": 1.1023, "step": 111700 }, { "epoch": 3.415409054805401, "grad_norm": 3.8383865356445312, "learning_rate": 3.2981122261657085e-05, "loss": 1.2614, "step": 111800 }, { "epoch": 3.418463982403617, "grad_norm": 6.684022903442383, "learning_rate": 3.296582425651695e-05, "loss": 0.9996, "step": 111900 }, { "epoch": 3.421518910001833, "grad_norm": 4.4537129402160645, 
"learning_rate": 3.295052625137682e-05, "loss": 1.1022, "step": 112000 }, { "epoch": 3.424573837600049, "grad_norm": 4.510301113128662, "learning_rate": 3.293522824623669e-05, "loss": 1.0628, "step": 112100 }, { "epoch": 3.4276287651982646, "grad_norm": 6.627735614776611, "learning_rate": 3.291993024109656e-05, "loss": 1.288, "step": 112200 }, { "epoch": 3.4306836927964808, "grad_norm": 3.7537848949432373, "learning_rate": 3.2904632235956434e-05, "loss": 1.2107, "step": 112300 }, { "epoch": 3.4337386203946965, "grad_norm": 9.15528392791748, "learning_rate": 3.2889334230816305e-05, "loss": 1.3645, "step": 112400 }, { "epoch": 3.4367935479929126, "grad_norm": 3.508101224899292, "learning_rate": 3.2874036225676177e-05, "loss": 1.1195, "step": 112500 }, { "epoch": 3.4398484755911287, "grad_norm": 17.049793243408203, "learning_rate": 3.285873822053605e-05, "loss": 1.0965, "step": 112600 }, { "epoch": 3.4429034031893444, "grad_norm": 13.650372505187988, "learning_rate": 3.2843593195447316e-05, "loss": 1.1358, "step": 112700 }, { "epoch": 3.44595833078756, "grad_norm": 2.2558670043945312, "learning_rate": 3.282829519030719e-05, "loss": 1.0418, "step": 112800 }, { "epoch": 3.449013258385776, "grad_norm": 5.237791538238525, "learning_rate": 3.281299718516706e-05, "loss": 1.2597, "step": 112900 }, { "epoch": 3.4520681859839923, "grad_norm": 14.691550254821777, "learning_rate": 3.279769918002692e-05, "loss": 1.1266, "step": 113000 }, { "epoch": 3.455123113582208, "grad_norm": 12.126757621765137, "learning_rate": 3.2782401174886793e-05, "loss": 1.1784, "step": 113100 }, { "epoch": 3.458178041180424, "grad_norm": 4.572924613952637, "learning_rate": 3.2767103169746665e-05, "loss": 0.97, "step": 113200 }, { "epoch": 3.46123296877864, "grad_norm": 3.525087594985962, "learning_rate": 3.2751805164606536e-05, "loss": 1.0714, "step": 113300 }, { "epoch": 3.464287896376856, "grad_norm": 6.896517276763916, "learning_rate": 3.273650715946641e-05, "loss": 1.3458, "step": 113400 }, { 
"epoch": 3.4673428239750717, "grad_norm": 4.984236240386963, "learning_rate": 3.272120915432628e-05, "loss": 1.3075, "step": 113500 }, { "epoch": 3.470397751573288, "grad_norm": 8.506633758544922, "learning_rate": 3.270591114918615e-05, "loss": 1.0258, "step": 113600 }, { "epoch": 3.4734526791715035, "grad_norm": 5.038571357727051, "learning_rate": 3.269061314404602e-05, "loss": 1.1029, "step": 113700 }, { "epoch": 3.4765076067697196, "grad_norm": 7.582324028015137, "learning_rate": 3.2675315138905885e-05, "loss": 1.5252, "step": 113800 }, { "epoch": 3.4795625343679353, "grad_norm": 2.692021131515503, "learning_rate": 3.2660017133765756e-05, "loss": 1.1836, "step": 113900 }, { "epoch": 3.4826174619661514, "grad_norm": 1.044533133506775, "learning_rate": 3.264471912862563e-05, "loss": 1.1172, "step": 114000 }, { "epoch": 3.4856723895643675, "grad_norm": 4.991665840148926, "learning_rate": 3.26294211234855e-05, "loss": 1.2193, "step": 114100 }, { "epoch": 3.488727317162583, "grad_norm": 7.2558393478393555, "learning_rate": 3.261412311834537e-05, "loss": 1.0472, "step": 114200 }, { "epoch": 3.4917822447607993, "grad_norm": 16.940465927124023, "learning_rate": 3.259882511320524e-05, "loss": 1.279, "step": 114300 }, { "epoch": 3.494837172359015, "grad_norm": 4.868178844451904, "learning_rate": 3.258352710806511e-05, "loss": 1.0672, "step": 114400 }, { "epoch": 3.497892099957231, "grad_norm": 4.789906978607178, "learning_rate": 3.256822910292498e-05, "loss": 1.058, "step": 114500 }, { "epoch": 3.500947027555447, "grad_norm": 3.3610029220581055, "learning_rate": 3.255293109778485e-05, "loss": 1.1544, "step": 114600 }, { "epoch": 3.504001955153663, "grad_norm": 6.413651466369629, "learning_rate": 3.253763309264472e-05, "loss": 1.1485, "step": 114700 }, { "epoch": 3.5070568827518787, "grad_norm": 3.62406587600708, "learning_rate": 3.252233508750459e-05, "loss": 1.1035, "step": 114800 }, { "epoch": 3.510111810350095, "grad_norm": 2.6523101329803467, "learning_rate": 
3.250703708236446e-05, "loss": 1.0266, "step": 114900 }, { "epoch": 3.5131667379483105, "grad_norm": 10.615732192993164, "learning_rate": 3.249173907722433e-05, "loss": 1.1927, "step": 115000 }, { "epoch": 3.5162216655465266, "grad_norm": 4.684859275817871, "learning_rate": 3.24764410720842e-05, "loss": 1.2108, "step": 115100 }, { "epoch": 3.5192765931447427, "grad_norm": 5.885335922241211, "learning_rate": 3.2461143066944074e-05, "loss": 1.1479, "step": 115200 }, { "epoch": 3.5223315207429584, "grad_norm": 16.513399124145508, "learning_rate": 3.2445845061803945e-05, "loss": 1.2975, "step": 115300 }, { "epoch": 3.525386448341174, "grad_norm": 7.12460470199585, "learning_rate": 3.243054705666381e-05, "loss": 1.134, "step": 115400 }, { "epoch": 3.5284413759393902, "grad_norm": 14.912757873535156, "learning_rate": 3.241524905152368e-05, "loss": 1.1733, "step": 115500 }, { "epoch": 3.5314963035376064, "grad_norm": 3.456683397293091, "learning_rate": 3.239995104638355e-05, "loss": 1.1846, "step": 115600 }, { "epoch": 3.534551231135822, "grad_norm": 1.5884753465652466, "learning_rate": 3.238465304124342e-05, "loss": 1.2762, "step": 115700 }, { "epoch": 3.537606158734038, "grad_norm": 4.264137268066406, "learning_rate": 3.2369355036103294e-05, "loss": 1.2741, "step": 115800 }, { "epoch": 3.540661086332254, "grad_norm": 7.918459892272949, "learning_rate": 3.2354057030963166e-05, "loss": 1.095, "step": 115900 }, { "epoch": 3.54371601393047, "grad_norm": 21.39558219909668, "learning_rate": 3.233875902582304e-05, "loss": 1.2129, "step": 116000 }, { "epoch": 3.5467709415286857, "grad_norm": 12.257303237915039, "learning_rate": 3.232346102068291e-05, "loss": 1.015, "step": 116100 }, { "epoch": 3.549825869126902, "grad_norm": 6.592789173126221, "learning_rate": 3.230816301554277e-05, "loss": 1.2219, "step": 116200 }, { "epoch": 3.5528807967251175, "grad_norm": 10.32773494720459, "learning_rate": 3.2292865010402643e-05, "loss": 1.157, "step": 116300 }, { "epoch": 
3.5559357243233336, "grad_norm": 2.9441211223602295, "learning_rate": 3.2277567005262515e-05, "loss": 1.1678, "step": 116400 }, { "epoch": 3.5589906519215493, "grad_norm": 4.8874897956848145, "learning_rate": 3.2262269000122386e-05, "loss": 1.1722, "step": 116500 }, { "epoch": 3.5620455795197654, "grad_norm": 17.394983291625977, "learning_rate": 3.224697099498226e-05, "loss": 1.3023, "step": 116600 }, { "epoch": 3.5651005071179815, "grad_norm": 3.721679449081421, "learning_rate": 3.2231825969893525e-05, "loss": 1.3001, "step": 116700 }, { "epoch": 3.5681554347161972, "grad_norm": 3.0622265338897705, "learning_rate": 3.2216527964753396e-05, "loss": 1.2024, "step": 116800 }, { "epoch": 3.571210362314413, "grad_norm": 7.172619819641113, "learning_rate": 3.220122995961327e-05, "loss": 1.1626, "step": 116900 }, { "epoch": 3.574265289912629, "grad_norm": 7.2092790603637695, "learning_rate": 3.218593195447314e-05, "loss": 1.374, "step": 117000 }, { "epoch": 3.577320217510845, "grad_norm": 9.982060432434082, "learning_rate": 3.217063394933301e-05, "loss": 1.1833, "step": 117100 }, { "epoch": 3.580375145109061, "grad_norm": 4.011295318603516, "learning_rate": 3.2155335944192874e-05, "loss": 1.1522, "step": 117200 }, { "epoch": 3.583430072707277, "grad_norm": 12.791787147521973, "learning_rate": 3.2140037939052745e-05, "loss": 1.0462, "step": 117300 }, { "epoch": 3.5864850003054927, "grad_norm": 14.182363510131836, "learning_rate": 3.2124739933912616e-05, "loss": 1.2998, "step": 117400 }, { "epoch": 3.589539927903709, "grad_norm": 4.25027322769165, "learning_rate": 3.210944192877249e-05, "loss": 1.0125, "step": 117500 }, { "epoch": 3.5925948555019245, "grad_norm": 1.9661411046981812, "learning_rate": 3.209414392363236e-05, "loss": 1.186, "step": 117600 }, { "epoch": 3.5956497831001406, "grad_norm": 3.385308027267456, "learning_rate": 3.207884591849223e-05, "loss": 1.0252, "step": 117700 }, { "epoch": 3.5987047106983563, "grad_norm": 9.881473541259766, "learning_rate": 
3.20635479133521e-05, "loss": 1.1523, "step": 117800 }, { "epoch": 3.6017596382965724, "grad_norm": 5.886788845062256, "learning_rate": 3.204824990821197e-05, "loss": 1.2045, "step": 117900 }, { "epoch": 3.604814565894788, "grad_norm": 7.499330520629883, "learning_rate": 3.2032951903071836e-05, "loss": 1.2884, "step": 118000 }, { "epoch": 3.6078694934930042, "grad_norm": 11.073080062866211, "learning_rate": 3.201765389793171e-05, "loss": 1.0986, "step": 118100 }, { "epoch": 3.6109244210912204, "grad_norm": 3.1535775661468506, "learning_rate": 3.200235589279158e-05, "loss": 1.2812, "step": 118200 }, { "epoch": 3.613979348689436, "grad_norm": 6.089379787445068, "learning_rate": 3.198705788765145e-05, "loss": 1.1118, "step": 118300 }, { "epoch": 3.6170342762876517, "grad_norm": 2.273271083831787, "learning_rate": 3.197175988251132e-05, "loss": 1.1566, "step": 118400 }, { "epoch": 3.620089203885868, "grad_norm": 5.785590648651123, "learning_rate": 3.195646187737119e-05, "loss": 1.0192, "step": 118500 }, { "epoch": 3.623144131484084, "grad_norm": 4.7622480392456055, "learning_rate": 3.194116387223106e-05, "loss": 1.0438, "step": 118600 }, { "epoch": 3.6261990590822997, "grad_norm": 5.416982173919678, "learning_rate": 3.192601884714234e-05, "loss": 1.2225, "step": 118700 }, { "epoch": 3.629253986680516, "grad_norm": 3.769057512283325, "learning_rate": 3.191072084200221e-05, "loss": 1.115, "step": 118800 }, { "epoch": 3.6323089142787315, "grad_norm": 4.778010845184326, "learning_rate": 3.1895422836862074e-05, "loss": 1.1272, "step": 118900 }, { "epoch": 3.6353638418769476, "grad_norm": 3.8819594383239746, "learning_rate": 3.1880124831721945e-05, "loss": 1.2657, "step": 119000 }, { "epoch": 3.6384187694751633, "grad_norm": 2.2964558601379395, "learning_rate": 3.1864826826581816e-05, "loss": 1.1895, "step": 119100 }, { "epoch": 3.6414736970733794, "grad_norm": 2.7459022998809814, "learning_rate": 3.184952882144169e-05, "loss": 1.0707, "step": 119200 }, { "epoch": 
3.6445286246715956, "grad_norm": 10.472131729125977, "learning_rate": 3.183423081630156e-05, "loss": 1.3939, "step": 119300 }, { "epoch": 3.6475835522698112, "grad_norm": 3.7901763916015625, "learning_rate": 3.181893281116143e-05, "loss": 1.3832, "step": 119400 }, { "epoch": 3.650638479868027, "grad_norm": 5.768411636352539, "learning_rate": 3.18036348060213e-05, "loss": 1.0307, "step": 119500 }, { "epoch": 3.653693407466243, "grad_norm": 6.40388298034668, "learning_rate": 3.178833680088117e-05, "loss": 1.1032, "step": 119600 }, { "epoch": 3.656748335064459, "grad_norm": 10.709514617919922, "learning_rate": 3.1773038795741036e-05, "loss": 1.0204, "step": 119700 }, { "epoch": 3.659803262662675, "grad_norm": 7.594704627990723, "learning_rate": 3.175774079060091e-05, "loss": 1.1728, "step": 119800 }, { "epoch": 3.6628581902608905, "grad_norm": 2.95936918258667, "learning_rate": 3.174244278546078e-05, "loss": 1.1619, "step": 119900 }, { "epoch": 3.6659131178591067, "grad_norm": 8.665496826171875, "learning_rate": 3.172714478032065e-05, "loss": 1.0554, "step": 120000 }, { "epoch": 3.668968045457323, "grad_norm": 6.218582630157471, "learning_rate": 3.171184677518052e-05, "loss": 1.2974, "step": 120100 }, { "epoch": 3.6720229730555385, "grad_norm": 1.6152470111846924, "learning_rate": 3.169654877004039e-05, "loss": 1.2447, "step": 120200 }, { "epoch": 3.6750779006537546, "grad_norm": 4.754295349121094, "learning_rate": 3.168125076490026e-05, "loss": 1.2054, "step": 120300 }, { "epoch": 3.6781328282519703, "grad_norm": 6.933786392211914, "learning_rate": 3.1665952759760134e-05, "loss": 1.1301, "step": 120400 }, { "epoch": 3.6811877558501864, "grad_norm": 1.8776979446411133, "learning_rate": 3.165065475462e-05, "loss": 1.1499, "step": 120500 }, { "epoch": 3.684242683448402, "grad_norm": 6.305478096008301, "learning_rate": 3.163535674947987e-05, "loss": 0.9828, "step": 120600 }, { "epoch": 3.6872976110466182, "grad_norm": 11.32885456085205, "learning_rate": 
3.1620211724391144e-05, "loss": 1.386, "step": 120700 }, { "epoch": 3.6903525386448344, "grad_norm": 3.6041526794433594, "learning_rate": 3.160491371925101e-05, "loss": 1.1403, "step": 120800 }, { "epoch": 3.69340746624305, "grad_norm": 5.200024127960205, "learning_rate": 3.158961571411088e-05, "loss": 1.1867, "step": 120900 }, { "epoch": 3.6964623938412657, "grad_norm": 4.821595668792725, "learning_rate": 3.157431770897075e-05, "loss": 1.1326, "step": 121000 }, { "epoch": 3.699517321439482, "grad_norm": 6.276610374450684, "learning_rate": 3.155901970383062e-05, "loss": 1.2586, "step": 121100 }, { "epoch": 3.702572249037698, "grad_norm": 2.143345355987549, "learning_rate": 3.1543721698690493e-05, "loss": 1.2188, "step": 121200 }, { "epoch": 3.7056271766359137, "grad_norm": 2.437786340713501, "learning_rate": 3.1528423693550365e-05, "loss": 1.246, "step": 121300 }, { "epoch": 3.70868210423413, "grad_norm": 4.366087913513184, "learning_rate": 3.1513125688410236e-05, "loss": 0.9416, "step": 121400 }, { "epoch": 3.7117370318323455, "grad_norm": 10.98507308959961, "learning_rate": 3.149782768327011e-05, "loss": 1.2776, "step": 121500 }, { "epoch": 3.7147919594305616, "grad_norm": 8.077423095703125, "learning_rate": 3.148252967812997e-05, "loss": 1.0204, "step": 121600 }, { "epoch": 3.7178468870287773, "grad_norm": 11.440945625305176, "learning_rate": 3.146723167298984e-05, "loss": 1.2277, "step": 121700 }, { "epoch": 3.7209018146269934, "grad_norm": 13.08006477355957, "learning_rate": 3.1451933667849714e-05, "loss": 1.1211, "step": 121800 }, { "epoch": 3.723956742225209, "grad_norm": 14.114601135253906, "learning_rate": 3.1436635662709585e-05, "loss": 1.338, "step": 121900 }, { "epoch": 3.7270116698234252, "grad_norm": 4.147833824157715, "learning_rate": 3.1421337657569456e-05, "loss": 1.1939, "step": 122000 }, { "epoch": 3.730066597421641, "grad_norm": 2.173335075378418, "learning_rate": 3.140603965242933e-05, "loss": 0.9904, "step": 122100 }, { "epoch": 
3.733121525019857, "grad_norm": 1.0117582082748413, "learning_rate": 3.13907416472892e-05, "loss": 1.1817, "step": 122200 }, { "epoch": 3.736176452618073, "grad_norm": 4.836818695068359, "learning_rate": 3.137544364214907e-05, "loss": 1.153, "step": 122300 }, { "epoch": 3.739231380216289, "grad_norm": 3.414306402206421, "learning_rate": 3.1360145637008934e-05, "loss": 1.1321, "step": 122400 }, { "epoch": 3.7422863078145046, "grad_norm": 5.739386558532715, "learning_rate": 3.1344847631868805e-05, "loss": 1.1302, "step": 122500 }, { "epoch": 3.7453412354127207, "grad_norm": 8.772282600402832, "learning_rate": 3.1329549626728676e-05, "loss": 1.1282, "step": 122600 }, { "epoch": 3.748396163010937, "grad_norm": 7.5907464027404785, "learning_rate": 3.1314404601639944e-05, "loss": 1.1954, "step": 122700 }, { "epoch": 3.7514510906091525, "grad_norm": 2.3881170749664307, "learning_rate": 3.1299106596499815e-05, "loss": 1.1865, "step": 122800 }, { "epoch": 3.7545060182073686, "grad_norm": 5.920051574707031, "learning_rate": 3.1283808591359686e-05, "loss": 1.0913, "step": 122900 }, { "epoch": 3.7575609458055843, "grad_norm": 6.340502738952637, "learning_rate": 3.126851058621956e-05, "loss": 1.2684, "step": 123000 }, { "epoch": 3.7606158734038004, "grad_norm": 13.953529357910156, "learning_rate": 3.125321258107943e-05, "loss": 1.1196, "step": 123100 }, { "epoch": 3.763670801002016, "grad_norm": 2.678821325302124, "learning_rate": 3.12379145759393e-05, "loss": 1.148, "step": 123200 }, { "epoch": 3.7667257286002322, "grad_norm": 4.08374547958374, "learning_rate": 3.122261657079917e-05, "loss": 0.9931, "step": 123300 }, { "epoch": 3.769780656198448, "grad_norm": 4.527195453643799, "learning_rate": 3.1207318565659035e-05, "loss": 1.2992, "step": 123400 }, { "epoch": 3.772835583796664, "grad_norm": 6.90880012512207, "learning_rate": 3.1192020560518907e-05, "loss": 1.1887, "step": 123500 }, { "epoch": 3.7758905113948797, "grad_norm": 11.422993659973145, "learning_rate": 
3.117672255537878e-05, "loss": 1.0616, "step": 123600 }, { "epoch": 3.778945438993096, "grad_norm": 3.5831525325775146, "learning_rate": 3.116142455023865e-05, "loss": 1.3278, "step": 123700 }, { "epoch": 3.782000366591312, "grad_norm": 13.411617279052734, "learning_rate": 3.114612654509852e-05, "loss": 1.2322, "step": 123800 }, { "epoch": 3.7850552941895277, "grad_norm": 5.012452125549316, "learning_rate": 3.113082853995839e-05, "loss": 1.1816, "step": 123900 }, { "epoch": 3.7881102217877434, "grad_norm": 5.499082088470459, "learning_rate": 3.111553053481826e-05, "loss": 1.079, "step": 124000 }, { "epoch": 3.7911651493859595, "grad_norm": 18.250707626342773, "learning_rate": 3.1100232529678133e-05, "loss": 1.1843, "step": 124100 }, { "epoch": 3.7942200769841756, "grad_norm": 1.7736492156982422, "learning_rate": 3.1084934524538e-05, "loss": 1.3134, "step": 124200 }, { "epoch": 3.7972750045823913, "grad_norm": 4.700921058654785, "learning_rate": 3.106963651939787e-05, "loss": 1.2205, "step": 124300 }, { "epoch": 3.8003299321806074, "grad_norm": 3.189049482345581, "learning_rate": 3.105433851425774e-05, "loss": 1.1814, "step": 124400 }, { "epoch": 3.803384859778823, "grad_norm": 3.8542063236236572, "learning_rate": 3.103904050911761e-05, "loss": 1.2317, "step": 124500 }, { "epoch": 3.8064397873770393, "grad_norm": 5.694820880889893, "learning_rate": 3.102374250397748e-05, "loss": 1.1466, "step": 124600 }, { "epoch": 3.809494714975255, "grad_norm": 4.0774455070495605, "learning_rate": 3.100859747888876e-05, "loss": 1.1709, "step": 124700 }, { "epoch": 3.812549642573471, "grad_norm": 2.863539457321167, "learning_rate": 3.099329947374863e-05, "loss": 1.2222, "step": 124800 }, { "epoch": 3.815604570171687, "grad_norm": 4.64631986618042, "learning_rate": 3.0978154448659896e-05, "loss": 1.2713, "step": 124900 }, { "epoch": 3.818659497769903, "grad_norm": 7.270814895629883, "learning_rate": 3.096285644351977e-05, "loss": 0.9987, "step": 125000 }, { "epoch": 
3.8217144253681186, "grad_norm": 2.774071216583252, "learning_rate": 3.094755843837964e-05, "loss": 1.3305, "step": 125100 }, { "epoch": 3.8247693529663347, "grad_norm": 8.187067031860352, "learning_rate": 3.093226043323951e-05, "loss": 1.2963, "step": 125200 }, { "epoch": 3.827824280564551, "grad_norm": 6.8783369064331055, "learning_rate": 3.0916962428099374e-05, "loss": 1.2131, "step": 125300 }, { "epoch": 3.8308792081627665, "grad_norm": 11.448376655578613, "learning_rate": 3.0901664422959245e-05, "loss": 1.2464, "step": 125400 }, { "epoch": 3.8339341357609826, "grad_norm": 11.253005027770996, "learning_rate": 3.0886366417819116e-05, "loss": 1.0755, "step": 125500 }, { "epoch": 3.8369890633591983, "grad_norm": 5.660057544708252, "learning_rate": 3.087106841267899e-05, "loss": 1.1623, "step": 125600 }, { "epoch": 3.8400439909574144, "grad_norm": 6.888990879058838, "learning_rate": 3.085577040753886e-05, "loss": 1.2219, "step": 125700 }, { "epoch": 3.84309891855563, "grad_norm": 4.959685325622559, "learning_rate": 3.084047240239873e-05, "loss": 1.1457, "step": 125800 }, { "epoch": 3.8461538461538463, "grad_norm": 7.199165344238281, "learning_rate": 3.08251743972586e-05, "loss": 1.3069, "step": 125900 }, { "epoch": 3.849208773752062, "grad_norm": 8.368234634399414, "learning_rate": 3.080987639211847e-05, "loss": 1.315, "step": 126000 }, { "epoch": 3.852263701350278, "grad_norm": 17.217199325561523, "learning_rate": 3.079457838697834e-05, "loss": 1.0141, "step": 126100 }, { "epoch": 3.8553186289484938, "grad_norm": 8.5066556930542, "learning_rate": 3.077928038183821e-05, "loss": 1.1976, "step": 126200 }, { "epoch": 3.85837355654671, "grad_norm": 3.0581226348876953, "learning_rate": 3.076398237669808e-05, "loss": 1.1097, "step": 126300 }, { "epoch": 3.861428484144926, "grad_norm": 4.513498783111572, "learning_rate": 3.074868437155795e-05, "loss": 1.1935, "step": 126400 }, { "epoch": 3.8644834117431417, "grad_norm": 5.329300403594971, "learning_rate": 
3.073338636641782e-05, "loss": 1.3866, "step": 126500 }, { "epoch": 3.8675383393413574, "grad_norm": 7.060613632202148, "learning_rate": 3.071808836127769e-05, "loss": 1.2348, "step": 126600 }, { "epoch": 3.8705932669395735, "grad_norm": 4.2163615226745605, "learning_rate": 3.0702790356137564e-05, "loss": 1.046, "step": 126700 }, { "epoch": 3.8736481945377896, "grad_norm": 6.511301040649414, "learning_rate": 3.0687492350997435e-05, "loss": 1.1079, "step": 126800 }, { "epoch": 3.8767031221360053, "grad_norm": 7.438748359680176, "learning_rate": 3.06721943458573e-05, "loss": 1.4255, "step": 126900 }, { "epoch": 3.8797580497342214, "grad_norm": 3.3718574047088623, "learning_rate": 3.065689634071717e-05, "loss": 1.1054, "step": 127000 }, { "epoch": 3.882812977332437, "grad_norm": 7.0842132568359375, "learning_rate": 3.064159833557704e-05, "loss": 1.0431, "step": 127100 }, { "epoch": 3.8858679049306533, "grad_norm": 2.1069562435150146, "learning_rate": 3.062630033043691e-05, "loss": 1.1558, "step": 127200 }, { "epoch": 3.888922832528869, "grad_norm": 2.8387815952301025, "learning_rate": 3.0611002325296784e-05, "loss": 1.2421, "step": 127300 }, { "epoch": 3.891977760127085, "grad_norm": 9.572854995727539, "learning_rate": 3.0595704320156655e-05, "loss": 1.1307, "step": 127400 }, { "epoch": 3.8950326877253008, "grad_norm": 7.052920818328857, "learning_rate": 3.0580406315016526e-05, "loss": 1.0729, "step": 127500 }, { "epoch": 3.898087615323517, "grad_norm": 2.5184378623962402, "learning_rate": 3.05651083098764e-05, "loss": 1.3155, "step": 127600 }, { "epoch": 3.9011425429217326, "grad_norm": 6.567754745483398, "learning_rate": 3.054981030473626e-05, "loss": 1.1312, "step": 127700 }, { "epoch": 3.9041974705199487, "grad_norm": 5.300526142120361, "learning_rate": 3.053451229959613e-05, "loss": 1.1642, "step": 127800 }, { "epoch": 3.907252398118165, "grad_norm": 3.9759981632232666, "learning_rate": 3.0519214294456004e-05, "loss": 1.047, "step": 127900 }, { "epoch": 
3.9103073257163805, "grad_norm": 4.425239562988281, "learning_rate": 3.0503916289315875e-05, "loss": 1.1105, "step": 128000 }, { "epoch": 3.913362253314596, "grad_norm": 2.3210442066192627, "learning_rate": 3.0488618284175746e-05, "loss": 1.1971, "step": 128100 }, { "epoch": 3.9164171809128123, "grad_norm": 3.3596367835998535, "learning_rate": 3.0473320279035617e-05, "loss": 1.2001, "step": 128200 }, { "epoch": 3.9194721085110285, "grad_norm": 4.397903919219971, "learning_rate": 3.0458022273895485e-05, "loss": 1.2177, "step": 128300 }, { "epoch": 3.922527036109244, "grad_norm": 5.058310508728027, "learning_rate": 3.0442724268755356e-05, "loss": 1.2263, "step": 128400 }, { "epoch": 3.9255819637074603, "grad_norm": 10.105254173278809, "learning_rate": 3.0427426263615227e-05, "loss": 1.1451, "step": 128500 }, { "epoch": 3.928636891305676, "grad_norm": 6.7497968673706055, "learning_rate": 3.04121282584751e-05, "loss": 1.0464, "step": 128600 }, { "epoch": 3.931691818903892, "grad_norm": 3.9784913063049316, "learning_rate": 3.0396830253334966e-05, "loss": 1.0723, "step": 128700 }, { "epoch": 3.9347467465021078, "grad_norm": 0.002016042824834585, "learning_rate": 3.0381532248194838e-05, "loss": 0.9551, "step": 128800 }, { "epoch": 3.937801674100324, "grad_norm": 4.069670677185059, "learning_rate": 3.036623424305471e-05, "loss": 1.1617, "step": 128900 }, { "epoch": 3.94085660169854, "grad_norm": 13.04504680633545, "learning_rate": 3.0351089217965977e-05, "loss": 1.1916, "step": 129000 }, { "epoch": 3.9439115292967557, "grad_norm": 10.69211483001709, "learning_rate": 3.0335791212825848e-05, "loss": 1.1484, "step": 129100 }, { "epoch": 3.9469664568949714, "grad_norm": 5.319109916687012, "learning_rate": 3.032049320768572e-05, "loss": 1.2785, "step": 129200 }, { "epoch": 3.9500213844931875, "grad_norm": 17.455825805664062, "learning_rate": 3.030519520254559e-05, "loss": 1.0728, "step": 129300 }, { "epoch": 3.9530763120914036, "grad_norm": 10.500404357910156, "learning_rate": 
3.0289897197405458e-05, "loss": 1.6514, "step": 129400 }, { "epoch": 3.9561312396896193, "grad_norm": 3.3658556938171387, "learning_rate": 3.027459919226533e-05, "loss": 1.3522, "step": 129500 }, { "epoch": 3.959186167287835, "grad_norm": 6.071293830871582, "learning_rate": 3.02593011871252e-05, "loss": 1.2511, "step": 129600 }, { "epoch": 3.962241094886051, "grad_norm": 18.586950302124023, "learning_rate": 3.024400318198507e-05, "loss": 1.1664, "step": 129700 }, { "epoch": 3.9652960224842673, "grad_norm": 11.053479194641113, "learning_rate": 3.022870517684494e-05, "loss": 1.2655, "step": 129800 }, { "epoch": 3.968350950082483, "grad_norm": 3.921522617340088, "learning_rate": 3.021340717170481e-05, "loss": 1.1595, "step": 129900 }, { "epoch": 3.971405877680699, "grad_norm": 15.36481761932373, "learning_rate": 3.019810916656468e-05, "loss": 1.1743, "step": 130000 }, { "epoch": 3.9744608052789148, "grad_norm": 5.760214328765869, "learning_rate": 3.0182811161424553e-05, "loss": 0.9441, "step": 130100 }, { "epoch": 3.977515732877131, "grad_norm": 6.384634971618652, "learning_rate": 3.016751315628442e-05, "loss": 1.0591, "step": 130200 }, { "epoch": 3.9805706604753466, "grad_norm": 5.667691230773926, "learning_rate": 3.015221515114429e-05, "loss": 1.3293, "step": 130300 }, { "epoch": 3.9836255880735627, "grad_norm": 6.393319606781006, "learning_rate": 3.0136917146004163e-05, "loss": 1.1806, "step": 130400 }, { "epoch": 3.986680515671779, "grad_norm": 4.59672737121582, "learning_rate": 3.0121619140864034e-05, "loss": 1.287, "step": 130500 }, { "epoch": 3.9897354432699945, "grad_norm": 6.654175281524658, "learning_rate": 3.01063211357239e-05, "loss": 1.1724, "step": 130600 }, { "epoch": 3.99279037086821, "grad_norm": 3.8924570083618164, "learning_rate": 3.0091023130583773e-05, "loss": 1.1715, "step": 130700 }, { "epoch": 3.9958452984664263, "grad_norm": 2.988812208175659, "learning_rate": 3.0075725125443644e-05, "loss": 0.9282, "step": 130800 }, { "epoch": 
3.9989002260646425, "grad_norm": 10.128042221069336, "learning_rate": 3.0060427120303515e-05, "loss": 1.2072, "step": 130900 }, { "epoch": 4.0, "eval_accuracy": 0.7121036231441314, "eval_loss": 0.7748239040374756, "eval_runtime": 1785.0458, "eval_samples_per_second": 18.338, "eval_steps_per_second": 4.585, "step": 130936 }, { "epoch": 4.001955153662858, "grad_norm": 2.5833377838134766, "learning_rate": 3.0045282095214783e-05, "loss": 1.3073, "step": 131000 }, { "epoch": 4.005010081261074, "grad_norm": 5.355751991271973, "learning_rate": 3.0029984090074654e-05, "loss": 1.0811, "step": 131100 }, { "epoch": 4.00806500885929, "grad_norm": 4.671720027923584, "learning_rate": 3.0014686084934522e-05, "loss": 1.1287, "step": 131200 }, { "epoch": 4.011119936457506, "grad_norm": 1.8795313835144043, "learning_rate": 2.9999541059845797e-05, "loss": 1.0344, "step": 131300 }, { "epoch": 4.014174864055722, "grad_norm": 3.870358943939209, "learning_rate": 2.9984243054705668e-05, "loss": 1.1018, "step": 131400 }, { "epoch": 4.0172297916539375, "grad_norm": 1.3954418897628784, "learning_rate": 2.996894504956554e-05, "loss": 1.0517, "step": 131500 }, { "epoch": 4.020284719252154, "grad_norm": 2.2626378536224365, "learning_rate": 2.995364704442541e-05, "loss": 1.1714, "step": 131600 }, { "epoch": 4.02333964685037, "grad_norm": 18.261306762695312, "learning_rate": 2.9938349039285278e-05, "loss": 0.9777, "step": 131700 }, { "epoch": 4.026394574448585, "grad_norm": 3.398371696472168, "learning_rate": 2.992305103414515e-05, "loss": 1.7022, "step": 131800 }, { "epoch": 4.029449502046801, "grad_norm": 10.634683609008789, "learning_rate": 2.990775302900502e-05, "loss": 1.3408, "step": 131900 }, { "epoch": 4.032504429645018, "grad_norm": 3.4038894176483154, "learning_rate": 2.989245502386489e-05, "loss": 1.0956, "step": 132000 }, { "epoch": 4.035559357243233, "grad_norm": 9.867512702941895, "learning_rate": 2.987715701872476e-05, "loss": 1.1892, "step": 132100 }, { "epoch": 4.038614284841449, 
"grad_norm": 12.254446983337402, "learning_rate": 2.986185901358463e-05, "loss": 1.3299, "step": 132200 }, { "epoch": 4.041669212439666, "grad_norm": 5.026933670043945, "learning_rate": 2.98465610084445e-05, "loss": 1.115, "step": 132300 }, { "epoch": 4.044724140037881, "grad_norm": 7.253444194793701, "learning_rate": 2.9831263003304373e-05, "loss": 1.225, "step": 132400 }, { "epoch": 4.047779067636097, "grad_norm": 1.1668365001678467, "learning_rate": 2.981596499816424e-05, "loss": 1.1519, "step": 132500 }, { "epoch": 4.050833995234313, "grad_norm": 2.0015876293182373, "learning_rate": 2.980066699302411e-05, "loss": 1.1844, "step": 132600 }, { "epoch": 4.053888922832529, "grad_norm": 1.1752444505691528, "learning_rate": 2.9785368987883983e-05, "loss": 1.1572, "step": 132700 }, { "epoch": 4.056943850430745, "grad_norm": 3.1533567905426025, "learning_rate": 2.9770070982743854e-05, "loss": 1.0884, "step": 132800 }, { "epoch": 4.059998778028961, "grad_norm": 8.84151840209961, "learning_rate": 2.9754772977603722e-05, "loss": 1.5823, "step": 132900 }, { "epoch": 4.063053705627176, "grad_norm": 5.8620500564575195, "learning_rate": 2.9739474972463593e-05, "loss": 1.169, "step": 133000 }, { "epoch": 4.066108633225393, "grad_norm": 7.552305221557617, "learning_rate": 2.9724176967323464e-05, "loss": 1.3024, "step": 133100 }, { "epoch": 4.0691635608236085, "grad_norm": 2.8654162883758545, "learning_rate": 2.9708878962183335e-05, "loss": 1.2343, "step": 133200 }, { "epoch": 4.072218488421824, "grad_norm": 4.553361892700195, "learning_rate": 2.9693580957043203e-05, "loss": 1.1538, "step": 133300 }, { "epoch": 4.07527341602004, "grad_norm": 4.676537036895752, "learning_rate": 2.9678282951903074e-05, "loss": 1.2307, "step": 133400 }, { "epoch": 4.0783283436182565, "grad_norm": 8.020150184631348, "learning_rate": 2.9662984946762945e-05, "loss": 1.1745, "step": 133500 }, { "epoch": 4.081383271216472, "grad_norm": 4.467293739318848, "learning_rate": 2.9647686941622816e-05, "loss": 
1.0806, "step": 133600 }, { "epoch": 4.084438198814688, "grad_norm": 2.580444574356079, "learning_rate": 2.9632388936482684e-05, "loss": 1.1719, "step": 133700 }, { "epoch": 4.087493126412904, "grad_norm": 3.863105058670044, "learning_rate": 2.9617090931342555e-05, "loss": 1.02, "step": 133800 }, { "epoch": 4.09054805401112, "grad_norm": 3.3739333152770996, "learning_rate": 2.9601792926202427e-05, "loss": 1.2029, "step": 133900 }, { "epoch": 4.093602981609336, "grad_norm": 4.424749374389648, "learning_rate": 2.9586494921062298e-05, "loss": 1.2085, "step": 134000 }, { "epoch": 4.0966579092075515, "grad_norm": 3.976792812347412, "learning_rate": 2.9571196915922165e-05, "loss": 1.3111, "step": 134100 }, { "epoch": 4.099712836805768, "grad_norm": 9.701395034790039, "learning_rate": 2.9555898910782037e-05, "loss": 1.1155, "step": 134200 }, { "epoch": 4.102767764403984, "grad_norm": 0.8991170525550842, "learning_rate": 2.9540753885693308e-05, "loss": 1.0686, "step": 134300 }, { "epoch": 4.105822692002199, "grad_norm": 17.905302047729492, "learning_rate": 2.9525455880553176e-05, "loss": 1.0627, "step": 134400 }, { "epoch": 4.108877619600415, "grad_norm": 5.040673732757568, "learning_rate": 2.9510157875413047e-05, "loss": 1.0886, "step": 134500 }, { "epoch": 4.111932547198632, "grad_norm": 4.204395771026611, "learning_rate": 2.9494859870272918e-05, "loss": 1.238, "step": 134600 }, { "epoch": 4.114987474796847, "grad_norm": 3.686497449874878, "learning_rate": 2.947956186513279e-05, "loss": 1.2616, "step": 134700 }, { "epoch": 4.118042402395063, "grad_norm": 5.15822696685791, "learning_rate": 2.9464263859992657e-05, "loss": 1.2657, "step": 134800 }, { "epoch": 4.12109732999328, "grad_norm": 4.707155227661133, "learning_rate": 2.9448965854852528e-05, "loss": 1.2191, "step": 134900 }, { "epoch": 4.124152257591495, "grad_norm": 3.886612892150879, "learning_rate": 2.94336678497124e-05, "loss": 1.0955, "step": 135000 }, { "epoch": 4.127207185189711, "grad_norm": 
2.287773609161377, "learning_rate": 2.941836984457227e-05, "loss": 1.189, "step": 135100 }, { "epoch": 4.130262112787927, "grad_norm": 4.35518741607666, "learning_rate": 2.9403071839432138e-05, "loss": 1.2268, "step": 135200 }, { "epoch": 4.133317040386143, "grad_norm": 8.079886436462402, "learning_rate": 2.938777383429201e-05, "loss": 1.0111, "step": 135300 }, { "epoch": 4.136371967984359, "grad_norm": 7.398697376251221, "learning_rate": 2.937247582915188e-05, "loss": 1.2892, "step": 135400 }, { "epoch": 4.139426895582575, "grad_norm": 2.7290868759155273, "learning_rate": 2.935717782401175e-05, "loss": 1.1484, "step": 135500 }, { "epoch": 4.14248182318079, "grad_norm": 7.36677885055542, "learning_rate": 2.934187981887162e-05, "loss": 1.0452, "step": 135600 }, { "epoch": 4.145536750779007, "grad_norm": 7.556665897369385, "learning_rate": 2.932658181373149e-05, "loss": 1.1462, "step": 135700 }, { "epoch": 4.1485916783772225, "grad_norm": 17.75133514404297, "learning_rate": 2.9311283808591362e-05, "loss": 1.1841, "step": 135800 }, { "epoch": 4.151646605975438, "grad_norm": 8.796733856201172, "learning_rate": 2.9295985803451233e-05, "loss": 1.2635, "step": 135900 }, { "epoch": 4.154701533573654, "grad_norm": 3.0543625354766846, "learning_rate": 2.92806877983111e-05, "loss": 1.1486, "step": 136000 }, { "epoch": 4.1577564611718705, "grad_norm": 8.879738807678223, "learning_rate": 2.9265389793170972e-05, "loss": 1.0926, "step": 136100 }, { "epoch": 4.160811388770086, "grad_norm": 5.291866302490234, "learning_rate": 2.9250091788030843e-05, "loss": 1.2567, "step": 136200 }, { "epoch": 4.163866316368302, "grad_norm": 11.352781295776367, "learning_rate": 2.9234793782890714e-05, "loss": 1.196, "step": 136300 }, { "epoch": 4.166921243966518, "grad_norm": 4.0807785987854, "learning_rate": 2.9219495777750582e-05, "loss": 1.018, "step": 136400 }, { "epoch": 4.169976171564734, "grad_norm": 7.902457237243652, "learning_rate": 2.9204197772610453e-05, "loss": 1.2478, "step": 136500 
}, { "epoch": 4.17303109916295, "grad_norm": 6.9937357902526855, "learning_rate": 2.9188899767470324e-05, "loss": 1.2601, "step": 136600 }, { "epoch": 4.1760860267611655, "grad_norm": 19.437175750732422, "learning_rate": 2.9173601762330195e-05, "loss": 1.2579, "step": 136700 }, { "epoch": 4.179140954359382, "grad_norm": 2.6270205974578857, "learning_rate": 2.9158303757190063e-05, "loss": 1.1452, "step": 136800 }, { "epoch": 4.182195881957598, "grad_norm": 11.526654243469238, "learning_rate": 2.9143005752049934e-05, "loss": 1.4067, "step": 136900 }, { "epoch": 4.185250809555813, "grad_norm": 5.607306003570557, "learning_rate": 2.9127707746909805e-05, "loss": 1.3024, "step": 137000 }, { "epoch": 4.188305737154029, "grad_norm": 3.1491425037384033, "learning_rate": 2.9112409741769677e-05, "loss": 1.1115, "step": 137100 }, { "epoch": 4.191360664752246, "grad_norm": 7.519656658172607, "learning_rate": 2.9097111736629544e-05, "loss": 1.1044, "step": 137200 }, { "epoch": 4.194415592350461, "grad_norm": 15.954972267150879, "learning_rate": 2.9081813731489416e-05, "loss": 1.1498, "step": 137300 }, { "epoch": 4.197470519948677, "grad_norm": 1.4020683765411377, "learning_rate": 2.9066515726349287e-05, "loss": 1.1048, "step": 137400 }, { "epoch": 4.200525447546893, "grad_norm": 3.924858331680298, "learning_rate": 2.9051217721209158e-05, "loss": 1.0014, "step": 137500 }, { "epoch": 4.203580375145109, "grad_norm": 13.93403148651123, "learning_rate": 2.9035919716069026e-05, "loss": 1.1036, "step": 137600 }, { "epoch": 4.206635302743325, "grad_norm": 4.891828536987305, "learning_rate": 2.9020621710928897e-05, "loss": 1.0443, "step": 137700 }, { "epoch": 4.209690230341541, "grad_norm": 9.99404525756836, "learning_rate": 2.9005323705788768e-05, "loss": 1.0636, "step": 137800 }, { "epoch": 4.212745157939757, "grad_norm": 3.7061333656311035, "learning_rate": 2.899002570064864e-05, "loss": 1.105, "step": 137900 }, { "epoch": 4.215800085537973, "grad_norm": 9.68040657043457, 
"learning_rate": 2.8974727695508507e-05, "loss": 1.2866, "step": 138000 }, { "epoch": 4.218855013136189, "grad_norm": 1.8499339818954468, "learning_rate": 2.8959429690368378e-05, "loss": 0.9657, "step": 138100 }, { "epoch": 4.221909940734404, "grad_norm": 2.119901180267334, "learning_rate": 2.894413168522825e-05, "loss": 1.1226, "step": 138200 }, { "epoch": 4.224964868332621, "grad_norm": 7.395858287811279, "learning_rate": 2.8928986660139517e-05, "loss": 1.007, "step": 138300 }, { "epoch": 4.2280197959308365, "grad_norm": 2.304553270339966, "learning_rate": 2.8913688654999388e-05, "loss": 0.978, "step": 138400 }, { "epoch": 4.231074723529052, "grad_norm": 7.8232011795043945, "learning_rate": 2.889839064985926e-05, "loss": 1.0871, "step": 138500 }, { "epoch": 4.234129651127268, "grad_norm": 8.254796981811523, "learning_rate": 2.8883092644719127e-05, "loss": 1.0689, "step": 138600 }, { "epoch": 4.2371845787254845, "grad_norm": 6.827329635620117, "learning_rate": 2.8867794639579e-05, "loss": 1.2603, "step": 138700 }, { "epoch": 4.2402395063237, "grad_norm": 2.7104625701904297, "learning_rate": 2.885249663443887e-05, "loss": 1.0437, "step": 138800 }, { "epoch": 4.243294433921916, "grad_norm": 1.3501704931259155, "learning_rate": 2.883719862929874e-05, "loss": 0.941, "step": 138900 }, { "epoch": 4.2463493615201315, "grad_norm": 10.435741424560547, "learning_rate": 2.882190062415861e-05, "loss": 1.1538, "step": 139000 }, { "epoch": 4.249404289118348, "grad_norm": 6.998001575469971, "learning_rate": 2.880660261901848e-05, "loss": 1.2373, "step": 139100 }, { "epoch": 4.252459216716564, "grad_norm": 3.866121292114258, "learning_rate": 2.879130461387835e-05, "loss": 1.0814, "step": 139200 }, { "epoch": 4.2555141443147795, "grad_norm": 5.841191291809082, "learning_rate": 2.8776006608738222e-05, "loss": 1.1518, "step": 139300 }, { "epoch": 4.258569071912996, "grad_norm": 4.831516265869141, "learning_rate": 2.8760861583649497e-05, "loss": 1.3793, "step": 139400 }, { "epoch": 
4.261623999511212, "grad_norm": 3.0584304332733154, "learning_rate": 2.8745563578509364e-05, "loss": 1.2819, "step": 139500 }, { "epoch": 4.264678927109427, "grad_norm": 9.81663703918457, "learning_rate": 2.8730265573369236e-05, "loss": 1.2213, "step": 139600 }, { "epoch": 4.267733854707643, "grad_norm": 3.883660078048706, "learning_rate": 2.8714967568229107e-05, "loss": 1.163, "step": 139700 }, { "epoch": 4.27078878230586, "grad_norm": 3.366514205932617, "learning_rate": 2.8699669563088978e-05, "loss": 1.0588, "step": 139800 }, { "epoch": 4.273843709904075, "grad_norm": 22.876943588256836, "learning_rate": 2.8684371557948846e-05, "loss": 0.9759, "step": 139900 }, { "epoch": 4.276898637502291, "grad_norm": 9.278138160705566, "learning_rate": 2.8669073552808717e-05, "loss": 1.2093, "step": 140000 }, { "epoch": 4.279953565100507, "grad_norm": 12.669641494750977, "learning_rate": 2.8653775547668588e-05, "loss": 1.2022, "step": 140100 }, { "epoch": 4.283008492698723, "grad_norm": 4.376775741577148, "learning_rate": 2.863847754252846e-05, "loss": 1.2715, "step": 140200 }, { "epoch": 4.286063420296939, "grad_norm": 6.5941386222839355, "learning_rate": 2.8623179537388327e-05, "loss": 1.1646, "step": 140300 }, { "epoch": 4.289118347895155, "grad_norm": 10.246788024902344, "learning_rate": 2.8607881532248198e-05, "loss": 1.3237, "step": 140400 }, { "epoch": 4.29217327549337, "grad_norm": 11.5577974319458, "learning_rate": 2.859258352710807e-05, "loss": 1.1616, "step": 140500 }, { "epoch": 4.295228203091587, "grad_norm": 8.360092163085938, "learning_rate": 2.857728552196794e-05, "loss": 1.0561, "step": 140600 }, { "epoch": 4.298283130689803, "grad_norm": 5.822038650512695, "learning_rate": 2.8561987516827808e-05, "loss": 1.3474, "step": 140700 }, { "epoch": 4.301338058288018, "grad_norm": 5.193514823913574, "learning_rate": 2.854668951168768e-05, "loss": 1.1984, "step": 140800 }, { "epoch": 4.304392985886235, "grad_norm": 10.879566192626953, "learning_rate": 
2.853139150654755e-05, "loss": 1.2641, "step": 140900 }, { "epoch": 4.3074479134844506, "grad_norm": 0.4382971525192261, "learning_rate": 2.851609350140742e-05, "loss": 1.1111, "step": 141000 }, { "epoch": 4.310502841082666, "grad_norm": 6.622337341308594, "learning_rate": 2.850079549626729e-05, "loss": 1.0635, "step": 141100 }, { "epoch": 4.313557768680882, "grad_norm": 2.3514699935913086, "learning_rate": 2.848549749112716e-05, "loss": 1.0282, "step": 141200 }, { "epoch": 4.3166126962790985, "grad_norm": 2.973541021347046, "learning_rate": 2.8470199485987032e-05, "loss": 1.1509, "step": 141300 }, { "epoch": 4.319667623877314, "grad_norm": 8.881172180175781, "learning_rate": 2.8454901480846903e-05, "loss": 1.064, "step": 141400 }, { "epoch": 4.32272255147553, "grad_norm": 11.173826217651367, "learning_rate": 2.843960347570677e-05, "loss": 1.1981, "step": 141500 }, { "epoch": 4.3257774790737455, "grad_norm": 5.309777736663818, "learning_rate": 2.842430547056664e-05, "loss": 1.1205, "step": 141600 }, { "epoch": 4.328832406671962, "grad_norm": 10.873201370239258, "learning_rate": 2.8409007465426506e-05, "loss": 1.1992, "step": 141700 }, { "epoch": 4.331887334270178, "grad_norm": 5.490105152130127, "learning_rate": 2.8393709460286377e-05, "loss": 1.1684, "step": 141800 }, { "epoch": 4.3349422618683935, "grad_norm": 27.202516555786133, "learning_rate": 2.837841145514625e-05, "loss": 1.0327, "step": 141900 }, { "epoch": 4.337997189466609, "grad_norm": 6.939114093780518, "learning_rate": 2.836311345000612e-05, "loss": 1.0107, "step": 142000 }, { "epoch": 4.341052117064826, "grad_norm": 3.0641424655914307, "learning_rate": 2.8347815444865987e-05, "loss": 0.9114, "step": 142100 }, { "epoch": 4.344107044663041, "grad_norm": 6.63291072845459, "learning_rate": 2.833251743972586e-05, "loss": 1.1327, "step": 142200 }, { "epoch": 4.347161972261257, "grad_norm": 2.9434144496917725, "learning_rate": 2.831721943458573e-05, "loss": 1.1786, "step": 142300 }, { "epoch": 
4.350216899859474, "grad_norm": 5.704064846038818, "learning_rate": 2.83019214294456e-05, "loss": 1.2337, "step": 142400 }, { "epoch": 4.353271827457689, "grad_norm": 7.911927223205566, "learning_rate": 2.828662342430547e-05, "loss": 1.1111, "step": 142500 }, { "epoch": 4.356326755055905, "grad_norm": 1.7703015804290771, "learning_rate": 2.827132541916534e-05, "loss": 1.2244, "step": 142600 }, { "epoch": 4.359381682654121, "grad_norm": 6.676158428192139, "learning_rate": 2.825602741402521e-05, "loss": 1.1667, "step": 142700 }, { "epoch": 4.362436610252337, "grad_norm": 21.077234268188477, "learning_rate": 2.8240729408885082e-05, "loss": 1.0365, "step": 142800 }, { "epoch": 4.365491537850553, "grad_norm": 10.691551208496094, "learning_rate": 2.822543140374495e-05, "loss": 1.1383, "step": 142900 }, { "epoch": 4.368546465448769, "grad_norm": 1.3159449100494385, "learning_rate": 2.821013339860482e-05, "loss": 1.1457, "step": 143000 }, { "epoch": 4.371601393046985, "grad_norm": 12.039605140686035, "learning_rate": 2.8194835393464692e-05, "loss": 1.2284, "step": 143100 }, { "epoch": 4.374656320645201, "grad_norm": 4.086942672729492, "learning_rate": 2.8179537388324563e-05, "loss": 1.1436, "step": 143200 }, { "epoch": 4.377711248243417, "grad_norm": 8.750964164733887, "learning_rate": 2.816423938318443e-05, "loss": 1.0047, "step": 143300 }, { "epoch": 4.380766175841632, "grad_norm": 14.825384140014648, "learning_rate": 2.8149094358095706e-05, "loss": 1.061, "step": 143400 }, { "epoch": 4.383821103439849, "grad_norm": 5.547531604766846, "learning_rate": 2.8133796352955577e-05, "loss": 1.1211, "step": 143500 }, { "epoch": 4.386876031038065, "grad_norm": 4.428119659423828, "learning_rate": 2.8118498347815448e-05, "loss": 1.2147, "step": 143600 }, { "epoch": 4.38993095863628, "grad_norm": 4.358971118927002, "learning_rate": 2.8103353322726716e-05, "loss": 1.0761, "step": 143700 }, { "epoch": 4.392985886234496, "grad_norm": 3.263248920440674, "learning_rate": 
2.8088055317586587e-05, "loss": 1.0879, "step": 143800 }, { "epoch": 4.3960408138327125, "grad_norm": 6.146893501281738, "learning_rate": 2.807275731244646e-05, "loss": 1.2855, "step": 143900 }, { "epoch": 4.399095741430928, "grad_norm": 6.958042621612549, "learning_rate": 2.8057459307306326e-05, "loss": 1.2137, "step": 144000 }, { "epoch": 4.402150669029144, "grad_norm": 3.671699047088623, "learning_rate": 2.8042161302166197e-05, "loss": 1.0662, "step": 144100 }, { "epoch": 4.4052055966273596, "grad_norm": 6.236356258392334, "learning_rate": 2.802686329702607e-05, "loss": 1.4093, "step": 144200 }, { "epoch": 4.408260524225576, "grad_norm": 7.394711971282959, "learning_rate": 2.801156529188594e-05, "loss": 0.987, "step": 144300 }, { "epoch": 4.411315451823792, "grad_norm": 9.206113815307617, "learning_rate": 2.7996267286745807e-05, "loss": 0.9402, "step": 144400 }, { "epoch": 4.4143703794220075, "grad_norm": 6.848948955535889, "learning_rate": 2.798096928160568e-05, "loss": 1.343, "step": 144500 }, { "epoch": 4.417425307020224, "grad_norm": 6.24761962890625, "learning_rate": 2.796567127646555e-05, "loss": 1.1268, "step": 144600 }, { "epoch": 4.42048023461844, "grad_norm": 4.589752674102783, "learning_rate": 2.795037327132542e-05, "loss": 1.042, "step": 144700 }, { "epoch": 4.423535162216655, "grad_norm": 17.89345359802246, "learning_rate": 2.793507526618529e-05, "loss": 1.1777, "step": 144800 }, { "epoch": 4.426590089814871, "grad_norm": 5.196957111358643, "learning_rate": 2.791977726104516e-05, "loss": 1.2299, "step": 144900 }, { "epoch": 4.429645017413088, "grad_norm": 6.011660099029541, "learning_rate": 2.790447925590503e-05, "loss": 1.0815, "step": 145000 }, { "epoch": 4.432699945011303, "grad_norm": 8.668110847473145, "learning_rate": 2.7889181250764902e-05, "loss": 1.128, "step": 145100 }, { "epoch": 4.435754872609519, "grad_norm": 19.578367233276367, "learning_rate": 2.787388324562477e-05, "loss": 1.4063, "step": 145200 }, { "epoch": 4.438809800207735, 
"grad_norm": 7.321340084075928, "learning_rate": 2.785858524048464e-05, "loss": 1.1274, "step": 145300 }, { "epoch": 4.441864727805951, "grad_norm": 7.219663143157959, "learning_rate": 2.7843287235344512e-05, "loss": 1.1132, "step": 145400 }, { "epoch": 4.444919655404167, "grad_norm": 3.2994327545166016, "learning_rate": 2.7827989230204383e-05, "loss": 1.2619, "step": 145500 }, { "epoch": 4.447974583002383, "grad_norm": 9.860881805419922, "learning_rate": 2.781269122506425e-05, "loss": 1.2081, "step": 145600 }, { "epoch": 4.451029510600598, "grad_norm": 16.964540481567383, "learning_rate": 2.7797393219924122e-05, "loss": 1.201, "step": 145700 }, { "epoch": 4.454084438198815, "grad_norm": 13.067527770996094, "learning_rate": 2.7782095214783994e-05, "loss": 1.0872, "step": 145800 }, { "epoch": 4.457139365797031, "grad_norm": 2.4433538913726807, "learning_rate": 2.7766797209643865e-05, "loss": 1.4488, "step": 145900 }, { "epoch": 4.460194293395246, "grad_norm": 3.9496967792510986, "learning_rate": 2.7751499204503732e-05, "loss": 1.3713, "step": 146000 }, { "epoch": 4.463249220993463, "grad_norm": 2.2983593940734863, "learning_rate": 2.7736201199363604e-05, "loss": 1.2593, "step": 146100 }, { "epoch": 4.466304148591679, "grad_norm": 6.58331298828125, "learning_rate": 2.7720903194223475e-05, "loss": 1.032, "step": 146200 }, { "epoch": 4.469359076189894, "grad_norm": 9.05478286743164, "learning_rate": 2.7705605189083346e-05, "loss": 1.091, "step": 146300 }, { "epoch": 4.47241400378811, "grad_norm": 4.241959095001221, "learning_rate": 2.7690307183943214e-05, "loss": 1.0762, "step": 146400 }, { "epoch": 4.4754689313863265, "grad_norm": 19.84777069091797, "learning_rate": 2.7675009178803085e-05, "loss": 1.0127, "step": 146500 }, { "epoch": 4.478523858984542, "grad_norm": 3.5767288208007812, "learning_rate": 2.7659711173662956e-05, "loss": 1.1679, "step": 146600 }, { "epoch": 4.481578786582758, "grad_norm": 5.407430171966553, "learning_rate": 2.7644413168522827e-05, "loss": 
1.1976, "step": 146700 }, { "epoch": 4.484633714180974, "grad_norm": 5.339792251586914, "learning_rate": 2.7629115163382695e-05, "loss": 1.1483, "step": 146800 }, { "epoch": 4.48768864177919, "grad_norm": 12.349178314208984, "learning_rate": 2.7613817158242566e-05, "loss": 1.2548, "step": 146900 }, { "epoch": 4.490743569377406, "grad_norm": 5.292774200439453, "learning_rate": 2.7598519153102437e-05, "loss": 1.0486, "step": 147000 }, { "epoch": 4.4937984969756215, "grad_norm": 6.7021894454956055, "learning_rate": 2.758322114796231e-05, "loss": 1.2699, "step": 147100 }, { "epoch": 4.496853424573837, "grad_norm": 5.217250823974609, "learning_rate": 2.7567923142822176e-05, "loss": 1.1054, "step": 147200 }, { "epoch": 4.499908352172054, "grad_norm": 5.183603286743164, "learning_rate": 2.7552625137682047e-05, "loss": 1.0157, "step": 147300 }, { "epoch": 4.5029632797702694, "grad_norm": 2.11531400680542, "learning_rate": 2.753732713254192e-05, "loss": 1.1512, "step": 147400 }, { "epoch": 4.506018207368485, "grad_norm": 2.366384983062744, "learning_rate": 2.752202912740179e-05, "loss": 1.0361, "step": 147500 }, { "epoch": 4.509073134966702, "grad_norm": 4.993153095245361, "learning_rate": 2.7506731122261657e-05, "loss": 1.1872, "step": 147600 }, { "epoch": 4.512128062564917, "grad_norm": 3.938058376312256, "learning_rate": 2.749158609717293e-05, "loss": 1.0318, "step": 147700 }, { "epoch": 4.515182990163133, "grad_norm": 5.502648830413818, "learning_rate": 2.7476288092032797e-05, "loss": 1.2964, "step": 147800 }, { "epoch": 4.518237917761349, "grad_norm": 8.001718521118164, "learning_rate": 2.746114306694407e-05, "loss": 0.9859, "step": 147900 }, { "epoch": 4.521292845359565, "grad_norm": 3.583967924118042, "learning_rate": 2.7445845061803942e-05, "loss": 1.1856, "step": 148000 }, { "epoch": 4.524347772957781, "grad_norm": 5.276466369628906, "learning_rate": 2.7430547056663814e-05, "loss": 1.2215, "step": 148100 }, { "epoch": 4.527402700555997, "grad_norm": 
4.373233318328857, "learning_rate": 2.7415249051523685e-05, "loss": 1.1688, "step": 148200 }, { "epoch": 4.530457628154212, "grad_norm": 5.781352519989014, "learning_rate": 2.7399951046383553e-05, "loss": 1.1931, "step": 148300 }, { "epoch": 4.533512555752429, "grad_norm": 2.002715826034546, "learning_rate": 2.7384653041243424e-05, "loss": 1.3794, "step": 148400 }, { "epoch": 4.536567483350645, "grad_norm": 11.414073944091797, "learning_rate": 2.7369355036103295e-05, "loss": 1.4432, "step": 148500 }, { "epoch": 4.53962241094886, "grad_norm": 18.380279541015625, "learning_rate": 2.7354057030963166e-05, "loss": 1.1663, "step": 148600 }, { "epoch": 4.542677338547076, "grad_norm": 5.450042724609375, "learning_rate": 2.7338759025823034e-05, "loss": 1.0466, "step": 148700 }, { "epoch": 4.545732266145293, "grad_norm": 9.189093589782715, "learning_rate": 2.7323461020682905e-05, "loss": 1.0986, "step": 148800 }, { "epoch": 4.548787193743508, "grad_norm": 20.606910705566406, "learning_rate": 2.7308163015542776e-05, "loss": 1.1981, "step": 148900 }, { "epoch": 4.551842121341724, "grad_norm": 12.651390075683594, "learning_rate": 2.7292865010402647e-05, "loss": 1.2555, "step": 149000 }, { "epoch": 4.5548970489399405, "grad_norm": 4.8047895431518555, "learning_rate": 2.7277567005262515e-05, "loss": 0.9693, "step": 149100 }, { "epoch": 4.557951976538156, "grad_norm": 3.2293930053710938, "learning_rate": 2.7262269000122386e-05, "loss": 1.0667, "step": 149200 }, { "epoch": 4.561006904136372, "grad_norm": 7.451018810272217, "learning_rate": 2.7246970994982257e-05, "loss": 1.2841, "step": 149300 }, { "epoch": 4.564061831734588, "grad_norm": 3.7198963165283203, "learning_rate": 2.723167298984213e-05, "loss": 1.3827, "step": 149400 }, { "epoch": 4.567116759332804, "grad_norm": 9.99906063079834, "learning_rate": 2.7216374984701996e-05, "loss": 1.1345, "step": 149500 }, { "epoch": 4.57017168693102, "grad_norm": 5.092084884643555, "learning_rate": 2.7201076979561867e-05, "loss": 0.9375, 
"step": 149600 }, { "epoch": 4.5732266145292355, "grad_norm": 3.065910816192627, "learning_rate": 2.718577897442174e-05, "loss": 1.0997, "step": 149700 }, { "epoch": 4.576281542127451, "grad_norm": 4.881350517272949, "learning_rate": 2.717048096928161e-05, "loss": 1.0738, "step": 149800 }, { "epoch": 4.579336469725668, "grad_norm": 7.635152339935303, "learning_rate": 2.7155182964141477e-05, "loss": 1.2249, "step": 149900 }, { "epoch": 4.5823913973238835, "grad_norm": 4.8124098777771, "learning_rate": 2.713988495900135e-05, "loss": 1.0719, "step": 150000 }, { "epoch": 4.585446324922099, "grad_norm": 3.0546202659606934, "learning_rate": 2.712458695386122e-05, "loss": 1.2786, "step": 150100 }, { "epoch": 4.588501252520315, "grad_norm": 2.5182888507843018, "learning_rate": 2.710928894872109e-05, "loss": 0.9895, "step": 150200 }, { "epoch": 4.591556180118531, "grad_norm": 14.798208236694336, "learning_rate": 2.709399094358096e-05, "loss": 1.2199, "step": 150300 }, { "epoch": 4.594611107716747, "grad_norm": 7.6388983726501465, "learning_rate": 2.707869293844083e-05, "loss": 1.0752, "step": 150400 }, { "epoch": 4.597666035314963, "grad_norm": 11.475516319274902, "learning_rate": 2.70633949333007e-05, "loss": 1.2698, "step": 150500 }, { "epoch": 4.600720962913179, "grad_norm": 19.168842315673828, "learning_rate": 2.7048096928160572e-05, "loss": 1.3125, "step": 150600 }, { "epoch": 4.603775890511395, "grad_norm": 3.9104790687561035, "learning_rate": 2.703279892302044e-05, "loss": 1.0883, "step": 150700 }, { "epoch": 4.606830818109611, "grad_norm": 3.8482766151428223, "learning_rate": 2.701750091788031e-05, "loss": 1.1854, "step": 150800 }, { "epoch": 4.609885745707826, "grad_norm": 12.52638053894043, "learning_rate": 2.7002202912740182e-05, "loss": 1.0693, "step": 150900 }, { "epoch": 4.612940673306043, "grad_norm": 4.986527919769287, "learning_rate": 2.6986904907600053e-05, "loss": 1.1956, "step": 151000 }, { "epoch": 4.615995600904259, "grad_norm": 4.222838878631592, 
"learning_rate": 2.697160690245992e-05, "loss": 1.1175, "step": 151100 }, { "epoch": 4.619050528502474, "grad_norm": 5.308711051940918, "learning_rate": 2.6956308897319792e-05, "loss": 1.0547, "step": 151200 }, { "epoch": 4.622105456100691, "grad_norm": 4.313049793243408, "learning_rate": 2.6941010892179663e-05, "loss": 1.1143, "step": 151300 }, { "epoch": 4.625160383698907, "grad_norm": 6.646529197692871, "learning_rate": 2.6925712887039535e-05, "loss": 1.2822, "step": 151400 }, { "epoch": 4.628215311297122, "grad_norm": 17.885494232177734, "learning_rate": 2.6910414881899402e-05, "loss": 1.2093, "step": 151500 }, { "epoch": 4.631270238895338, "grad_norm": 4.94324254989624, "learning_rate": 2.6895269856810674e-05, "loss": 1.4626, "step": 151600 }, { "epoch": 4.634325166493554, "grad_norm": 4.033757209777832, "learning_rate": 2.6879971851670545e-05, "loss": 1.1175, "step": 151700 }, { "epoch": 4.63738009409177, "grad_norm": 6.608712673187256, "learning_rate": 2.6864673846530413e-05, "loss": 1.1387, "step": 151800 }, { "epoch": 4.640435021689986, "grad_norm": 9.266666412353516, "learning_rate": 2.6849375841390284e-05, "loss": 1.219, "step": 151900 }, { "epoch": 4.643489949288202, "grad_norm": 3.2637996673583984, "learning_rate": 2.6834077836250155e-05, "loss": 1.0627, "step": 152000 }, { "epoch": 4.646544876886418, "grad_norm": 5.220635414123535, "learning_rate": 2.6818779831110026e-05, "loss": 1.0148, "step": 152100 }, { "epoch": 4.649599804484634, "grad_norm": 4.843519687652588, "learning_rate": 2.6803481825969894e-05, "loss": 1.1965, "step": 152200 }, { "epoch": 4.6526547320828495, "grad_norm": 5.011851787567139, "learning_rate": 2.6788183820829765e-05, "loss": 1.1357, "step": 152300 }, { "epoch": 4.655709659681065, "grad_norm": 5.829235076904297, "learning_rate": 2.6772885815689636e-05, "loss": 1.0582, "step": 152400 }, { "epoch": 4.658764587279282, "grad_norm": 8.131631851196289, "learning_rate": 2.6757587810549507e-05, "loss": 1.2599, "step": 152500 }, { 
"epoch": 4.6618195148774975, "grad_norm": 3.51049542427063, "learning_rate": 2.6742289805409375e-05, "loss": 1.1366, "step": 152600 }, { "epoch": 4.664874442475713, "grad_norm": 2.632659912109375, "learning_rate": 2.6726991800269246e-05, "loss": 1.1617, "step": 152700 }, { "epoch": 4.66792937007393, "grad_norm": 2.5496699810028076, "learning_rate": 2.6711693795129117e-05, "loss": 0.9997, "step": 152800 }, { "epoch": 4.670984297672145, "grad_norm": 5.918241500854492, "learning_rate": 2.669639578998899e-05, "loss": 0.9468, "step": 152900 }, { "epoch": 4.674039225270361, "grad_norm": 2.7317497730255127, "learning_rate": 2.6681097784848856e-05, "loss": 1.1179, "step": 153000 }, { "epoch": 4.677094152868577, "grad_norm": 25.66161346435547, "learning_rate": 2.6665799779708728e-05, "loss": 1.2545, "step": 153100 }, { "epoch": 4.6801490804667925, "grad_norm": 14.719120025634766, "learning_rate": 2.66505017745686e-05, "loss": 1.1973, "step": 153200 }, { "epoch": 4.683204008065009, "grad_norm": 9.265717506408691, "learning_rate": 2.663520376942847e-05, "loss": 1.0308, "step": 153300 }, { "epoch": 4.686258935663225, "grad_norm": 0.9879122376441956, "learning_rate": 2.6619905764288338e-05, "loss": 1.3015, "step": 153400 }, { "epoch": 4.68931386326144, "grad_norm": 8.03515625, "learning_rate": 2.660460775914821e-05, "loss": 1.0214, "step": 153500 }, { "epoch": 4.692368790859657, "grad_norm": 21.492252349853516, "learning_rate": 2.658930975400808e-05, "loss": 1.1221, "step": 153600 }, { "epoch": 4.695423718457873, "grad_norm": 2.753187894821167, "learning_rate": 2.657401174886795e-05, "loss": 1.1131, "step": 153700 }, { "epoch": 4.698478646056088, "grad_norm": 7.463822364807129, "learning_rate": 2.655871374372782e-05, "loss": 0.9914, "step": 153800 }, { "epoch": 4.701533573654304, "grad_norm": 3.7811901569366455, "learning_rate": 2.654341573858769e-05, "loss": 1.1621, "step": 153900 }, { "epoch": 4.704588501252521, "grad_norm": 5.818233013153076, "learning_rate": 
2.652811773344756e-05, "loss": 1.0053, "step": 154000 }, { "epoch": 4.707643428850736, "grad_norm": 11.87837028503418, "learning_rate": 2.651297270835883e-05, "loss": 1.2525, "step": 154100 }, { "epoch": 4.710698356448952, "grad_norm": 6.573636531829834, "learning_rate": 2.64976747032187e-05, "loss": 0.9842, "step": 154200 }, { "epoch": 4.7137532840471685, "grad_norm": 4.809690952301025, "learning_rate": 2.648237669807857e-05, "loss": 1.0407, "step": 154300 }, { "epoch": 4.716808211645384, "grad_norm": 1.3159583806991577, "learning_rate": 2.646707869293844e-05, "loss": 0.9382, "step": 154400 }, { "epoch": 4.7198631392436, "grad_norm": 4.047608375549316, "learning_rate": 2.645178068779831e-05, "loss": 1.172, "step": 154500 }, { "epoch": 4.722918066841816, "grad_norm": 7.231751441955566, "learning_rate": 2.643648268265818e-05, "loss": 1.1552, "step": 154600 }, { "epoch": 4.725972994440031, "grad_norm": 0.9529539942741394, "learning_rate": 2.6421184677518053e-05, "loss": 1.1272, "step": 154700 }, { "epoch": 4.729027922038248, "grad_norm": 2.781122922897339, "learning_rate": 2.640588667237792e-05, "loss": 1.0022, "step": 154800 }, { "epoch": 4.7320828496364635, "grad_norm": 2.895209312438965, "learning_rate": 2.639058866723779e-05, "loss": 1.3872, "step": 154900 }, { "epoch": 4.735137777234679, "grad_norm": 12.057914733886719, "learning_rate": 2.6375290662097663e-05, "loss": 1.2874, "step": 155000 }, { "epoch": 4.738192704832896, "grad_norm": 4.3409881591796875, "learning_rate": 2.6359992656957534e-05, "loss": 1.4921, "step": 155100 }, { "epoch": 4.7412476324311115, "grad_norm": 4.266219139099121, "learning_rate": 2.6344694651817402e-05, "loss": 1.0867, "step": 155200 }, { "epoch": 4.744302560029327, "grad_norm": 4.5013427734375, "learning_rate": 2.6329396646677273e-05, "loss": 1.1747, "step": 155300 }, { "epoch": 4.747357487627543, "grad_norm": 4.732046604156494, "learning_rate": 2.6314098641537144e-05, "loss": 1.1971, "step": 155400 }, { "epoch": 4.750412415225759, 
"grad_norm": 3.806830883026123, "learning_rate": 2.6298800636397015e-05, "loss": 1.2849, "step": 155500 }, { "epoch": 4.753467342823975, "grad_norm": 14.340802192687988, "learning_rate": 2.6283502631256883e-05, "loss": 1.0664, "step": 155600 }, { "epoch": 4.756522270422191, "grad_norm": 9.971649169921875, "learning_rate": 2.6268204626116754e-05, "loss": 1.0028, "step": 155700 }, { "epoch": 4.759577198020407, "grad_norm": 9.331883430480957, "learning_rate": 2.6252906620976625e-05, "loss": 1.2983, "step": 155800 }, { "epoch": 4.762632125618623, "grad_norm": 7.091375350952148, "learning_rate": 2.6237608615836496e-05, "loss": 1.1806, "step": 155900 }, { "epoch": 4.765687053216839, "grad_norm": 8.6240234375, "learning_rate": 2.6222310610696364e-05, "loss": 1.1379, "step": 156000 }, { "epoch": 4.768741980815054, "grad_norm": 0.5839551091194153, "learning_rate": 2.6207012605556235e-05, "loss": 1.2997, "step": 156100 }, { "epoch": 4.771796908413271, "grad_norm": 1.799262285232544, "learning_rate": 2.6191714600416107e-05, "loss": 1.3177, "step": 156200 }, { "epoch": 4.774851836011487, "grad_norm": 7.29044771194458, "learning_rate": 2.6176416595275978e-05, "loss": 1.3113, "step": 156300 }, { "epoch": 4.777906763609702, "grad_norm": 7.393566131591797, "learning_rate": 2.6161118590135845e-05, "loss": 1.0873, "step": 156400 }, { "epoch": 4.780961691207918, "grad_norm": 5.538482189178467, "learning_rate": 2.6145820584995717e-05, "loss": 1.0264, "step": 156500 }, { "epoch": 4.784016618806135, "grad_norm": 4.523062705993652, "learning_rate": 2.6130522579855588e-05, "loss": 1.0554, "step": 156600 }, { "epoch": 4.78707154640435, "grad_norm": 6.2098894119262695, "learning_rate": 2.611522457471546e-05, "loss": 1.2072, "step": 156700 }, { "epoch": 4.790126474002566, "grad_norm": 5.000922203063965, "learning_rate": 2.6099926569575327e-05, "loss": 1.0549, "step": 156800 }, { "epoch": 4.793181401600782, "grad_norm": 5.6592936515808105, "learning_rate": 2.6084628564435198e-05, "loss": 
1.0814, "step": 156900 }, { "epoch": 4.796236329198998, "grad_norm": 11.27538776397705, "learning_rate": 2.606933055929507e-05, "loss": 1.1692, "step": 157000 }, { "epoch": 4.799291256797214, "grad_norm": 9.253680229187012, "learning_rate": 2.605403255415494e-05, "loss": 1.147, "step": 157100 }, { "epoch": 4.80234618439543, "grad_norm": 1.797663927078247, "learning_rate": 2.6038734549014808e-05, "loss": 1.1446, "step": 157200 }, { "epoch": 4.805401111993646, "grad_norm": 4.268359184265137, "learning_rate": 2.602343654387468e-05, "loss": 1.2718, "step": 157300 }, { "epoch": 4.808456039591862, "grad_norm": 12.612486839294434, "learning_rate": 2.600813853873455e-05, "loss": 1.294, "step": 157400 }, { "epoch": 4.8115109671900775, "grad_norm": 3.7770674228668213, "learning_rate": 2.599284053359442e-05, "loss": 0.9739, "step": 157500 }, { "epoch": 4.814565894788293, "grad_norm": 6.431491851806641, "learning_rate": 2.597754252845429e-05, "loss": 0.9815, "step": 157600 }, { "epoch": 4.81762082238651, "grad_norm": 6.823291301727295, "learning_rate": 2.596239750336556e-05, "loss": 1.265, "step": 157700 }, { "epoch": 4.8206757499847255, "grad_norm": 4.733063697814941, "learning_rate": 2.5947099498225432e-05, "loss": 1.0081, "step": 157800 }, { "epoch": 4.823730677582941, "grad_norm": 8.048057556152344, "learning_rate": 2.59318014930853e-05, "loss": 0.9448, "step": 157900 }, { "epoch": 4.826785605181157, "grad_norm": 5.712404251098633, "learning_rate": 2.591650348794517e-05, "loss": 1.1391, "step": 158000 }, { "epoch": 4.829840532779373, "grad_norm": 20.968780517578125, "learning_rate": 2.5901205482805042e-05, "loss": 1.1777, "step": 158100 }, { "epoch": 4.832895460377589, "grad_norm": 27.42873191833496, "learning_rate": 2.5885907477664913e-05, "loss": 1.2007, "step": 158200 }, { "epoch": 4.835950387975805, "grad_norm": 13.017873764038086, "learning_rate": 2.587060947252478e-05, "loss": 1.0182, "step": 158300 }, { "epoch": 4.8390053155740205, "grad_norm": 5.933872222900391, 
"learning_rate": 2.5855311467384652e-05, "loss": 0.9818, "step": 158400 }, { "epoch": 4.842060243172237, "grad_norm": 6.615920543670654, "learning_rate": 2.5840013462244523e-05, "loss": 1.0567, "step": 158500 }, { "epoch": 4.845115170770453, "grad_norm": 5.392825126647949, "learning_rate": 2.5824715457104394e-05, "loss": 1.3925, "step": 158600 }, { "epoch": 4.848170098368668, "grad_norm": 4.545580863952637, "learning_rate": 2.5809417451964262e-05, "loss": 1.0607, "step": 158700 }, { "epoch": 4.851225025966885, "grad_norm": 3.597367286682129, "learning_rate": 2.5794119446824133e-05, "loss": 1.1868, "step": 158800 }, { "epoch": 4.854279953565101, "grad_norm": 11.348734855651855, "learning_rate": 2.5778821441684004e-05, "loss": 1.0758, "step": 158900 }, { "epoch": 4.857334881163316, "grad_norm": 4.201880931854248, "learning_rate": 2.5763523436543875e-05, "loss": 1.2604, "step": 159000 }, { "epoch": 4.860389808761532, "grad_norm": 2.7427544593811035, "learning_rate": 2.5748225431403743e-05, "loss": 1.1255, "step": 159100 }, { "epoch": 4.863444736359749, "grad_norm": 7.285526275634766, "learning_rate": 2.5732927426263614e-05, "loss": 0.9783, "step": 159200 }, { "epoch": 4.866499663957964, "grad_norm": 3.2996041774749756, "learning_rate": 2.5717629421123485e-05, "loss": 1.1064, "step": 159300 }, { "epoch": 4.86955459155618, "grad_norm": 6.808858394622803, "learning_rate": 2.5702331415983357e-05, "loss": 1.2622, "step": 159400 }, { "epoch": 4.872609519154396, "grad_norm": 10.930204391479492, "learning_rate": 2.5687033410843224e-05, "loss": 1.091, "step": 159500 }, { "epoch": 4.875664446752612, "grad_norm": 3.2368526458740234, "learning_rate": 2.5671735405703096e-05, "loss": 1.0833, "step": 159600 }, { "epoch": 4.878719374350828, "grad_norm": 13.43560791015625, "learning_rate": 2.5656437400562967e-05, "loss": 1.1661, "step": 159700 }, { "epoch": 4.881774301949044, "grad_norm": 3.7159016132354736, "learning_rate": 2.5641139395422838e-05, "loss": 1.2862, "step": 159800 }, { 
"epoch": 4.884829229547259, "grad_norm": 3.5872607231140137, "learning_rate": 2.5625841390282706e-05, "loss": 1.105, "step": 159900 }, { "epoch": 4.887884157145476, "grad_norm": 5.785370826721191, "learning_rate": 2.5610543385142577e-05, "loss": 1.0201, "step": 160000 }, { "epoch": 4.8909390847436915, "grad_norm": 3.647353172302246, "learning_rate": 2.5595245380002448e-05, "loss": 1.1966, "step": 160100 }, { "epoch": 4.893994012341907, "grad_norm": 4.53983211517334, "learning_rate": 2.557994737486232e-05, "loss": 1.1269, "step": 160200 }, { "epoch": 4.897048939940124, "grad_norm": 0.30907294154167175, "learning_rate": 2.5564649369722187e-05, "loss": 1.1997, "step": 160300 }, { "epoch": 4.9001038675383395, "grad_norm": 3.865354299545288, "learning_rate": 2.5549351364582058e-05, "loss": 1.1692, "step": 160400 }, { "epoch": 4.903158795136555, "grad_norm": 11.909504890441895, "learning_rate": 2.553405335944193e-05, "loss": 1.1911, "step": 160500 }, { "epoch": 4.906213722734771, "grad_norm": 4.162954330444336, "learning_rate": 2.55187553543018e-05, "loss": 1.2396, "step": 160600 }, { "epoch": 4.909268650332987, "grad_norm": 5.563848972320557, "learning_rate": 2.5503457349161668e-05, "loss": 1.221, "step": 160700 }, { "epoch": 4.912323577931203, "grad_norm": 3.500187397003174, "learning_rate": 2.548815934402154e-05, "loss": 1.1687, "step": 160800 }, { "epoch": 4.915378505529419, "grad_norm": 11.680179595947266, "learning_rate": 2.547286133888141e-05, "loss": 1.4372, "step": 160900 }, { "epoch": 4.918433433127635, "grad_norm": 2.8160529136657715, "learning_rate": 2.545756333374128e-05, "loss": 1.513, "step": 161000 }, { "epoch": 4.921488360725851, "grad_norm": 9.372578620910645, "learning_rate": 2.544226532860115e-05, "loss": 1.1128, "step": 161100 }, { "epoch": 4.924543288324067, "grad_norm": 4.864928722381592, "learning_rate": 2.542696732346102e-05, "loss": 1.397, "step": 161200 }, { "epoch": 4.927598215922282, "grad_norm": 4.439268589019775, "learning_rate": 
2.541166931832089e-05, "loss": 1.0441, "step": 161300 }, { "epoch": 4.930653143520498, "grad_norm": 29.387102127075195, "learning_rate": 2.5396524293232166e-05, "loss": 1.4971, "step": 161400 }, { "epoch": 4.933708071118715, "grad_norm": 5.6792449951171875, "learning_rate": 2.5381226288092038e-05, "loss": 1.0976, "step": 161500 }, { "epoch": 4.93676299871693, "grad_norm": 4.388849258422852, "learning_rate": 2.5365928282951905e-05, "loss": 1.2655, "step": 161600 }, { "epoch": 4.939817926315146, "grad_norm": 4.719710350036621, "learning_rate": 2.5350630277811777e-05, "loss": 1.4667, "step": 161700 }, { "epoch": 4.942872853913363, "grad_norm": 4.246535301208496, "learning_rate": 2.5335332272671648e-05, "loss": 1.1536, "step": 161800 }, { "epoch": 4.945927781511578, "grad_norm": 8.558767318725586, "learning_rate": 2.532003426753152e-05, "loss": 1.1652, "step": 161900 }, { "epoch": 4.948982709109794, "grad_norm": 5.678195476531982, "learning_rate": 2.5304736262391387e-05, "loss": 1.2042, "step": 162000 }, { "epoch": 4.95203763670801, "grad_norm": 9.030732154846191, "learning_rate": 2.5289438257251258e-05, "loss": 1.1497, "step": 162100 }, { "epoch": 4.955092564306226, "grad_norm": 2.4529170989990234, "learning_rate": 2.527414025211113e-05, "loss": 1.3533, "step": 162200 }, { "epoch": 4.958147491904442, "grad_norm": 6.64496374130249, "learning_rate": 2.5258842246971e-05, "loss": 1.2636, "step": 162300 }, { "epoch": 4.961202419502658, "grad_norm": 10.798649787902832, "learning_rate": 2.5243544241830868e-05, "loss": 1.2472, "step": 162400 }, { "epoch": 4.964257347100874, "grad_norm": 4.542051315307617, "learning_rate": 2.522824623669074e-05, "loss": 1.2845, "step": 162500 }, { "epoch": 4.96731227469909, "grad_norm": 9.684645652770996, "learning_rate": 2.521294823155061e-05, "loss": 1.1378, "step": 162600 }, { "epoch": 4.9703672022973056, "grad_norm": 2.868638753890991, "learning_rate": 2.519765022641048e-05, "loss": 1.0938, "step": 162700 }, { "epoch": 4.973422129895521, 
"grad_norm": 12.990683555603027, "learning_rate": 2.518235222127035e-05, "loss": 1.1421, "step": 162800 }, { "epoch": 4.976477057493737, "grad_norm": 5.3151140213012695, "learning_rate": 2.516705421613022e-05, "loss": 1.3563, "step": 162900 }, { "epoch": 4.9795319850919535, "grad_norm": 5.537310600280762, "learning_rate": 2.515175621099009e-05, "loss": 0.9885, "step": 163000 }, { "epoch": 4.982586912690169, "grad_norm": 6.696578025817871, "learning_rate": 2.5136458205849963e-05, "loss": 1.2855, "step": 163100 }, { "epoch": 4.985641840288385, "grad_norm": 6.6071648597717285, "learning_rate": 2.512116020070983e-05, "loss": 1.5194, "step": 163200 }, { "epoch": 4.988696767886601, "grad_norm": 3.9273293018341064, "learning_rate": 2.51058621955697e-05, "loss": 1.1475, "step": 163300 }, { "epoch": 4.991751695484817, "grad_norm": 6.573533535003662, "learning_rate": 2.5090564190429573e-05, "loss": 1.0817, "step": 163400 }, { "epoch": 4.994806623083033, "grad_norm": 6.224365234375, "learning_rate": 2.5075266185289444e-05, "loss": 1.2204, "step": 163500 }, { "epoch": 4.9978615506812485, "grad_norm": 8.9005765914917, "learning_rate": 2.505996818014931e-05, "loss": 1.2897, "step": 163600 }, { "epoch": 5.0, "eval_accuracy": 0.7251787132644957, "eval_loss": 0.7394196391105652, "eval_runtime": 1778.1354, "eval_samples_per_second": 18.409, "eval_steps_per_second": 4.603, "step": 163670 }, { "epoch": 5.000916478279465, "grad_norm": 9.13719654083252, "learning_rate": 2.5044670175009183e-05, "loss": 1.0322, "step": 163700 }, { "epoch": 5.003971405877681, "grad_norm": 3.0623581409454346, "learning_rate": 2.5029372169869047e-05, "loss": 1.1183, "step": 163800 }, { "epoch": 5.007026333475896, "grad_norm": 9.058491706848145, "learning_rate": 2.5014074164728918e-05, "loss": 1.2189, "step": 163900 }, { "epoch": 5.010081261074112, "grad_norm": 17.310258865356445, "learning_rate": 2.499877615958879e-05, "loss": 1.1327, "step": 164000 }, { "epoch": 5.013136188672329, "grad_norm": 
2.7564404010772705, "learning_rate": 2.498347815444866e-05, "loss": 1.104, "step": 164100 }, { "epoch": 5.016191116270544, "grad_norm": 13.609418869018555, "learning_rate": 2.4968180149308532e-05, "loss": 1.3047, "step": 164200 }, { "epoch": 5.01924604386876, "grad_norm": 5.303647041320801, "learning_rate": 2.4952882144168403e-05, "loss": 1.0716, "step": 164300 }, { "epoch": 5.022300971466977, "grad_norm": 3.680124282836914, "learning_rate": 2.493758413902827e-05, "loss": 1.4862, "step": 164400 }, { "epoch": 5.025355899065192, "grad_norm": 5.9867377281188965, "learning_rate": 2.4922286133888142e-05, "loss": 1.2148, "step": 164500 }, { "epoch": 5.028410826663408, "grad_norm": 4.398543834686279, "learning_rate": 2.4906988128748013e-05, "loss": 1.2117, "step": 164600 }, { "epoch": 5.031465754261624, "grad_norm": 4.801534175872803, "learning_rate": 2.4891690123607884e-05, "loss": 1.1532, "step": 164700 }, { "epoch": 5.03452068185984, "grad_norm": 5.785711288452148, "learning_rate": 2.4876392118467752e-05, "loss": 1.3217, "step": 164800 }, { "epoch": 5.037575609458056, "grad_norm": 2.3183858394622803, "learning_rate": 2.4861094113327623e-05, "loss": 1.3079, "step": 164900 }, { "epoch": 5.040630537056272, "grad_norm": 1.9743845462799072, "learning_rate": 2.4845796108187494e-05, "loss": 1.1051, "step": 165000 }, { "epoch": 5.043685464654487, "grad_norm": 4.131516933441162, "learning_rate": 2.4830498103047365e-05, "loss": 1.0025, "step": 165100 }, { "epoch": 5.046740392252704, "grad_norm": 6.193573951721191, "learning_rate": 2.4815200097907233e-05, "loss": 1.2593, "step": 165200 }, { "epoch": 5.04979531985092, "grad_norm": 9.684208869934082, "learning_rate": 2.4799902092767104e-05, "loss": 1.1306, "step": 165300 }, { "epoch": 5.052850247449135, "grad_norm": 7.245005130767822, "learning_rate": 2.4784757067678376e-05, "loss": 1.0882, "step": 165400 }, { "epoch": 5.055905175047352, "grad_norm": 6.9910664558410645, "learning_rate": 2.4769459062538247e-05, "loss": 0.9976, 
"step": 165500 }, { "epoch": 5.0589601026455675, "grad_norm": 5.503900051116943, "learning_rate": 2.4754161057398118e-05, "loss": 1.0166, "step": 165600 }, { "epoch": 5.062015030243783, "grad_norm": 6.693299770355225, "learning_rate": 2.473886305225799e-05, "loss": 1.0534, "step": 165700 }, { "epoch": 5.065069957841999, "grad_norm": 5.447563171386719, "learning_rate": 2.4723565047117857e-05, "loss": 1.1812, "step": 165800 }, { "epoch": 5.068124885440215, "grad_norm": 6.907769680023193, "learning_rate": 2.4708267041977728e-05, "loss": 1.0964, "step": 165900 }, { "epoch": 5.071179813038431, "grad_norm": 9.287679672241211, "learning_rate": 2.46929690368376e-05, "loss": 1.1612, "step": 166000 }, { "epoch": 5.074234740636647, "grad_norm": 2.0512337684631348, "learning_rate": 2.467767103169747e-05, "loss": 1.0268, "step": 166100 }, { "epoch": 5.0772896682348625, "grad_norm": 2.716292381286621, "learning_rate": 2.4662373026557338e-05, "loss": 1.2245, "step": 166200 }, { "epoch": 5.080344595833079, "grad_norm": 13.274731636047363, "learning_rate": 2.464707502141721e-05, "loss": 1.3263, "step": 166300 }, { "epoch": 5.083399523431295, "grad_norm": 3.94521427154541, "learning_rate": 2.463177701627708e-05, "loss": 0.9961, "step": 166400 }, { "epoch": 5.08645445102951, "grad_norm": 7.5687456130981445, "learning_rate": 2.4616479011136948e-05, "loss": 1.3137, "step": 166500 }, { "epoch": 5.089509378627726, "grad_norm": 6.075013160705566, "learning_rate": 2.460118100599682e-05, "loss": 1.0343, "step": 166600 }, { "epoch": 5.092564306225943, "grad_norm": 13.179591178894043, "learning_rate": 2.4585883000856687e-05, "loss": 1.2662, "step": 166700 }, { "epoch": 5.095619233824158, "grad_norm": 12.164190292358398, "learning_rate": 2.4570584995716558e-05, "loss": 1.2506, "step": 166800 }, { "epoch": 5.098674161422374, "grad_norm": 7.566710472106934, "learning_rate": 2.455528699057643e-05, "loss": 1.2013, "step": 166900 }, { "epoch": 5.101729089020591, "grad_norm": 7.673246383666992, 
"learning_rate": 2.45399889854363e-05, "loss": 1.0592, "step": 167000 }, { "epoch": 5.104784016618806, "grad_norm": 9.131725311279297, "learning_rate": 2.452469098029617e-05, "loss": 1.098, "step": 167100 }, { "epoch": 5.107838944217022, "grad_norm": 4.740269184112549, "learning_rate": 2.450939297515604e-05, "loss": 1.0963, "step": 167200 }, { "epoch": 5.110893871815238, "grad_norm": 12.692572593688965, "learning_rate": 2.449409497001591e-05, "loss": 1.2042, "step": 167300 }, { "epoch": 5.113948799413454, "grad_norm": 10.311156272888184, "learning_rate": 2.4478796964875782e-05, "loss": 1.0407, "step": 167400 }, { "epoch": 5.11700372701167, "grad_norm": 13.6527681350708, "learning_rate": 2.4463804919838453e-05, "loss": 1.3487, "step": 167500 }, { "epoch": 5.120058654609886, "grad_norm": 6.338456153869629, "learning_rate": 2.4448506914698325e-05, "loss": 1.1242, "step": 167600 }, { "epoch": 5.123113582208101, "grad_norm": 9.074694633483887, "learning_rate": 2.4433208909558196e-05, "loss": 1.0838, "step": 167700 }, { "epoch": 5.126168509806318, "grad_norm": 4.044082164764404, "learning_rate": 2.4417910904418063e-05, "loss": 1.0163, "step": 167800 }, { "epoch": 5.129223437404534, "grad_norm": 6.65750789642334, "learning_rate": 2.4402612899277935e-05, "loss": 1.1827, "step": 167900 }, { "epoch": 5.132278365002749, "grad_norm": 10.863242149353027, "learning_rate": 2.4387314894137806e-05, "loss": 1.0881, "step": 168000 }, { "epoch": 5.135333292600965, "grad_norm": 4.646780967712402, "learning_rate": 2.4372016888997677e-05, "loss": 0.9714, "step": 168100 }, { "epoch": 5.1383882201991815, "grad_norm": 8.3972749710083, "learning_rate": 2.4356718883857545e-05, "loss": 1.0478, "step": 168200 }, { "epoch": 5.141443147797397, "grad_norm": 9.871989250183105, "learning_rate": 2.4341420878717416e-05, "loss": 1.4897, "step": 168300 }, { "epoch": 5.144498075395613, "grad_norm": 9.105977058410645, "learning_rate": 2.4326122873577287e-05, "loss": 1.1656, "step": 168400 }, { "epoch": 
5.1475530029938295, "grad_norm": 9.940979957580566, "learning_rate": 2.4310824868437158e-05, "loss": 1.2031, "step": 168500 }, { "epoch": 5.150607930592045, "grad_norm": 4.936947345733643, "learning_rate": 2.4295526863297026e-05, "loss": 0.9928, "step": 168600 }, { "epoch": 5.153662858190261, "grad_norm": 5.739433765411377, "learning_rate": 2.4280228858156897e-05, "loss": 1.3259, "step": 168700 }, { "epoch": 5.1567177857884765, "grad_norm": 3.817854404449463, "learning_rate": 2.4264930853016768e-05, "loss": 1.2278, "step": 168800 }, { "epoch": 5.159772713386693, "grad_norm": 6.3019633293151855, "learning_rate": 2.424963284787664e-05, "loss": 1.1436, "step": 168900 }, { "epoch": 5.162827640984909, "grad_norm": 6.779687404632568, "learning_rate": 2.4234334842736507e-05, "loss": 1.3622, "step": 169000 }, { "epoch": 5.1658825685831244, "grad_norm": 4.4668731689453125, "learning_rate": 2.421903683759638e-05, "loss": 1.2539, "step": 169100 }, { "epoch": 5.16893749618134, "grad_norm": 6.3263068199157715, "learning_rate": 2.420373883245625e-05, "loss": 1.0856, "step": 169200 }, { "epoch": 5.171992423779557, "grad_norm": 13.243605613708496, "learning_rate": 2.418844082731612e-05, "loss": 1.1165, "step": 169300 }, { "epoch": 5.175047351377772, "grad_norm": 3.979931354522705, "learning_rate": 2.417314282217599e-05, "loss": 0.9953, "step": 169400 }, { "epoch": 5.178102278975988, "grad_norm": 5.5036821365356445, "learning_rate": 2.415784481703586e-05, "loss": 1.0252, "step": 169500 }, { "epoch": 5.181157206574204, "grad_norm": 3.225660562515259, "learning_rate": 2.414254681189573e-05, "loss": 1.1752, "step": 169600 }, { "epoch": 5.18421213417242, "grad_norm": 6.257805824279785, "learning_rate": 2.4127248806755602e-05, "loss": 0.94, "step": 169700 }, { "epoch": 5.187267061770636, "grad_norm": 2.3778061866760254, "learning_rate": 2.411195080161547e-05, "loss": 1.138, "step": 169800 }, { "epoch": 5.190321989368852, "grad_norm": 10.44018840789795, "learning_rate": 
2.409665279647534e-05, "loss": 1.2677, "step": 169900 }, { "epoch": 5.193376916967068, "grad_norm": 6.572098255157471, "learning_rate": 2.4081354791335212e-05, "loss": 1.1052, "step": 170000 }, { "epoch": 5.196431844565284, "grad_norm": 4.0942463874816895, "learning_rate": 2.4066056786195083e-05, "loss": 1.229, "step": 170100 }, { "epoch": 5.1994867721635, "grad_norm": 3.8638057708740234, "learning_rate": 2.405075878105495e-05, "loss": 1.1286, "step": 170200 }, { "epoch": 5.202541699761715, "grad_norm": 5.156481742858887, "learning_rate": 2.4035460775914822e-05, "loss": 1.4408, "step": 170300 }, { "epoch": 5.205596627359932, "grad_norm": 8.467106819152832, "learning_rate": 2.4020162770774693e-05, "loss": 1.1772, "step": 170400 }, { "epoch": 5.208651554958148, "grad_norm": 7.49545431137085, "learning_rate": 2.4004864765634564e-05, "loss": 1.1276, "step": 170500 }, { "epoch": 5.211706482556363, "grad_norm": 5.997565746307373, "learning_rate": 2.3989566760494432e-05, "loss": 1.0463, "step": 170600 }, { "epoch": 5.214761410154579, "grad_norm": 1.3201504945755005, "learning_rate": 2.3974421735405707e-05, "loss": 1.1142, "step": 170700 }, { "epoch": 5.2178163377527955, "grad_norm": 4.374696731567383, "learning_rate": 2.3959123730265575e-05, "loss": 1.1381, "step": 170800 }, { "epoch": 5.220871265351011, "grad_norm": 9.526435852050781, "learning_rate": 2.3943825725125446e-05, "loss": 1.3192, "step": 170900 }, { "epoch": 5.223926192949227, "grad_norm": 4.740625858306885, "learning_rate": 2.3928527719985317e-05, "loss": 0.9071, "step": 171000 }, { "epoch": 5.226981120547443, "grad_norm": 2.8437039852142334, "learning_rate": 2.3913229714845188e-05, "loss": 1.136, "step": 171100 }, { "epoch": 5.230036048145659, "grad_norm": 8.31369686126709, "learning_rate": 2.3897931709705056e-05, "loss": 0.9906, "step": 171200 }, { "epoch": 5.233090975743875, "grad_norm": 6.5492401123046875, "learning_rate": 2.3882633704564924e-05, "loss": 1.0642, "step": 171300 }, { "epoch": 
5.2361459033420905, "grad_norm": 13.388254165649414, "learning_rate": 2.3867335699424795e-05, "loss": 1.1662, "step": 171400 }, { "epoch": 5.239200830940307, "grad_norm": 13.241950035095215, "learning_rate": 2.3852037694284666e-05, "loss": 1.1376, "step": 171500 }, { "epoch": 5.242255758538523, "grad_norm": 15.0129976272583, "learning_rate": 2.3836739689144537e-05, "loss": 1.0281, "step": 171600 }, { "epoch": 5.2453106861367385, "grad_norm": 2.2022969722747803, "learning_rate": 2.3821441684004405e-05, "loss": 1.2516, "step": 171700 }, { "epoch": 5.248365613734954, "grad_norm": 8.645432472229004, "learning_rate": 2.3806143678864276e-05, "loss": 1.0883, "step": 171800 }, { "epoch": 5.251420541333171, "grad_norm": 11.106043815612793, "learning_rate": 2.3790845673724147e-05, "loss": 0.926, "step": 171900 }, { "epoch": 5.254475468931386, "grad_norm": 3.1398587226867676, "learning_rate": 2.377554766858402e-05, "loss": 1.0641, "step": 172000 }, { "epoch": 5.257530396529602, "grad_norm": 9.555487632751465, "learning_rate": 2.3760249663443886e-05, "loss": 1.0317, "step": 172100 }, { "epoch": 5.260585324127818, "grad_norm": 10.592201232910156, "learning_rate": 2.3744951658303757e-05, "loss": 1.0764, "step": 172200 }, { "epoch": 5.263640251726034, "grad_norm": 11.372227668762207, "learning_rate": 2.372965365316363e-05, "loss": 1.1134, "step": 172300 }, { "epoch": 5.26669517932425, "grad_norm": 3.5038912296295166, "learning_rate": 2.37143556480235e-05, "loss": 1.0342, "step": 172400 }, { "epoch": 5.269750106922466, "grad_norm": 9.717251777648926, "learning_rate": 2.3699057642883367e-05, "loss": 1.1635, "step": 172500 }, { "epoch": 5.272805034520681, "grad_norm": 13.201680183410645, "learning_rate": 2.368375963774324e-05, "loss": 1.2265, "step": 172600 }, { "epoch": 5.275859962118898, "grad_norm": 6.547023773193359, "learning_rate": 2.366846163260311e-05, "loss": 1.0407, "step": 172700 }, { "epoch": 5.278914889717114, "grad_norm": 3.413754463195801, "learning_rate": 
2.365316362746298e-05, "loss": 1.2009, "step": 172800 }, { "epoch": 5.281969817315329, "grad_norm": 3.3388729095458984, "learning_rate": 2.363786562232285e-05, "loss": 0.9942, "step": 172900 }, { "epoch": 5.285024744913546, "grad_norm": 3.213888645172119, "learning_rate": 2.362256761718272e-05, "loss": 1.1865, "step": 173000 }, { "epoch": 5.288079672511762, "grad_norm": 7.584039688110352, "learning_rate": 2.360726961204259e-05, "loss": 1.1066, "step": 173100 }, { "epoch": 5.291134600109977, "grad_norm": 9.122584342956543, "learning_rate": 2.3591971606902462e-05, "loss": 1.0166, "step": 173200 }, { "epoch": 5.294189527708193, "grad_norm": 14.75819206237793, "learning_rate": 2.357667360176233e-05, "loss": 1.2447, "step": 173300 }, { "epoch": 5.2972444553064095, "grad_norm": 3.378222703933716, "learning_rate": 2.35615285766736e-05, "loss": 1.1964, "step": 173400 }, { "epoch": 5.300299382904625, "grad_norm": 1.2886375188827515, "learning_rate": 2.3546230571533472e-05, "loss": 1.1858, "step": 173500 }, { "epoch": 5.303354310502841, "grad_norm": 14.222728729248047, "learning_rate": 2.3530932566393344e-05, "loss": 1.3939, "step": 173600 }, { "epoch": 5.3064092381010575, "grad_norm": 7.933919906616211, "learning_rate": 2.3515634561253215e-05, "loss": 1.0642, "step": 173700 }, { "epoch": 5.309464165699273, "grad_norm": 7.706429958343506, "learning_rate": 2.3500336556113082e-05, "loss": 1.0335, "step": 173800 }, { "epoch": 5.312519093297489, "grad_norm": 11.539246559143066, "learning_rate": 2.3485038550972954e-05, "loss": 1.0982, "step": 173900 }, { "epoch": 5.3155740208957045, "grad_norm": 8.754623413085938, "learning_rate": 2.3469740545832825e-05, "loss": 0.9977, "step": 174000 }, { "epoch": 5.318628948493921, "grad_norm": 8.987189292907715, "learning_rate": 2.3454442540692696e-05, "loss": 1.176, "step": 174100 }, { "epoch": 5.321683876092137, "grad_norm": 15.71728515625, "learning_rate": 2.3439144535552564e-05, "loss": 1.0027, "step": 174200 }, { "epoch": 
5.3247388036903525, "grad_norm": 8.85295295715332, "learning_rate": 2.3423846530412435e-05, "loss": 1.1556, "step": 174300 }, { "epoch": 5.327793731288568, "grad_norm": 20.38521957397461, "learning_rate": 2.3408548525272306e-05, "loss": 1.2025, "step": 174400 }, { "epoch": 5.330848658886785, "grad_norm": 13.312535285949707, "learning_rate": 2.3393250520132177e-05, "loss": 1.0994, "step": 174500 }, { "epoch": 5.333903586485, "grad_norm": 0.015094890259206295, "learning_rate": 2.3377952514992045e-05, "loss": 1.0021, "step": 174600 }, { "epoch": 5.336958514083216, "grad_norm": 8.27639102935791, "learning_rate": 2.3362654509851916e-05, "loss": 1.0325, "step": 174700 }, { "epoch": 5.340013441681432, "grad_norm": 9.89177417755127, "learning_rate": 2.3347356504711787e-05, "loss": 1.0845, "step": 174800 }, { "epoch": 5.343068369279648, "grad_norm": 6.068204879760742, "learning_rate": 2.333205849957166e-05, "loss": 1.0179, "step": 174900 }, { "epoch": 5.346123296877864, "grad_norm": 5.956294536590576, "learning_rate": 2.3316760494431526e-05, "loss": 1.245, "step": 175000 }, { "epoch": 5.34917822447608, "grad_norm": 5.717597007751465, "learning_rate": 2.3301462489291397e-05, "loss": 1.1191, "step": 175100 }, { "epoch": 5.352233152074296, "grad_norm": 17.489805221557617, "learning_rate": 2.328616448415127e-05, "loss": 1.2599, "step": 175200 }, { "epoch": 5.355288079672512, "grad_norm": 4.305554389953613, "learning_rate": 2.327086647901114e-05, "loss": 1.1257, "step": 175300 }, { "epoch": 5.358343007270728, "grad_norm": 16.285791397094727, "learning_rate": 2.3255568473871007e-05, "loss": 1.2983, "step": 175400 }, { "epoch": 5.361397934868943, "grad_norm": 4.100075721740723, "learning_rate": 2.324027046873088e-05, "loss": 1.0178, "step": 175500 }, { "epoch": 5.36445286246716, "grad_norm": 7.133657932281494, "learning_rate": 2.322497246359075e-05, "loss": 1.1496, "step": 175600 }, { "epoch": 5.367507790065376, "grad_norm": 8.204270362854004, "learning_rate": 
2.320967445845062e-05, "loss": 1.0097, "step": 175700 }, { "epoch": 5.370562717663591, "grad_norm": 3.40224289894104, "learning_rate": 2.319437645331049e-05, "loss": 1.4796, "step": 175800 }, { "epoch": 5.373617645261807, "grad_norm": 5.546578884124756, "learning_rate": 2.317907844817036e-05, "loss": 1.4034, "step": 175900 }, { "epoch": 5.3766725728600235, "grad_norm": 7.096791744232178, "learning_rate": 2.316378044303023e-05, "loss": 1.0794, "step": 176000 }, { "epoch": 5.379727500458239, "grad_norm": 9.707426071166992, "learning_rate": 2.3148482437890102e-05, "loss": 0.9545, "step": 176100 }, { "epoch": 5.382782428056455, "grad_norm": 13.694910049438477, "learning_rate": 2.313318443274997e-05, "loss": 0.9155, "step": 176200 }, { "epoch": 5.385837355654671, "grad_norm": 8.697030067443848, "learning_rate": 2.311788642760984e-05, "loss": 1.1714, "step": 176300 }, { "epoch": 5.388892283252887, "grad_norm": 8.831588745117188, "learning_rate": 2.3102588422469712e-05, "loss": 1.022, "step": 176400 }, { "epoch": 5.391947210851103, "grad_norm": 9.650738716125488, "learning_rate": 2.3087290417329583e-05, "loss": 1.1336, "step": 176500 }, { "epoch": 5.3950021384493185, "grad_norm": 5.9181227684021, "learning_rate": 2.307199241218945e-05, "loss": 1.1614, "step": 176600 }, { "epoch": 5.398057066047535, "grad_norm": 19.085289001464844, "learning_rate": 2.3056694407049322e-05, "loss": 1.26, "step": 176700 }, { "epoch": 5.401111993645751, "grad_norm": 4.559824466705322, "learning_rate": 2.3041396401909193e-05, "loss": 1.0525, "step": 176800 }, { "epoch": 5.4041669212439665, "grad_norm": 7.130990505218506, "learning_rate": 2.3026098396769065e-05, "loss": 1.1655, "step": 176900 }, { "epoch": 5.407221848842182, "grad_norm": 8.152735710144043, "learning_rate": 2.3010800391628932e-05, "loss": 1.0988, "step": 177000 }, { "epoch": 5.410276776440399, "grad_norm": 0.021126531064510345, "learning_rate": 2.2995502386488804e-05, "loss": 1.2543, "step": 177100 }, { "epoch": 
5.413331704038614, "grad_norm": 12.942683219909668, "learning_rate": 2.2980204381348675e-05, "loss": 0.9339, "step": 177200 }, { "epoch": 5.41638663163683, "grad_norm": 5.814689636230469, "learning_rate": 2.2964906376208546e-05, "loss": 1.0118, "step": 177300 }, { "epoch": 5.419441559235046, "grad_norm": 4.256146430969238, "learning_rate": 2.2949761351119814e-05, "loss": 1.1236, "step": 177400 }, { "epoch": 5.422496486833262, "grad_norm": 5.030366897583008, "learning_rate": 2.2934463345979685e-05, "loss": 1.221, "step": 177500 }, { "epoch": 5.425551414431478, "grad_norm": 11.197477340698242, "learning_rate": 2.2919165340839556e-05, "loss": 1.3183, "step": 177600 }, { "epoch": 5.428606342029694, "grad_norm": 4.696968078613281, "learning_rate": 2.2903867335699424e-05, "loss": 1.2085, "step": 177700 }, { "epoch": 5.431661269627909, "grad_norm": 3.7039284706115723, "learning_rate": 2.2888569330559295e-05, "loss": 1.2063, "step": 177800 }, { "epoch": 5.434716197226126, "grad_norm": 4.057086944580078, "learning_rate": 2.2873271325419166e-05, "loss": 1.1562, "step": 177900 }, { "epoch": 5.437771124824342, "grad_norm": 9.291214942932129, "learning_rate": 2.2857973320279037e-05, "loss": 1.2057, "step": 178000 }, { "epoch": 5.440826052422557, "grad_norm": 3.367495536804199, "learning_rate": 2.2842675315138905e-05, "loss": 1.0456, "step": 178100 }, { "epoch": 5.443880980020774, "grad_norm": 10.886100769042969, "learning_rate": 2.2827377309998776e-05, "loss": 1.4536, "step": 178200 }, { "epoch": 5.44693590761899, "grad_norm": 11.006861686706543, "learning_rate": 2.2812079304858647e-05, "loss": 1.4364, "step": 178300 }, { "epoch": 5.449990835217205, "grad_norm": 21.764328002929688, "learning_rate": 2.279693427976992e-05, "loss": 1.1304, "step": 178400 }, { "epoch": 5.453045762815421, "grad_norm": 1.3549553155899048, "learning_rate": 2.278163627462979e-05, "loss": 1.3561, "step": 178500 }, { "epoch": 5.4561006904136375, "grad_norm": 4.178231716156006, "learning_rate": 
2.2766338269489658e-05, "loss": 1.1695, "step": 178600 }, { "epoch": 5.459155618011853, "grad_norm": 7.525631427764893, "learning_rate": 2.275104026434953e-05, "loss": 1.397, "step": 178700 }, { "epoch": 5.462210545610069, "grad_norm": 7.3762593269348145, "learning_rate": 2.27357422592094e-05, "loss": 1.2551, "step": 178800 }, { "epoch": 5.465265473208285, "grad_norm": 12.109660148620605, "learning_rate": 2.272044425406927e-05, "loss": 1.3221, "step": 178900 }, { "epoch": 5.468320400806501, "grad_norm": 8.797658920288086, "learning_rate": 2.270514624892914e-05, "loss": 1.1983, "step": 179000 }, { "epoch": 5.471375328404717, "grad_norm": 4.56755256652832, "learning_rate": 2.268984824378901e-05, "loss": 1.0926, "step": 179100 }, { "epoch": 5.4744302560029325, "grad_norm": 4.608061790466309, "learning_rate": 2.267455023864888e-05, "loss": 1.2058, "step": 179200 }, { "epoch": 5.477485183601148, "grad_norm": 8.531103134155273, "learning_rate": 2.2659252233508752e-05, "loss": 1.3427, "step": 179300 }, { "epoch": 5.480540111199365, "grad_norm": 4.44692325592041, "learning_rate": 2.264395422836862e-05, "loss": 1.063, "step": 179400 }, { "epoch": 5.4835950387975805, "grad_norm": 14.634761810302734, "learning_rate": 2.262865622322849e-05, "loss": 1.4712, "step": 179500 }, { "epoch": 5.486649966395796, "grad_norm": 3.928048849105835, "learning_rate": 2.2613358218088363e-05, "loss": 0.9833, "step": 179600 }, { "epoch": 5.489704893994013, "grad_norm": 7.0261030197143555, "learning_rate": 2.2598060212948234e-05, "loss": 1.0317, "step": 179700 }, { "epoch": 5.492759821592228, "grad_norm": 5.174136638641357, "learning_rate": 2.25827622078081e-05, "loss": 1.1652, "step": 179800 }, { "epoch": 5.495814749190444, "grad_norm": 7.133640766143799, "learning_rate": 2.2567464202667973e-05, "loss": 1.1527, "step": 179900 }, { "epoch": 5.49886967678866, "grad_norm": 13.290231704711914, "learning_rate": 2.2552166197527844e-05, "loss": 1.2461, "step": 180000 }, { "epoch": 5.501924604386876, 
"grad_norm": 2.7865355014801025, "learning_rate": 2.2536868192387715e-05, "loss": 1.0835, "step": 180100 }, { "epoch": 5.504979531985092, "grad_norm": 3.439481496810913, "learning_rate": 2.2521570187247583e-05, "loss": 1.2283, "step": 180200 }, { "epoch": 5.508034459583308, "grad_norm": 16.009737014770508, "learning_rate": 2.2506272182107454e-05, "loss": 0.9926, "step": 180300 }, { "epoch": 5.511089387181523, "grad_norm": 8.959451675415039, "learning_rate": 2.2490974176967325e-05, "loss": 1.5304, "step": 180400 }, { "epoch": 5.51414431477974, "grad_norm": 10.707795143127441, "learning_rate": 2.2475676171827196e-05, "loss": 1.1731, "step": 180500 }, { "epoch": 5.517199242377956, "grad_norm": 8.683422088623047, "learning_rate": 2.2460378166687064e-05, "loss": 1.1361, "step": 180600 }, { "epoch": 5.520254169976171, "grad_norm": 6.5314507484436035, "learning_rate": 2.2445080161546935e-05, "loss": 1.1169, "step": 180700 }, { "epoch": 5.523309097574387, "grad_norm": 5.770356178283691, "learning_rate": 2.2429782156406806e-05, "loss": 1.1458, "step": 180800 }, { "epoch": 5.526364025172604, "grad_norm": 5.172116756439209, "learning_rate": 2.2414484151266677e-05, "loss": 1.2962, "step": 180900 }, { "epoch": 5.529418952770819, "grad_norm": 3.7103822231292725, "learning_rate": 2.2399186146126545e-05, "loss": 1.1349, "step": 181000 }, { "epoch": 5.532473880369035, "grad_norm": 1.0182088613510132, "learning_rate": 2.2383888140986416e-05, "loss": 1.2462, "step": 181100 }, { "epoch": 5.5355288079672516, "grad_norm": 14.949552536010742, "learning_rate": 2.2368590135846287e-05, "loss": 1.3082, "step": 181200 }, { "epoch": 5.538583735565467, "grad_norm": 18.954626083374023, "learning_rate": 2.235329213070616e-05, "loss": 1.081, "step": 181300 }, { "epoch": 5.541638663163683, "grad_norm": 5.126280307769775, "learning_rate": 2.2337994125566026e-05, "loss": 1.145, "step": 181400 }, { "epoch": 5.544693590761899, "grad_norm": 3.3761730194091797, "learning_rate": 2.2322696120425898e-05, 
"loss": 1.1507, "step": 181500 }, { "epoch": 5.547748518360115, "grad_norm": 9.969486236572266, "learning_rate": 2.230739811528577e-05, "loss": 1.0478, "step": 181600 }, { "epoch": 5.550803445958331, "grad_norm": 17.782325744628906, "learning_rate": 2.229210011014564e-05, "loss": 1.0812, "step": 181700 }, { "epoch": 5.5538583735565465, "grad_norm": 8.68525505065918, "learning_rate": 2.2276802105005508e-05, "loss": 1.261, "step": 181800 }, { "epoch": 5.556913301154763, "grad_norm": 5.081483364105225, "learning_rate": 2.2261657079916782e-05, "loss": 1.0785, "step": 181900 }, { "epoch": 5.559968228752979, "grad_norm": 9.229408264160156, "learning_rate": 2.224635907477665e-05, "loss": 1.515, "step": 182000 }, { "epoch": 5.5630231563511945, "grad_norm": 5.441746234893799, "learning_rate": 2.223106106963652e-05, "loss": 0.9432, "step": 182100 }, { "epoch": 5.56607808394941, "grad_norm": 10.51654052734375, "learning_rate": 2.2215763064496392e-05, "loss": 1.0654, "step": 182200 }, { "epoch": 5.569133011547626, "grad_norm": 4.113022327423096, "learning_rate": 2.220046505935626e-05, "loss": 1.1169, "step": 182300 }, { "epoch": 5.572187939145842, "grad_norm": 5.066014289855957, "learning_rate": 2.218516705421613e-05, "loss": 1.0663, "step": 182400 }, { "epoch": 5.575242866744058, "grad_norm": 4.2851104736328125, "learning_rate": 2.2169869049076e-05, "loss": 1.4151, "step": 182500 }, { "epoch": 5.578297794342274, "grad_norm": 7.80756139755249, "learning_rate": 2.215457104393587e-05, "loss": 0.9683, "step": 182600 }, { "epoch": 5.58135272194049, "grad_norm": 16.59423828125, "learning_rate": 2.213927303879574e-05, "loss": 1.1639, "step": 182700 }, { "epoch": 5.584407649538706, "grad_norm": 12.586332321166992, "learning_rate": 2.2123975033655613e-05, "loss": 1.1313, "step": 182800 }, { "epoch": 5.587462577136922, "grad_norm": 3.1343181133270264, "learning_rate": 2.210867702851548e-05, "loss": 0.9193, "step": 182900 }, { "epoch": 5.590517504735137, "grad_norm": 11.592085838317871, 
"learning_rate": 2.209337902337535e-05, "loss": 1.2895, "step": 183000 }, { "epoch": 5.593572432333354, "grad_norm": 3.3230690956115723, "learning_rate": 2.2078081018235223e-05, "loss": 1.092, "step": 183100 }, { "epoch": 5.59662735993157, "grad_norm": 7.102207660675049, "learning_rate": 2.2062783013095094e-05, "loss": 1.1058, "step": 183200 }, { "epoch": 5.599682287529785, "grad_norm": 6.228555202484131, "learning_rate": 2.204748500795496e-05, "loss": 1.0174, "step": 183300 }, { "epoch": 5.602737215128002, "grad_norm": 7.223354339599609, "learning_rate": 2.2032187002814833e-05, "loss": 1.0644, "step": 183400 }, { "epoch": 5.605792142726218, "grad_norm": 5.12488317489624, "learning_rate": 2.2016888997674704e-05, "loss": 1.0768, "step": 183500 }, { "epoch": 5.608847070324433, "grad_norm": 2.4839391708374023, "learning_rate": 2.2001590992534575e-05, "loss": 1.2211, "step": 183600 }, { "epoch": 5.611901997922649, "grad_norm": 10.138099670410156, "learning_rate": 2.1986292987394443e-05, "loss": 1.3718, "step": 183700 }, { "epoch": 5.614956925520865, "grad_norm": 5.2676682472229, "learning_rate": 2.1970994982254314e-05, "loss": 1.0689, "step": 183800 }, { "epoch": 5.618011853119081, "grad_norm": 1.3094004392623901, "learning_rate": 2.1955696977114185e-05, "loss": 1.1548, "step": 183900 }, { "epoch": 5.621066780717297, "grad_norm": 4.16325569152832, "learning_rate": 2.1940398971974056e-05, "loss": 1.1477, "step": 184000 }, { "epoch": 5.624121708315513, "grad_norm": 10.696335792541504, "learning_rate": 2.1925100966833924e-05, "loss": 1.0798, "step": 184100 }, { "epoch": 5.627176635913729, "grad_norm": 2.4368295669555664, "learning_rate": 2.1909802961693795e-05, "loss": 0.9771, "step": 184200 }, { "epoch": 5.630231563511945, "grad_norm": 4.798482894897461, "learning_rate": 2.1894504956553666e-05, "loss": 0.9914, "step": 184300 }, { "epoch": 5.6332864911101606, "grad_norm": 5.789841651916504, "learning_rate": 2.1879206951413538e-05, "loss": 1.2666, "step": 184400 }, { 
"epoch": 5.636341418708376, "grad_norm": 3.598188638687134, "learning_rate": 2.1863908946273405e-05, "loss": 0.9833, "step": 184500 }, { "epoch": 5.639396346306593, "grad_norm": 10.746664047241211, "learning_rate": 2.1848610941133277e-05, "loss": 1.0929, "step": 184600 }, { "epoch": 5.6424512739048085, "grad_norm": 11.472532272338867, "learning_rate": 2.1833312935993148e-05, "loss": 1.1305, "step": 184700 }, { "epoch": 5.645506201503024, "grad_norm": 3.4682164192199707, "learning_rate": 2.181801493085302e-05, "loss": 0.9999, "step": 184800 }, { "epoch": 5.648561129101241, "grad_norm": 5.039991855621338, "learning_rate": 2.1802716925712887e-05, "loss": 1.0, "step": 184900 }, { "epoch": 5.651616056699456, "grad_norm": 22.90473175048828, "learning_rate": 2.1787418920572758e-05, "loss": 1.366, "step": 185000 }, { "epoch": 5.654670984297672, "grad_norm": 4.122875213623047, "learning_rate": 2.177212091543263e-05, "loss": 1.1675, "step": 185100 }, { "epoch": 5.657725911895888, "grad_norm": 3.3970232009887695, "learning_rate": 2.17568229102925e-05, "loss": 1.1691, "step": 185200 }, { "epoch": 5.660780839494104, "grad_norm": 17.90229034423828, "learning_rate": 2.1741524905152368e-05, "loss": 1.0621, "step": 185300 }, { "epoch": 5.66383576709232, "grad_norm": 8.32997989654541, "learning_rate": 2.172622690001224e-05, "loss": 1.0607, "step": 185400 }, { "epoch": 5.666890694690536, "grad_norm": 4.489802837371826, "learning_rate": 2.171092889487211e-05, "loss": 1.1645, "step": 185500 }, { "epoch": 5.669945622288751, "grad_norm": 9.886762619018555, "learning_rate": 2.169563088973198e-05, "loss": 1.0695, "step": 185600 }, { "epoch": 5.673000549886968, "grad_norm": 3.3725993633270264, "learning_rate": 2.168033288459185e-05, "loss": 0.9897, "step": 185700 }, { "epoch": 5.676055477485184, "grad_norm": 3.5722696781158447, "learning_rate": 2.166503487945172e-05, "loss": 1.1009, "step": 185800 }, { "epoch": 5.679110405083399, "grad_norm": 3.939537763595581, "learning_rate": 
2.164973687431159e-05, "loss": 1.1344, "step": 185900 }, { "epoch": 5.682165332681615, "grad_norm": 8.358038902282715, "learning_rate": 2.1634591849222863e-05, "loss": 0.9933, "step": 186000 }, { "epoch": 5.685220260279832, "grad_norm": 6.315106391906738, "learning_rate": 2.1619293844082734e-05, "loss": 1.0294, "step": 186100 }, { "epoch": 5.688275187878047, "grad_norm": 7.731125831604004, "learning_rate": 2.1603995838942605e-05, "loss": 1.1796, "step": 186200 }, { "epoch": 5.691330115476263, "grad_norm": 10.588342666625977, "learning_rate": 2.1588697833802473e-05, "loss": 1.0994, "step": 186300 }, { "epoch": 5.69438504307448, "grad_norm": 4.702441215515137, "learning_rate": 2.1573552808713744e-05, "loss": 1.3255, "step": 186400 }, { "epoch": 5.697439970672695, "grad_norm": 0.7584722638130188, "learning_rate": 2.1558254803573615e-05, "loss": 1.2591, "step": 186500 }, { "epoch": 5.700494898270911, "grad_norm": 3.2129814624786377, "learning_rate": 2.1542956798433486e-05, "loss": 0.942, "step": 186600 }, { "epoch": 5.703549825869127, "grad_norm": 6.426685810089111, "learning_rate": 2.1527658793293358e-05, "loss": 1.2921, "step": 186700 }, { "epoch": 5.706604753467343, "grad_norm": 4.396588325500488, "learning_rate": 2.1512360788153225e-05, "loss": 0.9987, "step": 186800 }, { "epoch": 5.709659681065559, "grad_norm": 4.9708428382873535, "learning_rate": 2.1497062783013097e-05, "loss": 1.2156, "step": 186900 }, { "epoch": 5.712714608663775, "grad_norm": 7.02872371673584, "learning_rate": 2.1481764777872968e-05, "loss": 0.9759, "step": 187000 }, { "epoch": 5.71576953626199, "grad_norm": 8.63855266571045, "learning_rate": 2.146646677273284e-05, "loss": 1.3669, "step": 187100 }, { "epoch": 5.718824463860207, "grad_norm": 21.115148544311523, "learning_rate": 2.1451168767592707e-05, "loss": 1.1757, "step": 187200 }, { "epoch": 5.7218793914584225, "grad_norm": 7.059637069702148, "learning_rate": 2.1435870762452578e-05, "loss": 1.0772, "step": 187300 }, { "epoch": 
5.724934319056638, "grad_norm": 5.446715831756592, "learning_rate": 2.142057275731245e-05, "loss": 1.1456, "step": 187400 }, { "epoch": 5.727989246654854, "grad_norm": 4.783729076385498, "learning_rate": 2.140527475217232e-05, "loss": 1.1111, "step": 187500 }, { "epoch": 5.7310441742530704, "grad_norm": 7.948205947875977, "learning_rate": 2.1389976747032188e-05, "loss": 1.2508, "step": 187600 }, { "epoch": 5.734099101851286, "grad_norm": 18.996597290039062, "learning_rate": 2.137467874189206e-05, "loss": 0.9658, "step": 187700 }, { "epoch": 5.737154029449502, "grad_norm": 5.666006088256836, "learning_rate": 2.1359380736751927e-05, "loss": 1.1193, "step": 187800 }, { "epoch": 5.740208957047718, "grad_norm": 3.910343885421753, "learning_rate": 2.1344082731611798e-05, "loss": 1.2442, "step": 187900 }, { "epoch": 5.743263884645934, "grad_norm": 1.8572306632995605, "learning_rate": 2.132878472647167e-05, "loss": 1.0246, "step": 188000 }, { "epoch": 5.74631881224415, "grad_norm": 7.690230846405029, "learning_rate": 2.1313486721331537e-05, "loss": 1.7488, "step": 188100 }, { "epoch": 5.749373739842365, "grad_norm": 5.309960842132568, "learning_rate": 2.1298188716191408e-05, "loss": 1.098, "step": 188200 }, { "epoch": 5.752428667440582, "grad_norm": 6.675844192504883, "learning_rate": 2.128289071105128e-05, "loss": 0.9723, "step": 188300 }, { "epoch": 5.755483595038798, "grad_norm": 14.125645637512207, "learning_rate": 2.126759270591115e-05, "loss": 1.0996, "step": 188400 }, { "epoch": 5.758538522637013, "grad_norm": 6.408461093902588, "learning_rate": 2.1252294700771018e-05, "loss": 1.081, "step": 188500 }, { "epoch": 5.761593450235229, "grad_norm": 6.611395835876465, "learning_rate": 2.123699669563089e-05, "loss": 0.9786, "step": 188600 }, { "epoch": 5.764648377833446, "grad_norm": 3.6658895015716553, "learning_rate": 2.122169869049076e-05, "loss": 1.1722, "step": 188700 }, { "epoch": 5.767703305431661, "grad_norm": 7.49069356918335, "learning_rate": 
2.120640068535063e-05, "loss": 1.0996, "step": 188800 }, { "epoch": 5.770758233029877, "grad_norm": 13.291098594665527, "learning_rate": 2.11911026802105e-05, "loss": 0.9886, "step": 188900 }, { "epoch": 5.773813160628093, "grad_norm": 6.7149977684021, "learning_rate": 2.1175957655121774e-05, "loss": 1.1279, "step": 189000 }, { "epoch": 5.776868088226309, "grad_norm": 5.4691948890686035, "learning_rate": 2.1160659649981642e-05, "loss": 1.1921, "step": 189100 }, { "epoch": 5.779923015824525, "grad_norm": 7.12191104888916, "learning_rate": 2.1145361644841513e-05, "loss": 1.1089, "step": 189200 }, { "epoch": 5.782977943422741, "grad_norm": 6.525013446807861, "learning_rate": 2.1130063639701384e-05, "loss": 1.1687, "step": 189300 }, { "epoch": 5.786032871020957, "grad_norm": 3.475522518157959, "learning_rate": 2.1114765634561255e-05, "loss": 1.0949, "step": 189400 }, { "epoch": 5.789087798619173, "grad_norm": 4.3520965576171875, "learning_rate": 2.1099467629421123e-05, "loss": 1.1071, "step": 189500 }, { "epoch": 5.792142726217389, "grad_norm": 24.487106323242188, "learning_rate": 2.1084169624280994e-05, "loss": 1.0645, "step": 189600 }, { "epoch": 5.795197653815604, "grad_norm": 11.095251083374023, "learning_rate": 2.1068871619140865e-05, "loss": 1.1057, "step": 189700 }, { "epoch": 5.798252581413821, "grad_norm": 6.842864513397217, "learning_rate": 2.1053573614000737e-05, "loss": 1.2524, "step": 189800 }, { "epoch": 5.8013075090120365, "grad_norm": 7.722935199737549, "learning_rate": 2.1038275608860604e-05, "loss": 1.0865, "step": 189900 }, { "epoch": 5.804362436610252, "grad_norm": 10.01695728302002, "learning_rate": 2.1022977603720476e-05, "loss": 1.1852, "step": 190000 }, { "epoch": 5.807417364208468, "grad_norm": 12.786285400390625, "learning_rate": 2.1007679598580347e-05, "loss": 1.0876, "step": 190100 }, { "epoch": 5.8104722918066845, "grad_norm": 5.113598823547363, "learning_rate": 2.0992381593440218e-05, "loss": 1.1766, "step": 190200 }, { "epoch": 
5.8135272194049, "grad_norm": 4.3457865715026855, "learning_rate": 2.0977083588300086e-05, "loss": 1.4302, "step": 190300 }, { "epoch": 5.816582147003116, "grad_norm": 11.015501976013184, "learning_rate": 2.0961785583159957e-05, "loss": 1.2411, "step": 190400 }, { "epoch": 5.8196370746013315, "grad_norm": 12.930902481079102, "learning_rate": 2.0946487578019828e-05, "loss": 0.9841, "step": 190500 }, { "epoch": 5.822692002199548, "grad_norm": 2.641270160675049, "learning_rate": 2.09311895728797e-05, "loss": 1.273, "step": 190600 }, { "epoch": 5.825746929797764, "grad_norm": 13.15196418762207, "learning_rate": 2.0915891567739567e-05, "loss": 1.4972, "step": 190700 }, { "epoch": 5.8288018573959794, "grad_norm": 4.2401814460754395, "learning_rate": 2.0900593562599438e-05, "loss": 1.2889, "step": 190800 }, { "epoch": 5.831856784994196, "grad_norm": 15.105935096740723, "learning_rate": 2.088529555745931e-05, "loss": 1.0851, "step": 190900 }, { "epoch": 5.834911712592412, "grad_norm": 2.798130750656128, "learning_rate": 2.086999755231918e-05, "loss": 1.1304, "step": 191000 }, { "epoch": 5.837966640190627, "grad_norm": 2.9408164024353027, "learning_rate": 2.0854699547179048e-05, "loss": 1.0078, "step": 191100 }, { "epoch": 5.841021567788843, "grad_norm": 18.773927688598633, "learning_rate": 2.083940154203892e-05, "loss": 1.1115, "step": 191200 }, { "epoch": 5.84407649538706, "grad_norm": 3.5359275341033936, "learning_rate": 2.082410353689879e-05, "loss": 1.2128, "step": 191300 }, { "epoch": 5.847131422985275, "grad_norm": 3.876824378967285, "learning_rate": 2.080880553175866e-05, "loss": 1.1701, "step": 191400 }, { "epoch": 5.850186350583491, "grad_norm": 5.664970397949219, "learning_rate": 2.079350752661853e-05, "loss": 1.3018, "step": 191500 }, { "epoch": 5.853241278181708, "grad_norm": 4.4520111083984375, "learning_rate": 2.07782095214784e-05, "loss": 1.0826, "step": 191600 }, { "epoch": 5.856296205779923, "grad_norm": 0.5348922610282898, "learning_rate": 
2.076291151633827e-05, "loss": 1.1042, "step": 191700 }, { "epoch": 5.859351133378139, "grad_norm": 5.47227668762207, "learning_rate": 2.0747766491249543e-05, "loss": 1.2615, "step": 191800 }, { "epoch": 5.862406060976355, "grad_norm": 6.208307266235352, "learning_rate": 2.0732468486109414e-05, "loss": 1.4392, "step": 191900 }, { "epoch": 5.86546098857457, "grad_norm": 15.780241966247559, "learning_rate": 2.0717170480969282e-05, "loss": 1.3018, "step": 192000 }, { "epoch": 5.868515916172787, "grad_norm": 5.232784271240234, "learning_rate": 2.0701872475829153e-05, "loss": 1.1271, "step": 192100 }, { "epoch": 5.871570843771003, "grad_norm": 17.728816986083984, "learning_rate": 2.0686574470689024e-05, "loss": 1.2694, "step": 192200 }, { "epoch": 5.874625771369218, "grad_norm": 13.02619743347168, "learning_rate": 2.0671276465548895e-05, "loss": 1.1319, "step": 192300 }, { "epoch": 5.877680698967435, "grad_norm": 3.499558448791504, "learning_rate": 2.0655978460408763e-05, "loss": 1.1393, "step": 192400 }, { "epoch": 5.8807356265656505, "grad_norm": 14.559880256652832, "learning_rate": 2.0640680455268634e-05, "loss": 1.004, "step": 192500 }, { "epoch": 5.883790554163866, "grad_norm": 13.002328872680664, "learning_rate": 2.0625382450128505e-05, "loss": 1.1475, "step": 192600 }, { "epoch": 5.886845481762082, "grad_norm": 7.72279691696167, "learning_rate": 2.0610084444988377e-05, "loss": 1.2198, "step": 192700 }, { "epoch": 5.8899004093602985, "grad_norm": 5.531588554382324, "learning_rate": 2.0594786439848244e-05, "loss": 1.0233, "step": 192800 }, { "epoch": 5.892955336958514, "grad_norm": 9.753616333007812, "learning_rate": 2.0579488434708116e-05, "loss": 1.304, "step": 192900 }, { "epoch": 5.89601026455673, "grad_norm": 7.747321605682373, "learning_rate": 2.0564190429567987e-05, "loss": 1.2117, "step": 193000 }, { "epoch": 5.899065192154946, "grad_norm": 4.747279644012451, "learning_rate": 2.0548892424427858e-05, "loss": 1.0239, "step": 193100 }, { "epoch": 
5.902120119753162, "grad_norm": 2.2190704345703125, "learning_rate": 2.0533594419287726e-05, "loss": 1.4906, "step": 193200 }, { "epoch": 5.905175047351378, "grad_norm": 12.153207778930664, "learning_rate": 2.0518296414147597e-05, "loss": 1.1606, "step": 193300 }, { "epoch": 5.9082299749495935, "grad_norm": 2.3085525035858154, "learning_rate": 2.0502998409007465e-05, "loss": 1.0179, "step": 193400 }, { "epoch": 5.911284902547809, "grad_norm": 1.8929448127746582, "learning_rate": 2.0487700403867336e-05, "loss": 1.1237, "step": 193500 }, { "epoch": 5.914339830146026, "grad_norm": 11.247891426086426, "learning_rate": 2.0472402398727207e-05, "loss": 1.0379, "step": 193600 }, { "epoch": 5.917394757744241, "grad_norm": 17.98173713684082, "learning_rate": 2.0457104393587075e-05, "loss": 1.149, "step": 193700 }, { "epoch": 5.920449685342457, "grad_norm": 6.99159049987793, "learning_rate": 2.0441806388446946e-05, "loss": 1.1825, "step": 193800 }, { "epoch": 5.923504612940674, "grad_norm": 6.678034782409668, "learning_rate": 2.0426508383306817e-05, "loss": 1.1868, "step": 193900 }, { "epoch": 5.926559540538889, "grad_norm": 1.5114043951034546, "learning_rate": 2.0411210378166688e-05, "loss": 1.0141, "step": 194000 }, { "epoch": 5.929614468137105, "grad_norm": 8.493728637695312, "learning_rate": 2.039606535307796e-05, "loss": 1.2834, "step": 194100 }, { "epoch": 5.932669395735321, "grad_norm": 4.0135602951049805, "learning_rate": 2.038076734793783e-05, "loss": 1.4871, "step": 194200 }, { "epoch": 5.935724323333537, "grad_norm": 19.089086532592773, "learning_rate": 2.03654693427977e-05, "loss": 1.229, "step": 194300 }, { "epoch": 5.938779250931753, "grad_norm": 3.464934825897217, "learning_rate": 2.035017133765757e-05, "loss": 1.4185, "step": 194400 }, { "epoch": 5.941834178529969, "grad_norm": 20.901973724365234, "learning_rate": 2.033487333251744e-05, "loss": 1.1335, "step": 194500 }, { "epoch": 5.944889106128185, "grad_norm": 2.50457763671875, "learning_rate": 
2.0319575327377312e-05, "loss": 1.1107, "step": 194600 }, { "epoch": 5.947944033726401, "grad_norm": 3.334287166595459, "learning_rate": 2.030427732223718e-05, "loss": 1.0931, "step": 194700 }, { "epoch": 5.950998961324617, "grad_norm": 6.1684980392456055, "learning_rate": 2.028897931709705e-05, "loss": 1.1948, "step": 194800 }, { "epoch": 5.954053888922832, "grad_norm": 3.7260806560516357, "learning_rate": 2.0273681311956922e-05, "loss": 1.0075, "step": 194900 }, { "epoch": 5.957108816521048, "grad_norm": 6.643927574157715, "learning_rate": 2.0258383306816793e-05, "loss": 1.0888, "step": 195000 }, { "epoch": 5.9601637441192645, "grad_norm": 6.050364971160889, "learning_rate": 2.024308530167666e-05, "loss": 1.1477, "step": 195100 }, { "epoch": 5.96321867171748, "grad_norm": 2.7299838066101074, "learning_rate": 2.0227787296536532e-05, "loss": 1.1081, "step": 195200 }, { "epoch": 5.966273599315696, "grad_norm": 8.811811447143555, "learning_rate": 2.0212489291396403e-05, "loss": 1.0636, "step": 195300 }, { "epoch": 5.9693285269139125, "grad_norm": 9.84276294708252, "learning_rate": 2.0197191286256274e-05, "loss": 1.3142, "step": 195400 }, { "epoch": 5.972383454512128, "grad_norm": 12.914100646972656, "learning_rate": 2.0181893281116142e-05, "loss": 1.1995, "step": 195500 }, { "epoch": 5.975438382110344, "grad_norm": 3.8939926624298096, "learning_rate": 2.0166595275976013e-05, "loss": 0.9676, "step": 195600 }, { "epoch": 5.9784933097085595, "grad_norm": 9.499131202697754, "learning_rate": 2.0151297270835884e-05, "loss": 1.036, "step": 195700 }, { "epoch": 5.981548237306776, "grad_norm": 8.472640037536621, "learning_rate": 2.0135999265695756e-05, "loss": 1.2044, "step": 195800 }, { "epoch": 5.984603164904992, "grad_norm": 7.215255260467529, "learning_rate": 2.0120701260555623e-05, "loss": 1.2218, "step": 195900 }, { "epoch": 5.9876580925032075, "grad_norm": 18.55949592590332, "learning_rate": 2.0105403255415495e-05, "loss": 1.0842, "step": 196000 }, { "epoch": 
5.990713020101424, "grad_norm": 2.0042617321014404, "learning_rate": 2.0090105250275366e-05, "loss": 1.2315, "step": 196100 }, { "epoch": 5.99376794769964, "grad_norm": 3.453805923461914, "learning_rate": 2.0074807245135237e-05, "loss": 1.0177, "step": 196200 }, { "epoch": 5.996822875297855, "grad_norm": 6.27379035949707, "learning_rate": 2.0059509239995105e-05, "loss": 1.4143, "step": 196300 }, { "epoch": 5.999877802896071, "grad_norm": 2.6724729537963867, "learning_rate": 2.0044211234854976e-05, "loss": 1.206, "step": 196400 }, { "epoch": 6.0, "eval_accuracy": 0.7196187450357426, "eval_loss": 0.7456799745559692, "eval_runtime": 1781.9466, "eval_samples_per_second": 18.37, "eval_steps_per_second": 4.593, "step": 196404 }, { "epoch": 6.002932730494288, "grad_norm": 8.19288158416748, "learning_rate": 2.0028913229714847e-05, "loss": 1.233, "step": 196500 }, { "epoch": 6.005987658092503, "grad_norm": 5.379980087280273, "learning_rate": 2.0013615224574718e-05, "loss": 1.0025, "step": 196600 }, { "epoch": 6.009042585690719, "grad_norm": 8.692319869995117, "learning_rate": 1.9998317219434586e-05, "loss": 1.0007, "step": 196700 }, { "epoch": 6.012097513288935, "grad_norm": 3.728940963745117, "learning_rate": 1.9983019214294457e-05, "loss": 1.3789, "step": 196800 }, { "epoch": 6.015152440887151, "grad_norm": 14.83003044128418, "learning_rate": 1.9967721209154328e-05, "loss": 1.2283, "step": 196900 }, { "epoch": 6.018207368485367, "grad_norm": 10.604146003723145, "learning_rate": 1.99524232040142e-05, "loss": 1.1086, "step": 197000 }, { "epoch": 6.021262296083583, "grad_norm": 2.400364875793457, "learning_rate": 1.993727817892547e-05, "loss": 0.9838, "step": 197100 }, { "epoch": 6.024317223681798, "grad_norm": 16.308069229125977, "learning_rate": 1.992198017378534e-05, "loss": 1.3164, "step": 197200 }, { "epoch": 6.027372151280015, "grad_norm": 21.39978790283203, "learning_rate": 1.990668216864521e-05, "loss": 1.1362, "step": 197300 }, { "epoch": 6.030427078878231, 
"grad_norm": 5.836481094360352, "learning_rate": 1.989138416350508e-05, "loss": 1.2144, "step": 197400 }, { "epoch": 6.033482006476446, "grad_norm": 8.957596778869629, "learning_rate": 1.9876086158364952e-05, "loss": 1.0569, "step": 197500 }, { "epoch": 6.036536934074663, "grad_norm": 11.061244010925293, "learning_rate": 1.986078815322482e-05, "loss": 1.0122, "step": 197600 }, { "epoch": 6.0395918616728785, "grad_norm": 16.330211639404297, "learning_rate": 1.984549014808469e-05, "loss": 1.6281, "step": 197700 }, { "epoch": 6.042646789271094, "grad_norm": 5.61469030380249, "learning_rate": 1.9830192142944562e-05, "loss": 0.9955, "step": 197800 }, { "epoch": 6.04570171686931, "grad_norm": 4.7612786293029785, "learning_rate": 1.9814894137804433e-05, "loss": 0.8676, "step": 197900 }, { "epoch": 6.0487566444675265, "grad_norm": 17.290145874023438, "learning_rate": 1.97995961326643e-05, "loss": 1.308, "step": 198000 }, { "epoch": 6.051811572065742, "grad_norm": 2.9975016117095947, "learning_rate": 1.9784298127524172e-05, "loss": 1.0552, "step": 198100 }, { "epoch": 6.054866499663958, "grad_norm": 5.638766288757324, "learning_rate": 1.9769000122384043e-05, "loss": 1.4797, "step": 198200 }, { "epoch": 6.0579214272621735, "grad_norm": 7.252283096313477, "learning_rate": 1.9753702117243914e-05, "loss": 1.1544, "step": 198300 }, { "epoch": 6.06097635486039, "grad_norm": 12.537910461425781, "learning_rate": 1.9738404112103782e-05, "loss": 1.267, "step": 198400 }, { "epoch": 6.064031282458606, "grad_norm": 5.209641456604004, "learning_rate": 1.9723106106963653e-05, "loss": 1.2551, "step": 198500 }, { "epoch": 6.0670862100568215, "grad_norm": 5.152153968811035, "learning_rate": 1.9707808101823524e-05, "loss": 1.0689, "step": 198600 }, { "epoch": 6.070141137655037, "grad_norm": 21.221385955810547, "learning_rate": 1.9692510096683396e-05, "loss": 1.2884, "step": 198700 }, { "epoch": 6.073196065253254, "grad_norm": 14.44279956817627, "learning_rate": 1.9677212091543263e-05, "loss": 
1.0771, "step": 198800 }, { "epoch": 6.076250992851469, "grad_norm": 9.928838729858398, "learning_rate": 1.966191408640313e-05, "loss": 1.0506, "step": 198900 }, { "epoch": 6.079305920449685, "grad_norm": 2.891791582107544, "learning_rate": 1.9646616081263002e-05, "loss": 1.1673, "step": 199000 }, { "epoch": 6.082360848047902, "grad_norm": 11.18900203704834, "learning_rate": 1.9631318076122873e-05, "loss": 1.0406, "step": 199100 }, { "epoch": 6.085415775646117, "grad_norm": 6.0363993644714355, "learning_rate": 1.9616020070982745e-05, "loss": 1.1409, "step": 199200 }, { "epoch": 6.088470703244333, "grad_norm": 5.4089508056640625, "learning_rate": 1.9600722065842612e-05, "loss": 1.3594, "step": 199300 }, { "epoch": 6.091525630842549, "grad_norm": 5.459392070770264, "learning_rate": 1.9585424060702484e-05, "loss": 1.2248, "step": 199400 }, { "epoch": 6.094580558440765, "grad_norm": 2.2770681381225586, "learning_rate": 1.9570279035613755e-05, "loss": 1.1575, "step": 199500 }, { "epoch": 6.097635486038981, "grad_norm": 3.4792544841766357, "learning_rate": 1.9554981030473626e-05, "loss": 1.1068, "step": 199600 }, { "epoch": 6.100690413637197, "grad_norm": 0.7737553715705872, "learning_rate": 1.9539683025333497e-05, "loss": 1.0847, "step": 199700 }, { "epoch": 6.103745341235412, "grad_norm": 5.00324821472168, "learning_rate": 1.952438502019337e-05, "loss": 1.234, "step": 199800 }, { "epoch": 6.106800268833629, "grad_norm": 1.3894213438034058, "learning_rate": 1.9509087015053236e-05, "loss": 1.3646, "step": 199900 }, { "epoch": 6.109855196431845, "grad_norm": 6.558831214904785, "learning_rate": 1.9493789009913107e-05, "loss": 1.2715, "step": 200000 }, { "epoch": 6.11291012403006, "grad_norm": 4.3237762451171875, "learning_rate": 1.947849100477298e-05, "loss": 1.1757, "step": 200100 }, { "epoch": 6.115965051628276, "grad_norm": 24.602190017700195, "learning_rate": 1.946319299963285e-05, "loss": 1.1975, "step": 200200 }, { "epoch": 6.1190199792264925, "grad_norm": 
9.489306449890137, "learning_rate": 1.9447894994492717e-05, "loss": 1.2334, "step": 200300 }, { "epoch": 6.122074906824708, "grad_norm": 1.5155296325683594, "learning_rate": 1.943259698935259e-05, "loss": 1.2309, "step": 200400 }, { "epoch": 6.125129834422924, "grad_norm": 15.262107849121094, "learning_rate": 1.941729898421246e-05, "loss": 1.1219, "step": 200500 }, { "epoch": 6.1281847620211405, "grad_norm": 7.496032238006592, "learning_rate": 1.940200097907233e-05, "loss": 1.178, "step": 200600 }, { "epoch": 6.131239689619356, "grad_norm": 7.0843305587768555, "learning_rate": 1.93867029739322e-05, "loss": 1.1752, "step": 200700 }, { "epoch": 6.134294617217572, "grad_norm": 6.570627689361572, "learning_rate": 1.937140496879207e-05, "loss": 1.0997, "step": 200800 }, { "epoch": 6.1373495448157875, "grad_norm": 15.903374671936035, "learning_rate": 1.935610696365194e-05, "loss": 1.1207, "step": 200900 }, { "epoch": 6.140404472414004, "grad_norm": 5.491482257843018, "learning_rate": 1.9340808958511812e-05, "loss": 1.2633, "step": 201000 }, { "epoch": 6.14345940001222, "grad_norm": 9.441786766052246, "learning_rate": 1.932551095337168e-05, "loss": 0.9868, "step": 201100 }, { "epoch": 6.1465143276104355, "grad_norm": 10.245373725891113, "learning_rate": 1.931021294823155e-05, "loss": 1.2599, "step": 201200 }, { "epoch": 6.149569255208651, "grad_norm": 20.67123031616211, "learning_rate": 1.9294914943091422e-05, "loss": 1.1123, "step": 201300 }, { "epoch": 6.152624182806868, "grad_norm": 3.138038396835327, "learning_rate": 1.9279616937951293e-05, "loss": 1.113, "step": 201400 }, { "epoch": 6.155679110405083, "grad_norm": 3.0966243743896484, "learning_rate": 1.926431893281116e-05, "loss": 1.2311, "step": 201500 }, { "epoch": 6.158734038003299, "grad_norm": 6.163281440734863, "learning_rate": 1.9249020927671032e-05, "loss": 1.0435, "step": 201600 }, { "epoch": 6.161788965601515, "grad_norm": 5.311619758605957, "learning_rate": 1.9233722922530903e-05, "loss": 1.1483, "step": 
201700 }, { "epoch": 6.164843893199731, "grad_norm": 7.495104789733887, "learning_rate": 1.9218424917390775e-05, "loss": 1.0817, "step": 201800 }, { "epoch": 6.167898820797947, "grad_norm": 5.036028861999512, "learning_rate": 1.9203126912250642e-05, "loss": 1.147, "step": 201900 }, { "epoch": 6.170953748396163, "grad_norm": 11.376052856445312, "learning_rate": 1.9187828907110514e-05, "loss": 1.2904, "step": 202000 }, { "epoch": 6.174008675994379, "grad_norm": 2.396482467651367, "learning_rate": 1.9172530901970385e-05, "loss": 0.9409, "step": 202100 }, { "epoch": 6.177063603592595, "grad_norm": 12.605530738830566, "learning_rate": 1.9157232896830256e-05, "loss": 1.2427, "step": 202200 }, { "epoch": 6.180118531190811, "grad_norm": 5.409971714019775, "learning_rate": 1.9141934891690124e-05, "loss": 1.2063, "step": 202300 }, { "epoch": 6.183173458789026, "grad_norm": 11.71444034576416, "learning_rate": 1.9126636886549995e-05, "loss": 1.0652, "step": 202400 }, { "epoch": 6.186228386387243, "grad_norm": 6.157597064971924, "learning_rate": 1.9111338881409866e-05, "loss": 1.0756, "step": 202500 }, { "epoch": 6.189283313985459, "grad_norm": 12.192460060119629, "learning_rate": 1.9096040876269737e-05, "loss": 0.9874, "step": 202600 }, { "epoch": 6.192338241583674, "grad_norm": 10.153812408447266, "learning_rate": 1.9080742871129605e-05, "loss": 1.2897, "step": 202700 }, { "epoch": 6.19539316918189, "grad_norm": 7.526157855987549, "learning_rate": 1.9065444865989476e-05, "loss": 1.1976, "step": 202800 }, { "epoch": 6.1984480967801066, "grad_norm": 6.983723163604736, "learning_rate": 1.9050146860849347e-05, "loss": 1.3308, "step": 202900 }, { "epoch": 6.201503024378322, "grad_norm": 5.48733377456665, "learning_rate": 1.9034848855709218e-05, "loss": 1.0038, "step": 203000 }, { "epoch": 6.204557951976538, "grad_norm": 4.00633430480957, "learning_rate": 1.9019550850569086e-05, "loss": 0.9692, "step": 203100 }, { "epoch": 6.2076128795747545, "grad_norm": 7.790951728820801, 
"learning_rate": 1.9004252845428957e-05, "loss": 1.3422, "step": 203200 }, { "epoch": 6.21066780717297, "grad_norm": 4.072429656982422, "learning_rate": 1.898895484028883e-05, "loss": 1.322, "step": 203300 }, { "epoch": 6.213722734771186, "grad_norm": 6.779109001159668, "learning_rate": 1.89736568351487e-05, "loss": 0.9392, "step": 203400 }, { "epoch": 6.2167776623694015, "grad_norm": 1.8601343631744385, "learning_rate": 1.895851181005997e-05, "loss": 1.1972, "step": 203500 }, { "epoch": 6.219832589967618, "grad_norm": 4.4953508377075195, "learning_rate": 1.894336678497124e-05, "loss": 1.2135, "step": 203600 }, { "epoch": 6.222887517565834, "grad_norm": 4.9885735511779785, "learning_rate": 1.892806877983111e-05, "loss": 1.2205, "step": 203700 }, { "epoch": 6.2259424451640495, "grad_norm": 2.344512701034546, "learning_rate": 1.891277077469098e-05, "loss": 1.0766, "step": 203800 }, { "epoch": 6.228997372762265, "grad_norm": 26.173669815063477, "learning_rate": 1.889747276955085e-05, "loss": 1.1281, "step": 203900 }, { "epoch": 6.232052300360482, "grad_norm": 7.559381484985352, "learning_rate": 1.888217476441072e-05, "loss": 1.1236, "step": 204000 }, { "epoch": 6.235107227958697, "grad_norm": 9.910603523254395, "learning_rate": 1.886687675927059e-05, "loss": 1.3141, "step": 204100 }, { "epoch": 6.238162155556913, "grad_norm": 2.8729636669158936, "learning_rate": 1.8851578754130462e-05, "loss": 1.1617, "step": 204200 }, { "epoch": 6.241217083155129, "grad_norm": 4.2151055335998535, "learning_rate": 1.883628074899033e-05, "loss": 1.1621, "step": 204300 }, { "epoch": 6.244272010753345, "grad_norm": 5.4340410232543945, "learning_rate": 1.88209827438502e-05, "loss": 1.1694, "step": 204400 }, { "epoch": 6.247326938351561, "grad_norm": 5.660393238067627, "learning_rate": 1.8805684738710072e-05, "loss": 1.1854, "step": 204500 }, { "epoch": 6.250381865949777, "grad_norm": 4.4400177001953125, "learning_rate": 1.8790386733569944e-05, "loss": 1.2309, "step": 204600 }, { "epoch": 
6.253436793547993, "grad_norm": 6.119187831878662, "learning_rate": 1.877508872842981e-05, "loss": 1.4101, "step": 204700 }, { "epoch": 6.256491721146209, "grad_norm": 8.3779296875, "learning_rate": 1.8759790723289683e-05, "loss": 1.4293, "step": 204800 }, { "epoch": 6.259546648744425, "grad_norm": 13.137105941772461, "learning_rate": 1.8744492718149554e-05, "loss": 1.0669, "step": 204900 }, { "epoch": 6.26260157634264, "grad_norm": 10.320351600646973, "learning_rate": 1.8729194713009425e-05, "loss": 1.1582, "step": 205000 }, { "epoch": 6.265656503940857, "grad_norm": 5.954596996307373, "learning_rate": 1.8713896707869293e-05, "loss": 1.1082, "step": 205100 }, { "epoch": 6.268711431539073, "grad_norm": 8.020731925964355, "learning_rate": 1.8698598702729164e-05, "loss": 1.0602, "step": 205200 }, { "epoch": 6.271766359137288, "grad_norm": 20.765810012817383, "learning_rate": 1.8683300697589035e-05, "loss": 0.9351, "step": 205300 }, { "epoch": 6.274821286735504, "grad_norm": 5.073605537414551, "learning_rate": 1.8668002692448906e-05, "loss": 1.0308, "step": 205400 }, { "epoch": 6.277876214333721, "grad_norm": 7.651671886444092, "learning_rate": 1.8652704687308774e-05, "loss": 1.17, "step": 205500 }, { "epoch": 6.280931141931936, "grad_norm": 5.585090160369873, "learning_rate": 1.8637406682168645e-05, "loss": 1.0851, "step": 205600 }, { "epoch": 6.283986069530152, "grad_norm": 6.514379024505615, "learning_rate": 1.8622108677028516e-05, "loss": 1.1694, "step": 205700 }, { "epoch": 6.2870409971283685, "grad_norm": 15.221776962280273, "learning_rate": 1.8606810671888387e-05, "loss": 1.1695, "step": 205800 }, { "epoch": 6.290095924726584, "grad_norm": 2.7496187686920166, "learning_rate": 1.8591512666748255e-05, "loss": 1.2547, "step": 205900 }, { "epoch": 6.2931508523248, "grad_norm": 21.13014030456543, "learning_rate": 1.8576214661608126e-05, "loss": 1.0187, "step": 206000 }, { "epoch": 6.2962057799230156, "grad_norm": 4.162718296051025, "learning_rate": 
1.8560916656467997e-05, "loss": 1.0812, "step": 206100 }, { "epoch": 6.299260707521232, "grad_norm": 13.786234855651855, "learning_rate": 1.854561865132787e-05, "loss": 1.5332, "step": 206200 }, { "epoch": 6.302315635119448, "grad_norm": 19.818574905395508, "learning_rate": 1.8530320646187736e-05, "loss": 1.0737, "step": 206300 }, { "epoch": 6.3053705627176635, "grad_norm": 10.488554954528809, "learning_rate": 1.8515022641047608e-05, "loss": 1.069, "step": 206400 }, { "epoch": 6.308425490315879, "grad_norm": 13.240741729736328, "learning_rate": 1.849972463590748e-05, "loss": 0.979, "step": 206500 }, { "epoch": 6.311480417914096, "grad_norm": 15.739484786987305, "learning_rate": 1.848457961081875e-05, "loss": 1.2026, "step": 206600 }, { "epoch": 6.314535345512311, "grad_norm": 8.837204933166504, "learning_rate": 1.846928160567862e-05, "loss": 1.3493, "step": 206700 }, { "epoch": 6.317590273110527, "grad_norm": 4.217745304107666, "learning_rate": 1.8453983600538492e-05, "loss": 1.2004, "step": 206800 }, { "epoch": 6.320645200708743, "grad_norm": 13.267827987670898, "learning_rate": 1.843868559539836e-05, "loss": 1.0603, "step": 206900 }, { "epoch": 6.323700128306959, "grad_norm": 1.135472059249878, "learning_rate": 1.842338759025823e-05, "loss": 1.1188, "step": 207000 }, { "epoch": 6.326755055905175, "grad_norm": 5.91626501083374, "learning_rate": 1.8408089585118102e-05, "loss": 1.2033, "step": 207100 }, { "epoch": 6.329809983503391, "grad_norm": 1.7794185876846313, "learning_rate": 1.8392791579977974e-05, "loss": 0.9587, "step": 207200 }, { "epoch": 6.332864911101607, "grad_norm": 16.53619956970215, "learning_rate": 1.837749357483784e-05, "loss": 1.1301, "step": 207300 }, { "epoch": 6.335919838699823, "grad_norm": 7.456695556640625, "learning_rate": 1.8362195569697713e-05, "loss": 0.9313, "step": 207400 }, { "epoch": 6.338974766298039, "grad_norm": 1.256122350692749, "learning_rate": 1.8346897564557584e-05, "loss": 1.3876, "step": 207500 }, { "epoch": 
6.342029693896254, "grad_norm": 5.864501476287842, "learning_rate": 1.8331599559417455e-05, "loss": 0.9638, "step": 207600 }, { "epoch": 6.345084621494471, "grad_norm": 5.211721420288086, "learning_rate": 1.8316301554277323e-05, "loss": 1.2297, "step": 207700 }, { "epoch": 6.348139549092687, "grad_norm": 8.266345977783203, "learning_rate": 1.8301003549137194e-05, "loss": 1.1224, "step": 207800 }, { "epoch": 6.351194476690902, "grad_norm": 7.6695556640625, "learning_rate": 1.8285705543997065e-05, "loss": 1.2214, "step": 207900 }, { "epoch": 6.354249404289118, "grad_norm": 4.3584184646606445, "learning_rate": 1.8270407538856936e-05, "loss": 1.1276, "step": 208000 }, { "epoch": 6.357304331887335, "grad_norm": 2.5651164054870605, "learning_rate": 1.8255109533716804e-05, "loss": 0.9946, "step": 208100 }, { "epoch": 6.36035925948555, "grad_norm": 6.8474602699279785, "learning_rate": 1.8239811528576675e-05, "loss": 1.2859, "step": 208200 }, { "epoch": 6.363414187083766, "grad_norm": 5.764113426208496, "learning_rate": 1.8224513523436546e-05, "loss": 1.1951, "step": 208300 }, { "epoch": 6.366469114681982, "grad_norm": 3.486423969268799, "learning_rate": 1.8209215518296417e-05, "loss": 1.5135, "step": 208400 }, { "epoch": 6.369524042280198, "grad_norm": 6.736783027648926, "learning_rate": 1.8193917513156285e-05, "loss": 0.8972, "step": 208500 }, { "epoch": 6.372578969878414, "grad_norm": 7.037610054016113, "learning_rate": 1.8178619508016156e-05, "loss": 1.3713, "step": 208600 }, { "epoch": 6.37563389747663, "grad_norm": 4.605196475982666, "learning_rate": 1.8163321502876027e-05, "loss": 1.1864, "step": 208700 }, { "epoch": 6.378688825074846, "grad_norm": 11.924447059631348, "learning_rate": 1.81480234977359e-05, "loss": 1.1253, "step": 208800 }, { "epoch": 6.381743752673062, "grad_norm": 7.568887710571289, "learning_rate": 1.8132725492595766e-05, "loss": 1.0449, "step": 208900 }, { "epoch": 6.3847986802712775, "grad_norm": 4.9831671714782715, "learning_rate": 
1.8117427487455637e-05, "loss": 1.2972, "step": 209000 }, { "epoch": 6.387853607869493, "grad_norm": 6.626806735992432, "learning_rate": 1.810212948231551e-05, "loss": 1.2032, "step": 209100 }, { "epoch": 6.39090853546771, "grad_norm": 4.027082920074463, "learning_rate": 1.808683147717538e-05, "loss": 1.1557, "step": 209200 }, { "epoch": 6.3939634630659254, "grad_norm": 3.5578677654266357, "learning_rate": 1.8071533472035248e-05, "loss": 1.2836, "step": 209300 }, { "epoch": 6.397018390664141, "grad_norm": 6.211548805236816, "learning_rate": 1.805623546689512e-05, "loss": 1.1505, "step": 209400 }, { "epoch": 6.400073318262357, "grad_norm": 21.222150802612305, "learning_rate": 1.804093746175499e-05, "loss": 1.266, "step": 209500 }, { "epoch": 6.403128245860573, "grad_norm": 4.08786153793335, "learning_rate": 1.802563945661486e-05, "loss": 1.0255, "step": 209600 }, { "epoch": 6.406183173458789, "grad_norm": 34.38716125488281, "learning_rate": 1.801034145147473e-05, "loss": 1.2649, "step": 209700 }, { "epoch": 6.409238101057005, "grad_norm": 5.348664283752441, "learning_rate": 1.79950434463346e-05, "loss": 1.1752, "step": 209800 }, { "epoch": 6.41229302865522, "grad_norm": 2.724579334259033, "learning_rate": 1.797974544119447e-05, "loss": 1.1778, "step": 209900 }, { "epoch": 6.415347956253437, "grad_norm": 4.925905227661133, "learning_rate": 1.796444743605434e-05, "loss": 1.2821, "step": 210000 }, { "epoch": 6.418402883851653, "grad_norm": 13.180278778076172, "learning_rate": 1.7949149430914207e-05, "loss": 1.3623, "step": 210100 }, { "epoch": 6.421457811449868, "grad_norm": 25.796751022338867, "learning_rate": 1.7933851425774078e-05, "loss": 1.2821, "step": 210200 }, { "epoch": 6.424512739048085, "grad_norm": 3.784736394882202, "learning_rate": 1.791855342063395e-05, "loss": 1.0298, "step": 210300 }, { "epoch": 6.427567666646301, "grad_norm": 7.122948169708252, "learning_rate": 1.790325541549382e-05, "loss": 1.1683, "step": 210400 }, { "epoch": 6.430622594244516, 
"grad_norm": 7.788839817047119, "learning_rate": 1.7887957410353688e-05, "loss": 1.0356, "step": 210500 }, { "epoch": 6.433677521842732, "grad_norm": 4.615478992462158, "learning_rate": 1.7872812385264963e-05, "loss": 1.326, "step": 210600 }, { "epoch": 6.436732449440949, "grad_norm": 18.982412338256836, "learning_rate": 1.785751438012483e-05, "loss": 0.952, "step": 210700 }, { "epoch": 6.439787377039164, "grad_norm": 5.564967155456543, "learning_rate": 1.78422163749847e-05, "loss": 1.3681, "step": 210800 }, { "epoch": 6.44284230463738, "grad_norm": 5.302816867828369, "learning_rate": 1.7826918369844573e-05, "loss": 1.0967, "step": 210900 }, { "epoch": 6.445897232235596, "grad_norm": 9.340934753417969, "learning_rate": 1.7811773344755844e-05, "loss": 1.2459, "step": 211000 }, { "epoch": 6.448952159833812, "grad_norm": 5.11415958404541, "learning_rate": 1.7796475339615715e-05, "loss": 0.9813, "step": 211100 }, { "epoch": 6.452007087432028, "grad_norm": 2.0551705360412598, "learning_rate": 1.7781177334475586e-05, "loss": 1.0639, "step": 211200 }, { "epoch": 6.455062015030244, "grad_norm": 8.070032119750977, "learning_rate": 1.7765879329335454e-05, "loss": 1.0419, "step": 211300 }, { "epoch": 6.458116942628459, "grad_norm": 7.958362579345703, "learning_rate": 1.7750581324195325e-05, "loss": 0.9593, "step": 211400 }, { "epoch": 6.461171870226676, "grad_norm": 5.6488800048828125, "learning_rate": 1.7735283319055196e-05, "loss": 1.063, "step": 211500 }, { "epoch": 6.4642267978248915, "grad_norm": 7.8869242668151855, "learning_rate": 1.7719985313915068e-05, "loss": 1.3276, "step": 211600 }, { "epoch": 6.467281725423107, "grad_norm": 5.1599016189575195, "learning_rate": 1.7704687308774935e-05, "loss": 0.8885, "step": 211700 }, { "epoch": 6.470336653021324, "grad_norm": 5.28355598449707, "learning_rate": 1.7689389303634807e-05, "loss": 1.3002, "step": 211800 }, { "epoch": 6.4733915806195395, "grad_norm": 11.972784042358398, "learning_rate": 1.7674091298494678e-05, "loss": 
1.0144, "step": 211900 }, { "epoch": 6.476446508217755, "grad_norm": 3.3315412998199463, "learning_rate": 1.765879329335455e-05, "loss": 0.9171, "step": 212000 }, { "epoch": 6.479501435815971, "grad_norm": 6.229414463043213, "learning_rate": 1.7643495288214417e-05, "loss": 1.2993, "step": 212100 }, { "epoch": 6.482556363414187, "grad_norm": 11.299790382385254, "learning_rate": 1.7628197283074288e-05, "loss": 1.0803, "step": 212200 }, { "epoch": 6.485611291012403, "grad_norm": 8.365095138549805, "learning_rate": 1.761289927793416e-05, "loss": 1.323, "step": 212300 }, { "epoch": 6.488666218610619, "grad_norm": 11.41671371459961, "learning_rate": 1.759760127279403e-05, "loss": 1.1004, "step": 212400 }, { "epoch": 6.4917211462088344, "grad_norm": 3.927541971206665, "learning_rate": 1.7582303267653898e-05, "loss": 1.0298, "step": 212500 }, { "epoch": 6.494776073807051, "grad_norm": 5.735844135284424, "learning_rate": 1.756700526251377e-05, "loss": 1.2621, "step": 212600 }, { "epoch": 6.497831001405267, "grad_norm": 5.7011284828186035, "learning_rate": 1.755170725737364e-05, "loss": 1.1243, "step": 212700 }, { "epoch": 6.500885929003482, "grad_norm": 9.130974769592285, "learning_rate": 1.753640925223351e-05, "loss": 1.0719, "step": 212800 }, { "epoch": 6.503940856601698, "grad_norm": 7.565042018890381, "learning_rate": 1.752111124709338e-05, "loss": 0.9947, "step": 212900 }, { "epoch": 6.506995784199915, "grad_norm": 3.490960121154785, "learning_rate": 1.750581324195325e-05, "loss": 1.0295, "step": 213000 }, { "epoch": 6.51005071179813, "grad_norm": 5.046875953674316, "learning_rate": 1.749051523681312e-05, "loss": 1.0738, "step": 213100 }, { "epoch": 6.513105639396346, "grad_norm": 2.79036808013916, "learning_rate": 1.7475217231672993e-05, "loss": 1.3818, "step": 213200 }, { "epoch": 6.516160566994563, "grad_norm": 0.009335226379334927, "learning_rate": 1.7460072206584264e-05, "loss": 1.1112, "step": 213300 }, { "epoch": 6.519215494592778, "grad_norm": 
6.8894243240356445, "learning_rate": 1.7444774201444135e-05, "loss": 1.1441, "step": 213400 }, { "epoch": 6.522270422190994, "grad_norm": 4.209587574005127, "learning_rate": 1.7429476196304003e-05, "loss": 1.0272, "step": 213500 }, { "epoch": 6.52532534978921, "grad_norm": 2.185170888900757, "learning_rate": 1.7414178191163874e-05, "loss": 1.0875, "step": 213600 }, { "epoch": 6.528380277387426, "grad_norm": 14.174199104309082, "learning_rate": 1.7398880186023745e-05, "loss": 1.089, "step": 213700 }, { "epoch": 6.531435204985642, "grad_norm": 9.62555980682373, "learning_rate": 1.7383582180883616e-05, "loss": 1.1095, "step": 213800 }, { "epoch": 6.534490132583858, "grad_norm": 5.337947368621826, "learning_rate": 1.7368284175743484e-05, "loss": 0.9957, "step": 213900 }, { "epoch": 6.537545060182074, "grad_norm": 4.797677993774414, "learning_rate": 1.7352986170603355e-05, "loss": 1.2343, "step": 214000 }, { "epoch": 6.54059998778029, "grad_norm": 7.560909271240234, "learning_rate": 1.7337688165463226e-05, "loss": 1.1702, "step": 214100 }, { "epoch": 6.5436549153785055, "grad_norm": 3.0463943481445312, "learning_rate": 1.7322390160323098e-05, "loss": 1.2553, "step": 214200 }, { "epoch": 6.546709842976721, "grad_norm": 6.867596626281738, "learning_rate": 1.7307092155182965e-05, "loss": 1.0156, "step": 214300 }, { "epoch": 6.549764770574937, "grad_norm": 7.988159656524658, "learning_rate": 1.7291794150042837e-05, "loss": 1.2007, "step": 214400 }, { "epoch": 6.5528196981731535, "grad_norm": 4.5299482345581055, "learning_rate": 1.7276496144902708e-05, "loss": 1.0585, "step": 214500 }, { "epoch": 6.555874625771369, "grad_norm": 17.113630294799805, "learning_rate": 1.726119813976258e-05, "loss": 1.226, "step": 214600 }, { "epoch": 6.558929553369585, "grad_norm": 6.469338417053223, "learning_rate": 1.7245900134622443e-05, "loss": 1.5139, "step": 214700 }, { "epoch": 6.561984480967801, "grad_norm": 9.526185989379883, "learning_rate": 1.7230602129482314e-05, "loss": 0.9568, 
"step": 214800 }, { "epoch": 6.565039408566017, "grad_norm": 4.849943161010742, "learning_rate": 1.7215304124342186e-05, "loss": 1.1666, "step": 214900 }, { "epoch": 6.568094336164233, "grad_norm": 4.016180515289307, "learning_rate": 1.7200006119202057e-05, "loss": 1.0448, "step": 215000 }, { "epoch": 6.5711492637624485, "grad_norm": 3.0566747188568115, "learning_rate": 1.7184708114061924e-05, "loss": 1.2607, "step": 215100 }, { "epoch": 6.574204191360665, "grad_norm": 13.017189025878906, "learning_rate": 1.7169410108921796e-05, "loss": 1.2375, "step": 215200 }, { "epoch": 6.577259118958881, "grad_norm": 3.1500213146209717, "learning_rate": 1.7154112103781667e-05, "loss": 1.1126, "step": 215300 }, { "epoch": 6.580314046557096, "grad_norm": 15.420360565185547, "learning_rate": 1.7138814098641538e-05, "loss": 1.0861, "step": 215400 }, { "epoch": 6.583368974155313, "grad_norm": 15.716479301452637, "learning_rate": 1.712366907355281e-05, "loss": 1.357, "step": 215500 }, { "epoch": 6.586423901753529, "grad_norm": 4.730207443237305, "learning_rate": 1.710837106841268e-05, "loss": 1.0504, "step": 215600 }, { "epoch": 6.589478829351744, "grad_norm": 5.073116779327393, "learning_rate": 1.7093073063272548e-05, "loss": 1.3092, "step": 215700 }, { "epoch": 6.59253375694996, "grad_norm": 7.078702449798584, "learning_rate": 1.707777505813242e-05, "loss": 1.2368, "step": 215800 }, { "epoch": 6.595588684548177, "grad_norm": 6.552230358123779, "learning_rate": 1.706247705299229e-05, "loss": 1.1605, "step": 215900 }, { "epoch": 6.598643612146392, "grad_norm": 12.837909698486328, "learning_rate": 1.704717904785216e-05, "loss": 1.5417, "step": 216000 }, { "epoch": 6.601698539744608, "grad_norm": 3.1361231803894043, "learning_rate": 1.703188104271203e-05, "loss": 1.187, "step": 216100 }, { "epoch": 6.604753467342824, "grad_norm": 3.2314138412475586, "learning_rate": 1.70165830375719e-05, "loss": 1.1333, "step": 216200 }, { "epoch": 6.60780839494104, "grad_norm": 7.735902309417725, 
"learning_rate": 1.7001285032431772e-05, "loss": 1.2677, "step": 216300 }, { "epoch": 6.610863322539256, "grad_norm": 4.2524189949035645, "learning_rate": 1.6985987027291643e-05, "loss": 1.195, "step": 216400 }, { "epoch": 6.613918250137472, "grad_norm": 5.863406658172607, "learning_rate": 1.697068902215151e-05, "loss": 1.0911, "step": 216500 }, { "epoch": 6.616973177735687, "grad_norm": 6.36102294921875, "learning_rate": 1.6955391017011382e-05, "loss": 1.036, "step": 216600 }, { "epoch": 6.620028105333904, "grad_norm": 3.2900092601776123, "learning_rate": 1.6940093011871253e-05, "loss": 1.2053, "step": 216700 }, { "epoch": 6.6230830329321195, "grad_norm": 0.4474376142024994, "learning_rate": 1.6924795006731124e-05, "loss": 1.1469, "step": 216800 }, { "epoch": 6.626137960530335, "grad_norm": 22.14536476135254, "learning_rate": 1.6909497001590992e-05, "loss": 1.2501, "step": 216900 }, { "epoch": 6.629192888128552, "grad_norm": 3.733264207839966, "learning_rate": 1.6894198996450863e-05, "loss": 1.0177, "step": 217000 }, { "epoch": 6.6322478157267675, "grad_norm": 3.5817043781280518, "learning_rate": 1.6878900991310734e-05, "loss": 1.3182, "step": 217100 }, { "epoch": 6.635302743324983, "grad_norm": 5.059553146362305, "learning_rate": 1.6863602986170605e-05, "loss": 1.3476, "step": 217200 }, { "epoch": 6.638357670923199, "grad_norm": 3.9976773262023926, "learning_rate": 1.6848304981030473e-05, "loss": 1.0562, "step": 217300 }, { "epoch": 6.641412598521415, "grad_norm": 16.386789321899414, "learning_rate": 1.6833006975890344e-05, "loss": 1.0905, "step": 217400 }, { "epoch": 6.644467526119631, "grad_norm": 4.764182090759277, "learning_rate": 1.6817708970750215e-05, "loss": 1.1609, "step": 217500 }, { "epoch": 6.647522453717847, "grad_norm": 3.44966721534729, "learning_rate": 1.6802410965610087e-05, "loss": 1.0737, "step": 217600 }, { "epoch": 6.6505773813160625, "grad_norm": 3.3409371376037598, "learning_rate": 1.6787112960469954e-05, "loss": 1.084, "step": 217700 }, { 
"epoch": 6.653632308914279, "grad_norm": 9.89992904663086, "learning_rate": 1.677196793538123e-05, "loss": 1.0378, "step": 217800 }, { "epoch": 6.656687236512495, "grad_norm": 6.71497106552124, "learning_rate": 1.6756669930241097e-05, "loss": 1.1315, "step": 217900 }, { "epoch": 6.65974216411071, "grad_norm": 3.8522651195526123, "learning_rate": 1.6741371925100968e-05, "loss": 1.0395, "step": 218000 }, { "epoch": 6.662797091708926, "grad_norm": 7.571756362915039, "learning_rate": 1.672607391996084e-05, "loss": 1.1587, "step": 218100 }, { "epoch": 6.665852019307143, "grad_norm": 8.427057266235352, "learning_rate": 1.671077591482071e-05, "loss": 1.1139, "step": 218200 }, { "epoch": 6.668906946905358, "grad_norm": 4.802609443664551, "learning_rate": 1.6695477909680578e-05, "loss": 1.0882, "step": 218300 }, { "epoch": 6.671961874503574, "grad_norm": 5.4690961837768555, "learning_rate": 1.668017990454045e-05, "loss": 1.0002, "step": 218400 }, { "epoch": 6.675016802101791, "grad_norm": 13.944718360900879, "learning_rate": 1.666488189940032e-05, "loss": 1.2559, "step": 218500 }, { "epoch": 6.678071729700006, "grad_norm": 4.6269989013671875, "learning_rate": 1.664958389426019e-05, "loss": 1.0556, "step": 218600 }, { "epoch": 6.681126657298222, "grad_norm": 10.070877075195312, "learning_rate": 1.663428588912006e-05, "loss": 1.1925, "step": 218700 }, { "epoch": 6.684181584896438, "grad_norm": 8.7732515335083, "learning_rate": 1.661898788397993e-05, "loss": 0.9621, "step": 218800 }, { "epoch": 6.687236512494654, "grad_norm": 8.3976411819458, "learning_rate": 1.66036898788398e-05, "loss": 0.9891, "step": 218900 }, { "epoch": 6.69029144009287, "grad_norm": 5.672944068908691, "learning_rate": 1.6588391873699673e-05, "loss": 1.0646, "step": 219000 }, { "epoch": 6.693346367691086, "grad_norm": 3.278470754623413, "learning_rate": 1.657309386855954e-05, "loss": 1.0843, "step": 219100 }, { "epoch": 6.696401295289301, "grad_norm": 4.4352641105651855, "learning_rate": 
1.6557795863419412e-05, "loss": 1.1092, "step": 219200 }, { "epoch": 6.699456222887518, "grad_norm": 6.883086204528809, "learning_rate": 1.6542497858279283e-05, "loss": 1.1694, "step": 219300 }, { "epoch": 6.7025111504857335, "grad_norm": 5.87219762802124, "learning_rate": 1.6527199853139154e-05, "loss": 1.1636, "step": 219400 }, { "epoch": 6.705566078083949, "grad_norm": 9.403863906860352, "learning_rate": 1.6511901847999022e-05, "loss": 1.0704, "step": 219500 }, { "epoch": 6.708621005682165, "grad_norm": 5.002470970153809, "learning_rate": 1.6496603842858893e-05, "loss": 1.1899, "step": 219600 }, { "epoch": 6.7116759332803815, "grad_norm": 0.014414245262742043, "learning_rate": 1.6481305837718764e-05, "loss": 1.2799, "step": 219700 }, { "epoch": 6.714730860878597, "grad_norm": 6.7424187660217285, "learning_rate": 1.6466007832578635e-05, "loss": 1.0311, "step": 219800 }, { "epoch": 6.717785788476813, "grad_norm": 6.960201263427734, "learning_rate": 1.6450709827438503e-05, "loss": 0.9706, "step": 219900 }, { "epoch": 6.720840716075029, "grad_norm": 4.6439290046691895, "learning_rate": 1.6435411822298374e-05, "loss": 1.1177, "step": 220000 }, { "epoch": 6.723895643673245, "grad_norm": 3.3932769298553467, "learning_rate": 1.6420113817158245e-05, "loss": 1.2257, "step": 220100 }, { "epoch": 6.726950571271461, "grad_norm": 4.096286296844482, "learning_rate": 1.6404815812018113e-05, "loss": 1.0774, "step": 220200 }, { "epoch": 6.7300054988696765, "grad_norm": 5.358587741851807, "learning_rate": 1.6389670786929385e-05, "loss": 0.8585, "step": 220300 }, { "epoch": 6.733060426467893, "grad_norm": 4.067366600036621, "learning_rate": 1.6374372781789256e-05, "loss": 1.0609, "step": 220400 }, { "epoch": 6.736115354066109, "grad_norm": 5.973146438598633, "learning_rate": 1.6359074776649123e-05, "loss": 1.1005, "step": 220500 }, { "epoch": 6.739170281664324, "grad_norm": 8.303258895874023, "learning_rate": 1.6343776771508995e-05, "loss": 1.1691, "step": 220600 }, { "epoch": 
6.74222520926254, "grad_norm": 12.337789535522461, "learning_rate": 1.6328478766368866e-05, "loss": 1.146, "step": 220700 }, { "epoch": 6.745280136860757, "grad_norm": 12.040741920471191, "learning_rate": 1.6313180761228737e-05, "loss": 1.5116, "step": 220800 }, { "epoch": 6.748335064458972, "grad_norm": 8.398398399353027, "learning_rate": 1.6297882756088605e-05, "loss": 1.2403, "step": 220900 }, { "epoch": 6.751389992057188, "grad_norm": 6.159371852874756, "learning_rate": 1.6282584750948476e-05, "loss": 1.169, "step": 221000 }, { "epoch": 6.754444919655404, "grad_norm": 4.961534023284912, "learning_rate": 1.6267286745808347e-05, "loss": 1.2808, "step": 221100 }, { "epoch": 6.75749984725362, "grad_norm": 2.4521520137786865, "learning_rate": 1.6251988740668218e-05, "loss": 1.0539, "step": 221200 }, { "epoch": 6.760554774851836, "grad_norm": 4.656429290771484, "learning_rate": 1.6236690735528086e-05, "loss": 0.9846, "step": 221300 }, { "epoch": 6.763609702450052, "grad_norm": 4.705031394958496, "learning_rate": 1.6221392730387957e-05, "loss": 1.0849, "step": 221400 }, { "epoch": 6.766664630048268, "grad_norm": 4.43703556060791, "learning_rate": 1.6206094725247828e-05, "loss": 1.0477, "step": 221500 }, { "epoch": 6.769719557646484, "grad_norm": 8.725431442260742, "learning_rate": 1.61907967201077e-05, "loss": 1.0679, "step": 221600 }, { "epoch": 6.7727744852447, "grad_norm": 8.772459030151367, "learning_rate": 1.6175498714967567e-05, "loss": 1.2964, "step": 221700 }, { "epoch": 6.775829412842915, "grad_norm": 5.187252044677734, "learning_rate": 1.616020070982744e-05, "loss": 1.1037, "step": 221800 }, { "epoch": 6.778884340441132, "grad_norm": 6.5184526443481445, "learning_rate": 1.614490270468731e-05, "loss": 1.0777, "step": 221900 }, { "epoch": 6.7819392680393475, "grad_norm": 8.145726203918457, "learning_rate": 1.612960469954718e-05, "loss": 1.1475, "step": 222000 }, { "epoch": 6.784994195637563, "grad_norm": 3.8500397205352783, "learning_rate": 
1.611430669440705e-05, "loss": 1.0347, "step": 222100 }, { "epoch": 6.78804912323578, "grad_norm": 5.5146894454956055, "learning_rate": 1.609900868926692e-05, "loss": 1.2915, "step": 222200 }, { "epoch": 6.7911040508339955, "grad_norm": 6.543974876403809, "learning_rate": 1.608371068412679e-05, "loss": 1.2405, "step": 222300 }, { "epoch": 6.794158978432211, "grad_norm": 4.565728664398193, "learning_rate": 1.6068412678986662e-05, "loss": 1.3069, "step": 222400 }, { "epoch": 6.797213906030427, "grad_norm": 2.4047398567199707, "learning_rate": 1.605311467384653e-05, "loss": 1.1599, "step": 222500 }, { "epoch": 6.8002688336286425, "grad_norm": 5.106794357299805, "learning_rate": 1.60378166687064e-05, "loss": 1.1013, "step": 222600 }, { "epoch": 6.803323761226859, "grad_norm": 6.793829917907715, "learning_rate": 1.6022518663566272e-05, "loss": 1.2487, "step": 222700 }, { "epoch": 6.806378688825075, "grad_norm": 7.5256452560424805, "learning_rate": 1.6007220658426143e-05, "loss": 1.1052, "step": 222800 }, { "epoch": 6.8094336164232905, "grad_norm": 27.9412841796875, "learning_rate": 1.599192265328601e-05, "loss": 1.1104, "step": 222900 }, { "epoch": 6.812488544021507, "grad_norm": 9.895366668701172, "learning_rate": 1.5976624648145882e-05, "loss": 1.3076, "step": 223000 }, { "epoch": 6.815543471619723, "grad_norm": 3.188260555267334, "learning_rate": 1.5961326643005753e-05, "loss": 0.9702, "step": 223100 }, { "epoch": 6.818598399217938, "grad_norm": 3.186352252960205, "learning_rate": 1.5946028637865624e-05, "loss": 1.1281, "step": 223200 }, { "epoch": 6.821653326816154, "grad_norm": 1.098803997039795, "learning_rate": 1.5930730632725492e-05, "loss": 1.1198, "step": 223300 }, { "epoch": 6.824708254414371, "grad_norm": 3.8436925411224365, "learning_rate": 1.5915432627585363e-05, "loss": 1.2168, "step": 223400 }, { "epoch": 6.827763182012586, "grad_norm": 8.489556312561035, "learning_rate": 1.5900134622445234e-05, "loss": 1.2194, "step": 223500 }, { "epoch": 
6.830818109610802, "grad_norm": 4.756037712097168, "learning_rate": 1.5884836617305106e-05, "loss": 1.2152, "step": 223600 }, { "epoch": 6.833873037209019, "grad_norm": 3.0809168815612793, "learning_rate": 1.5869538612164973e-05, "loss": 0.9161, "step": 223700 }, { "epoch": 6.836927964807234, "grad_norm": 5.461037635803223, "learning_rate": 1.5854240607024845e-05, "loss": 1.042, "step": 223800 }, { "epoch": 6.83998289240545, "grad_norm": 13.365588188171387, "learning_rate": 1.5838942601884716e-05, "loss": 1.1589, "step": 223900 }, { "epoch": 6.843037820003666, "grad_norm": 5.828542709350586, "learning_rate": 1.5823644596744587e-05, "loss": 1.2168, "step": 224000 }, { "epoch": 6.846092747601881, "grad_norm": 7.322714805603027, "learning_rate": 1.5808346591604455e-05, "loss": 1.035, "step": 224100 }, { "epoch": 6.849147675200098, "grad_norm": 2.8676154613494873, "learning_rate": 1.579320156651573e-05, "loss": 1.1801, "step": 224200 }, { "epoch": 6.852202602798314, "grad_norm": 2.7156238555908203, "learning_rate": 1.5777903561375597e-05, "loss": 1.0304, "step": 224300 }, { "epoch": 6.855257530396529, "grad_norm": 11.405207633972168, "learning_rate": 1.5762605556235468e-05, "loss": 1.239, "step": 224400 }, { "epoch": 6.858312457994746, "grad_norm": 12.10234260559082, "learning_rate": 1.574730755109534e-05, "loss": 1.3059, "step": 224500 }, { "epoch": 6.8613673855929616, "grad_norm": 35.208656311035156, "learning_rate": 1.573200954595521e-05, "loss": 1.1348, "step": 224600 }, { "epoch": 6.864422313191177, "grad_norm": 6.893289089202881, "learning_rate": 1.571671154081508e-05, "loss": 1.1835, "step": 224700 }, { "epoch": 6.867477240789393, "grad_norm": 4.095139980316162, "learning_rate": 1.570141353567495e-05, "loss": 1.0936, "step": 224800 }, { "epoch": 6.8705321683876095, "grad_norm": 8.315845489501953, "learning_rate": 1.568611553053482e-05, "loss": 1.0834, "step": 224900 }, { "epoch": 6.873587095985825, "grad_norm": 9.586271286010742, "learning_rate": 
1.5670817525394692e-05, "loss": 1.2971, "step": 225000 }, { "epoch": 6.876642023584041, "grad_norm": 3.927130699157715, "learning_rate": 1.565551952025456e-05, "loss": 1.6149, "step": 225100 }, { "epoch": 6.879696951182257, "grad_norm": 4.632936477661133, "learning_rate": 1.564022151511443e-05, "loss": 1.1813, "step": 225200 }, { "epoch": 6.882751878780473, "grad_norm": 6.257320880889893, "learning_rate": 1.5624923509974302e-05, "loss": 0.9665, "step": 225300 }, { "epoch": 6.885806806378689, "grad_norm": 23.577136993408203, "learning_rate": 1.5609625504834173e-05, "loss": 1.2635, "step": 225400 }, { "epoch": 6.8888617339769045, "grad_norm": 13.904678344726562, "learning_rate": 1.559432749969404e-05, "loss": 1.0694, "step": 225500 }, { "epoch": 6.89191666157512, "grad_norm": 6.760549545288086, "learning_rate": 1.5579029494553912e-05, "loss": 1.1164, "step": 225600 }, { "epoch": 6.894971589173337, "grad_norm": 10.959383964538574, "learning_rate": 1.5563731489413783e-05, "loss": 1.3547, "step": 225700 }, { "epoch": 6.898026516771552, "grad_norm": 5.473452091217041, "learning_rate": 1.554843348427365e-05, "loss": 1.0673, "step": 225800 }, { "epoch": 6.901081444369768, "grad_norm": 4.299148082733154, "learning_rate": 1.5533135479133522e-05, "loss": 1.1235, "step": 225900 }, { "epoch": 6.904136371967985, "grad_norm": 5.2035417556762695, "learning_rate": 1.551783747399339e-05, "loss": 1.2148, "step": 226000 }, { "epoch": 6.9071912995662, "grad_norm": 8.355424880981445, "learning_rate": 1.550253946885326e-05, "loss": 1.0416, "step": 226100 }, { "epoch": 6.910246227164416, "grad_norm": 17.467309951782227, "learning_rate": 1.5487241463713132e-05, "loss": 1.0306, "step": 226200 }, { "epoch": 6.913301154762632, "grad_norm": 6.781820297241211, "learning_rate": 1.5471943458573003e-05, "loss": 0.9252, "step": 226300 }, { "epoch": 6.916356082360848, "grad_norm": 6.654858589172363, "learning_rate": 1.545664545343287e-05, "loss": 1.124, "step": 226400 }, { "epoch": 
6.919411009959064, "grad_norm": 10.962957382202148, "learning_rate": 1.5441347448292742e-05, "loss": 1.0413, "step": 226500 }, { "epoch": 6.92246593755728, "grad_norm": 12.6961030960083, "learning_rate": 1.5426049443152613e-05, "loss": 1.265, "step": 226600 }, { "epoch": 6.925520865155496, "grad_norm": 5.4705681800842285, "learning_rate": 1.5410751438012485e-05, "loss": 1.0878, "step": 226700 }, { "epoch": 6.928575792753712, "grad_norm": 7.821584701538086, "learning_rate": 1.5395453432872352e-05, "loss": 1.2651, "step": 226800 }, { "epoch": 6.931630720351928, "grad_norm": 1.9477187395095825, "learning_rate": 1.5380155427732223e-05, "loss": 1.2534, "step": 226900 }, { "epoch": 6.934685647950143, "grad_norm": 10.880365371704102, "learning_rate": 1.5364857422592095e-05, "loss": 1.028, "step": 227000 }, { "epoch": 6.93774057554836, "grad_norm": 3.2521772384643555, "learning_rate": 1.5349559417451966e-05, "loss": 1.0412, "step": 227100 }, { "epoch": 6.940795503146576, "grad_norm": 6.3504743576049805, "learning_rate": 1.5334261412311834e-05, "loss": 1.2927, "step": 227200 }, { "epoch": 6.943850430744791, "grad_norm": 11.716618537902832, "learning_rate": 1.5319116387223105e-05, "loss": 1.0605, "step": 227300 }, { "epoch": 6.946905358343007, "grad_norm": 12.864195823669434, "learning_rate": 1.5303818382082976e-05, "loss": 0.941, "step": 227400 }, { "epoch": 6.9499602859412235, "grad_norm": 5.322654724121094, "learning_rate": 1.5288520376942847e-05, "loss": 1.1716, "step": 227500 }, { "epoch": 6.953015213539439, "grad_norm": 7.079869270324707, "learning_rate": 1.527322237180272e-05, "loss": 1.0776, "step": 227600 }, { "epoch": 6.956070141137655, "grad_norm": 4.664093017578125, "learning_rate": 1.5257924366662588e-05, "loss": 1.1936, "step": 227700 }, { "epoch": 6.959125068735871, "grad_norm": 10.354986190795898, "learning_rate": 1.5242626361522457e-05, "loss": 1.0978, "step": 227800 }, { "epoch": 6.962179996334087, "grad_norm": 6.121084213256836, "learning_rate": 
1.5227328356382328e-05, "loss": 1.1082, "step": 227900 }, { "epoch": 6.965234923932303, "grad_norm": 6.739871025085449, "learning_rate": 1.5212030351242198e-05, "loss": 1.2247, "step": 228000 }, { "epoch": 6.9682898515305185, "grad_norm": 7.606113910675049, "learning_rate": 1.5196732346102069e-05, "loss": 1.314, "step": 228100 }, { "epoch": 6.971344779128735, "grad_norm": 1.3897545337677002, "learning_rate": 1.5181434340961939e-05, "loss": 0.996, "step": 228200 }, { "epoch": 6.974399706726951, "grad_norm": 8.686017036437988, "learning_rate": 1.516613633582181e-05, "loss": 1.1612, "step": 228300 }, { "epoch": 6.977454634325166, "grad_norm": 11.423168182373047, "learning_rate": 1.515083833068168e-05, "loss": 1.0611, "step": 228400 }, { "epoch": 6.980509561923382, "grad_norm": 11.569713592529297, "learning_rate": 1.513554032554155e-05, "loss": 0.9411, "step": 228500 }, { "epoch": 6.983564489521599, "grad_norm": 7.2182745933532715, "learning_rate": 1.512024232040142e-05, "loss": 1.1535, "step": 228600 }, { "epoch": 6.986619417119814, "grad_norm": 9.209665298461914, "learning_rate": 1.5104944315261291e-05, "loss": 1.445, "step": 228700 }, { "epoch": 6.98967434471803, "grad_norm": 3.6917855739593506, "learning_rate": 1.508964631012116e-05, "loss": 1.1005, "step": 228800 }, { "epoch": 6.992729272316246, "grad_norm": 7.044301986694336, "learning_rate": 1.5074348304981032e-05, "loss": 1.1654, "step": 228900 }, { "epoch": 6.995784199914462, "grad_norm": 5.988764762878418, "learning_rate": 1.5059050299840901e-05, "loss": 1.1821, "step": 229000 }, { "epoch": 6.998839127512678, "grad_norm": 4.651966094970703, "learning_rate": 1.5043752294700772e-05, "loss": 1.0204, "step": 229100 }, { "epoch": 7.0, "eval_accuracy": 0.7272560640312825, "eval_loss": 0.7299212217330933, "eval_runtime": 1775.0348, "eval_samples_per_second": 18.441, "eval_steps_per_second": 4.611, "step": 229138 }, { "epoch": 7.001894055110894, "grad_norm": 4.024294853210449, "learning_rate": 1.5028454289560642e-05, 
"loss": 1.1047, "step": 229200 }, { "epoch": 7.004948982709109, "grad_norm": 4.152279376983643, "learning_rate": 1.5013156284420513e-05, "loss": 1.1013, "step": 229300 }, { "epoch": 7.008003910307326, "grad_norm": 7.752127647399902, "learning_rate": 1.4997858279280382e-05, "loss": 0.9694, "step": 229400 }, { "epoch": 7.011058837905542, "grad_norm": 26.886756896972656, "learning_rate": 1.4982560274140253e-05, "loss": 1.76, "step": 229500 }, { "epoch": 7.014113765503757, "grad_norm": 5.418153285980225, "learning_rate": 1.4967262269000123e-05, "loss": 1.2325, "step": 229600 }, { "epoch": 7.017168693101974, "grad_norm": 3.349722146987915, "learning_rate": 1.4951964263859994e-05, "loss": 1.0413, "step": 229700 }, { "epoch": 7.02022362070019, "grad_norm": 11.646615028381348, "learning_rate": 1.4936666258719864e-05, "loss": 1.0621, "step": 229800 }, { "epoch": 7.023278548298405, "grad_norm": 11.823214530944824, "learning_rate": 1.4921368253579735e-05, "loss": 1.0923, "step": 229900 }, { "epoch": 7.026333475896621, "grad_norm": 16.650747299194336, "learning_rate": 1.4906070248439604e-05, "loss": 1.0721, "step": 230000 }, { "epoch": 7.0293884034948375, "grad_norm": 6.320542335510254, "learning_rate": 1.4890772243299475e-05, "loss": 1.1855, "step": 230100 }, { "epoch": 7.032443331093053, "grad_norm": 8.720442771911621, "learning_rate": 1.4875474238159345e-05, "loss": 1.0078, "step": 230200 }, { "epoch": 7.035498258691269, "grad_norm": 4.015342712402344, "learning_rate": 1.4860176233019216e-05, "loss": 1.1124, "step": 230300 }, { "epoch": 7.038553186289485, "grad_norm": 5.141261100769043, "learning_rate": 1.4844878227879085e-05, "loss": 0.9912, "step": 230400 }, { "epoch": 7.041608113887701, "grad_norm": 6.7251296043396, "learning_rate": 1.4829580222738957e-05, "loss": 1.1093, "step": 230500 }, { "epoch": 7.044663041485917, "grad_norm": 5.447814464569092, "learning_rate": 1.4814282217598826e-05, "loss": 1.0289, "step": 230600 }, { "epoch": 7.0477179690841325, "grad_norm": 
10.36055850982666, "learning_rate": 1.4798984212458697e-05, "loss": 0.9546, "step": 230700 }, { "epoch": 7.050772896682348, "grad_norm": 5.748435020446777, "learning_rate": 1.4783686207318567e-05, "loss": 1.0437, "step": 230800 }, { "epoch": 7.053827824280565, "grad_norm": 4.40362548828125, "learning_rate": 1.4768388202178438e-05, "loss": 1.151, "step": 230900 }, { "epoch": 7.0568827518787804, "grad_norm": 5.855711936950684, "learning_rate": 1.4753090197038307e-05, "loss": 0.9614, "step": 231000 }, { "epoch": 7.059937679476996, "grad_norm": 14.346360206604004, "learning_rate": 1.4737792191898178e-05, "loss": 1.1533, "step": 231100 }, { "epoch": 7.062992607075213, "grad_norm": 9.502619743347168, "learning_rate": 1.4722494186758048e-05, "loss": 0.9982, "step": 231200 }, { "epoch": 7.066047534673428, "grad_norm": 10.261883735656738, "learning_rate": 1.4707349161669318e-05, "loss": 1.2859, "step": 231300 }, { "epoch": 7.069102462271644, "grad_norm": 1.5850389003753662, "learning_rate": 1.4692051156529189e-05, "loss": 1.2657, "step": 231400 }, { "epoch": 7.07215738986986, "grad_norm": 9.610984802246094, "learning_rate": 1.4676753151389058e-05, "loss": 1.2829, "step": 231500 }, { "epoch": 7.075212317468076, "grad_norm": 9.58398151397705, "learning_rate": 1.466145514624893e-05, "loss": 1.1443, "step": 231600 }, { "epoch": 7.078267245066292, "grad_norm": 2.9476981163024902, "learning_rate": 1.4646157141108799e-05, "loss": 0.9903, "step": 231700 }, { "epoch": 7.081322172664508, "grad_norm": 7.310122966766357, "learning_rate": 1.463085913596867e-05, "loss": 0.9488, "step": 231800 }, { "epoch": 7.084377100262723, "grad_norm": 5.88616418838501, "learning_rate": 1.461556113082854e-05, "loss": 1.0228, "step": 231900 }, { "epoch": 7.08743202786094, "grad_norm": 6.997511386871338, "learning_rate": 1.460026312568841e-05, "loss": 1.0473, "step": 232000 }, { "epoch": 7.090486955459156, "grad_norm": 4.911482810974121, "learning_rate": 1.458496512054828e-05, "loss": 1.0401, "step": 
232100 }, { "epoch": 7.093541883057371, "grad_norm": 0.0005010916502214968, "learning_rate": 1.4569667115408151e-05, "loss": 1.1492, "step": 232200 }, { "epoch": 7.096596810655588, "grad_norm": 3.7000105381011963, "learning_rate": 1.455436911026802e-05, "loss": 1.0027, "step": 232300 }, { "epoch": 7.099651738253804, "grad_norm": 13.737366676330566, "learning_rate": 1.4539071105127892e-05, "loss": 1.0997, "step": 232400 }, { "epoch": 7.102706665852019, "grad_norm": 6.119179725646973, "learning_rate": 1.4523773099987761e-05, "loss": 0.9705, "step": 232500 }, { "epoch": 7.105761593450235, "grad_norm": 1.9026933908462524, "learning_rate": 1.4508475094847632e-05, "loss": 1.1789, "step": 232600 }, { "epoch": 7.1088165210484515, "grad_norm": 20.846769332885742, "learning_rate": 1.4493177089707502e-05, "loss": 1.1999, "step": 232700 }, { "epoch": 7.111871448646667, "grad_norm": 6.88537073135376, "learning_rate": 1.4477879084567373e-05, "loss": 1.2442, "step": 232800 }, { "epoch": 7.114926376244883, "grad_norm": 5.11663293838501, "learning_rate": 1.4462581079427242e-05, "loss": 1.2855, "step": 232900 }, { "epoch": 7.117981303843099, "grad_norm": 9.254779815673828, "learning_rate": 1.4447283074287114e-05, "loss": 1.2988, "step": 233000 }, { "epoch": 7.121036231441315, "grad_norm": 7.493042945861816, "learning_rate": 1.4431985069146983e-05, "loss": 1.1818, "step": 233100 }, { "epoch": 7.124091159039531, "grad_norm": 9.90920352935791, "learning_rate": 1.4416687064006854e-05, "loss": 1.0613, "step": 233200 }, { "epoch": 7.1271460866377465, "grad_norm": 4.25264835357666, "learning_rate": 1.4401542038918126e-05, "loss": 0.9499, "step": 233300 }, { "epoch": 7.130201014235962, "grad_norm": 7.848468780517578, "learning_rate": 1.4386397013829397e-05, "loss": 1.0826, "step": 233400 }, { "epoch": 7.133255941834179, "grad_norm": 14.143811225891113, "learning_rate": 1.4371099008689268e-05, "loss": 1.0622, "step": 233500 }, { "epoch": 7.1363108694323945, "grad_norm": 1.9151153564453125, 
"learning_rate": 1.4355801003549138e-05, "loss": 0.9594, "step": 233600 }, { "epoch": 7.13936579703061, "grad_norm": 8.173432350158691, "learning_rate": 1.4340502998409009e-05, "loss": 1.0031, "step": 233700 }, { "epoch": 7.142420724628827, "grad_norm": 1.022454857826233, "learning_rate": 1.4325204993268878e-05, "loss": 1.2063, "step": 233800 }, { "epoch": 7.145475652227042, "grad_norm": 9.124458312988281, "learning_rate": 1.430990698812875e-05, "loss": 1.1762, "step": 233900 }, { "epoch": 7.148530579825258, "grad_norm": 12.708720207214355, "learning_rate": 1.4294608982988619e-05, "loss": 1.1155, "step": 234000 }, { "epoch": 7.151585507423474, "grad_norm": 1.8738713264465332, "learning_rate": 1.427931097784849e-05, "loss": 1.0435, "step": 234100 }, { "epoch": 7.15464043502169, "grad_norm": 7.204272747039795, "learning_rate": 1.426401297270836e-05, "loss": 1.263, "step": 234200 }, { "epoch": 7.157695362619906, "grad_norm": 1.4616661071777344, "learning_rate": 1.424871496756823e-05, "loss": 1.0202, "step": 234300 }, { "epoch": 7.160750290218122, "grad_norm": 3.6434624195098877, "learning_rate": 1.42334169624281e-05, "loss": 0.9481, "step": 234400 }, { "epoch": 7.163805217816337, "grad_norm": 5.396409511566162, "learning_rate": 1.4218118957287971e-05, "loss": 1.1474, "step": 234500 }, { "epoch": 7.166860145414554, "grad_norm": 1.5187724828720093, "learning_rate": 1.420282095214784e-05, "loss": 1.1487, "step": 234600 }, { "epoch": 7.16991507301277, "grad_norm": 2.525434732437134, "learning_rate": 1.4187522947007712e-05, "loss": 1.1669, "step": 234700 }, { "epoch": 7.172970000610985, "grad_norm": 2.303208112716675, "learning_rate": 1.4172224941867581e-05, "loss": 1.1454, "step": 234800 }, { "epoch": 7.176024928209201, "grad_norm": 6.004208564758301, "learning_rate": 1.4156926936727452e-05, "loss": 1.168, "step": 234900 }, { "epoch": 7.179079855807418, "grad_norm": 11.640929222106934, "learning_rate": 1.4141628931587322e-05, "loss": 1.2544, "step": 235000 }, { "epoch": 
7.182134783405633, "grad_norm": 6.673908233642578, "learning_rate": 1.4126330926447193e-05, "loss": 1.2008, "step": 235100 }, { "epoch": 7.185189711003849, "grad_norm": 14.122396469116211, "learning_rate": 1.4111032921307063e-05, "loss": 1.0178, "step": 235200 }, { "epoch": 7.1882446386020655, "grad_norm": 11.688652992248535, "learning_rate": 1.4095734916166934e-05, "loss": 1.2486, "step": 235300 }, { "epoch": 7.191299566200281, "grad_norm": 4.625455856323242, "learning_rate": 1.4080436911026803e-05, "loss": 1.1022, "step": 235400 }, { "epoch": 7.194354493798497, "grad_norm": 5.053491115570068, "learning_rate": 1.4065138905886674e-05, "loss": 1.1197, "step": 235500 }, { "epoch": 7.197409421396713, "grad_norm": 4.066915035247803, "learning_rate": 1.4049840900746544e-05, "loss": 1.1365, "step": 235600 }, { "epoch": 7.200464348994929, "grad_norm": 11.92159366607666, "learning_rate": 1.4034542895606415e-05, "loss": 1.0201, "step": 235700 }, { "epoch": 7.203519276593145, "grad_norm": 15.129595756530762, "learning_rate": 1.4019244890466284e-05, "loss": 1.3958, "step": 235800 }, { "epoch": 7.2065742041913605, "grad_norm": 5.431704521179199, "learning_rate": 1.4003946885326156e-05, "loss": 1.5226, "step": 235900 }, { "epoch": 7.209629131789576, "grad_norm": 9.33706283569336, "learning_rate": 1.3988648880186025e-05, "loss": 1.2875, "step": 236000 }, { "epoch": 7.212684059387793, "grad_norm": 1.7640498876571655, "learning_rate": 1.3973350875045896e-05, "loss": 1.2525, "step": 236100 }, { "epoch": 7.2157389869860085, "grad_norm": 5.957800388336182, "learning_rate": 1.3958052869905766e-05, "loss": 1.1711, "step": 236200 }, { "epoch": 7.218793914584224, "grad_norm": 1.0705599784851074, "learning_rate": 1.3942754864765637e-05, "loss": 1.1135, "step": 236300 }, { "epoch": 7.221848842182441, "grad_norm": 2.2979612350463867, "learning_rate": 1.3927456859625506e-05, "loss": 1.3061, "step": 236400 }, { "epoch": 7.224903769780656, "grad_norm": 7.749421119689941, "learning_rate": 
1.3912158854485377e-05, "loss": 1.109, "step": 236500 }, { "epoch": 7.227958697378872, "grad_norm": 2.989555835723877, "learning_rate": 1.3897013829396645e-05, "loss": 1.0205, "step": 236600 }, { "epoch": 7.231013624977088, "grad_norm": 5.32132625579834, "learning_rate": 1.3881715824256517e-05, "loss": 1.2055, "step": 236700 }, { "epoch": 7.234068552575304, "grad_norm": 5.634767055511475, "learning_rate": 1.3866417819116386e-05, "loss": 1.0373, "step": 236800 }, { "epoch": 7.23712348017352, "grad_norm": 7.112980365753174, "learning_rate": 1.3851119813976257e-05, "loss": 1.4055, "step": 236900 }, { "epoch": 7.240178407771736, "grad_norm": 10.331284523010254, "learning_rate": 1.3835821808836127e-05, "loss": 1.1856, "step": 237000 }, { "epoch": 7.243233335369951, "grad_norm": 10.450326919555664, "learning_rate": 1.3820523803695998e-05, "loss": 1.229, "step": 237100 }, { "epoch": 7.246288262968168, "grad_norm": 4.122806549072266, "learning_rate": 1.3805225798555867e-05, "loss": 1.1836, "step": 237200 }, { "epoch": 7.249343190566384, "grad_norm": 6.546516418457031, "learning_rate": 1.3789927793415738e-05, "loss": 1.4002, "step": 237300 }, { "epoch": 7.252398118164599, "grad_norm": 20.594345092773438, "learning_rate": 1.3774629788275608e-05, "loss": 1.0855, "step": 237400 }, { "epoch": 7.255453045762815, "grad_norm": 4.27672815322876, "learning_rate": 1.3759331783135479e-05, "loss": 1.1928, "step": 237500 }, { "epoch": 7.258507973361032, "grad_norm": 24.660978317260742, "learning_rate": 1.3744033777995348e-05, "loss": 1.0662, "step": 237600 }, { "epoch": 7.261562900959247, "grad_norm": 6.918597221374512, "learning_rate": 1.372873577285522e-05, "loss": 1.5574, "step": 237700 }, { "epoch": 7.264617828557463, "grad_norm": 3.216303586959839, "learning_rate": 1.3713437767715089e-05, "loss": 1.0191, "step": 237800 }, { "epoch": 7.2676727561556795, "grad_norm": 10.302803039550781, "learning_rate": 1.369813976257496e-05, "loss": 1.0279, "step": 237900 }, { "epoch": 
7.270727683753895, "grad_norm": 9.102781295776367, "learning_rate": 1.368284175743483e-05, "loss": 1.2373, "step": 238000 }, { "epoch": 7.273782611352111, "grad_norm": 24.65117645263672, "learning_rate": 1.3667543752294701e-05, "loss": 1.0409, "step": 238100 }, { "epoch": 7.276837538950327, "grad_norm": 3.3516035079956055, "learning_rate": 1.365224574715457e-05, "loss": 1.0934, "step": 238200 }, { "epoch": 7.279892466548543, "grad_norm": 8.557634353637695, "learning_rate": 1.3636947742014441e-05, "loss": 1.0797, "step": 238300 }, { "epoch": 7.282947394146759, "grad_norm": 5.520821571350098, "learning_rate": 1.3621649736874311e-05, "loss": 1.0617, "step": 238400 }, { "epoch": 7.2860023217449745, "grad_norm": 5.227816104888916, "learning_rate": 1.3606351731734182e-05, "loss": 0.977, "step": 238500 }, { "epoch": 7.28905724934319, "grad_norm": 4.158600330352783, "learning_rate": 1.3591053726594052e-05, "loss": 1.057, "step": 238600 }, { "epoch": 7.292112176941407, "grad_norm": 19.767812728881836, "learning_rate": 1.3575908701505325e-05, "loss": 1.484, "step": 238700 }, { "epoch": 7.2951671045396225, "grad_norm": 4.106821060180664, "learning_rate": 1.3560610696365194e-05, "loss": 0.9952, "step": 238800 }, { "epoch": 7.298222032137838, "grad_norm": 8.97727108001709, "learning_rate": 1.3545312691225065e-05, "loss": 1.2, "step": 238900 }, { "epoch": 7.301276959736054, "grad_norm": 11.650214195251465, "learning_rate": 1.3530014686084935e-05, "loss": 1.2169, "step": 239000 }, { "epoch": 7.30433188733427, "grad_norm": 5.218963623046875, "learning_rate": 1.3514716680944806e-05, "loss": 1.1261, "step": 239100 }, { "epoch": 7.307386814932486, "grad_norm": 7.4983086585998535, "learning_rate": 1.3499418675804675e-05, "loss": 0.9903, "step": 239200 }, { "epoch": 7.310441742530702, "grad_norm": 4.398214817047119, "learning_rate": 1.3484120670664546e-05, "loss": 1.12, "step": 239300 }, { "epoch": 7.313496670128918, "grad_norm": 4.890592098236084, "learning_rate": 
1.3468822665524416e-05, "loss": 1.1316, "step": 239400 }, { "epoch": 7.316551597727134, "grad_norm": 4.648405075073242, "learning_rate": 1.3453524660384287e-05, "loss": 1.2801, "step": 239500 }, { "epoch": 7.31960652532535, "grad_norm": 3.6855125427246094, "learning_rate": 1.3438226655244157e-05, "loss": 1.1782, "step": 239600 }, { "epoch": 7.322661452923565, "grad_norm": 7.176767349243164, "learning_rate": 1.3422928650104028e-05, "loss": 0.9889, "step": 239700 }, { "epoch": 7.325716380521782, "grad_norm": 8.887348175048828, "learning_rate": 1.3407630644963897e-05, "loss": 1.0531, "step": 239800 }, { "epoch": 7.328771308119998, "grad_norm": 3.1371002197265625, "learning_rate": 1.3392332639823768e-05, "loss": 0.9987, "step": 239900 }, { "epoch": 7.331826235718213, "grad_norm": 18.492141723632812, "learning_rate": 1.3377034634683638e-05, "loss": 1.1184, "step": 240000 }, { "epoch": 7.334881163316429, "grad_norm": 19.492719650268555, "learning_rate": 1.3361736629543509e-05, "loss": 1.266, "step": 240100 }, { "epoch": 7.337936090914646, "grad_norm": 5.891000747680664, "learning_rate": 1.3346438624403378e-05, "loss": 0.932, "step": 240200 }, { "epoch": 7.340991018512861, "grad_norm": 3.2475335597991943, "learning_rate": 1.333114061926325e-05, "loss": 1.3091, "step": 240300 }, { "epoch": 7.344045946111077, "grad_norm": 7.269662857055664, "learning_rate": 1.3315842614123119e-05, "loss": 1.0285, "step": 240400 }, { "epoch": 7.347100873709293, "grad_norm": 8.6480073928833, "learning_rate": 1.330054460898299e-05, "loss": 1.038, "step": 240500 }, { "epoch": 7.350155801307509, "grad_norm": 8.871041297912598, "learning_rate": 1.328524660384286e-05, "loss": 1.051, "step": 240600 }, { "epoch": 7.353210728905725, "grad_norm": 1.0303308963775635, "learning_rate": 1.326994859870273e-05, "loss": 1.0538, "step": 240700 }, { "epoch": 7.356265656503941, "grad_norm": 6.683864593505859, "learning_rate": 1.32546505935626e-05, "loss": 1.135, "step": 240800 }, { "epoch": 7.359320584102157, 
"grad_norm": 7.670544624328613, "learning_rate": 1.3239352588422471e-05, "loss": 1.0207, "step": 240900 }, { "epoch": 7.362375511700373, "grad_norm": 6.8379807472229, "learning_rate": 1.3224054583282341e-05, "loss": 0.986, "step": 241000 }, { "epoch": 7.3654304392985885, "grad_norm": 4.02485990524292, "learning_rate": 1.3208756578142212e-05, "loss": 1.4324, "step": 241100 }, { "epoch": 7.368485366896804, "grad_norm": 3.0033915042877197, "learning_rate": 1.3193458573002082e-05, "loss": 1.2366, "step": 241200 }, { "epoch": 7.371540294495021, "grad_norm": 10.496794700622559, "learning_rate": 1.3178160567861953e-05, "loss": 1.1109, "step": 241300 }, { "epoch": 7.3745952220932365, "grad_norm": 13.235259056091309, "learning_rate": 1.3162862562721822e-05, "loss": 1.185, "step": 241400 }, { "epoch": 7.377650149691452, "grad_norm": 1.6905969381332397, "learning_rate": 1.3147564557581693e-05, "loss": 1.1872, "step": 241500 }, { "epoch": 7.380705077289668, "grad_norm": 6.437650680541992, "learning_rate": 1.3132266552441563e-05, "loss": 1.0061, "step": 241600 }, { "epoch": 7.383760004887884, "grad_norm": 9.396056175231934, "learning_rate": 1.3116968547301434e-05, "loss": 1.3761, "step": 241700 }, { "epoch": 7.3868149324861, "grad_norm": 8.510388374328613, "learning_rate": 1.3101670542161303e-05, "loss": 1.1561, "step": 241800 }, { "epoch": 7.389869860084316, "grad_norm": 3.6753456592559814, "learning_rate": 1.3086372537021175e-05, "loss": 1.3092, "step": 241900 }, { "epoch": 7.3929247876825315, "grad_norm": 5.2257585525512695, "learning_rate": 1.3071074531881044e-05, "loss": 1.0462, "step": 242000 }, { "epoch": 7.395979715280748, "grad_norm": 4.928442001342773, "learning_rate": 1.3055776526740915e-05, "loss": 1.0274, "step": 242100 }, { "epoch": 7.399034642878964, "grad_norm": 14.141927719116211, "learning_rate": 1.3040478521600785e-05, "loss": 0.9869, "step": 242200 }, { "epoch": 7.402089570477179, "grad_norm": 6.100747108459473, "learning_rate": 1.3025180516460656e-05, 
"loss": 1.0678, "step": 242300 }, { "epoch": 7.405144498075396, "grad_norm": 3.6259148120880127, "learning_rate": 1.3009882511320524e-05, "loss": 1.1743, "step": 242400 }, { "epoch": 7.408199425673612, "grad_norm": 10.17052173614502, "learning_rate": 1.2994584506180393e-05, "loss": 1.2426, "step": 242500 }, { "epoch": 7.411254353271827, "grad_norm": 16.377918243408203, "learning_rate": 1.2979286501040264e-05, "loss": 1.0444, "step": 242600 }, { "epoch": 7.414309280870043, "grad_norm": 9.223760604858398, "learning_rate": 1.2964141475951536e-05, "loss": 1.1732, "step": 242700 }, { "epoch": 7.41736420846826, "grad_norm": 9.646039962768555, "learning_rate": 1.2948843470811407e-05, "loss": 0.9626, "step": 242800 }, { "epoch": 7.420419136066475, "grad_norm": 6.570456504821777, "learning_rate": 1.2933545465671276e-05, "loss": 1.1104, "step": 242900 }, { "epoch": 7.423474063664691, "grad_norm": 6.207412242889404, "learning_rate": 1.2918247460531147e-05, "loss": 1.2049, "step": 243000 }, { "epoch": 7.426528991262907, "grad_norm": 4.209483623504639, "learning_rate": 1.2902949455391017e-05, "loss": 0.9876, "step": 243100 }, { "epoch": 7.429583918861123, "grad_norm": 9.007349967956543, "learning_rate": 1.2887651450250888e-05, "loss": 1.226, "step": 243200 }, { "epoch": 7.432638846459339, "grad_norm": 2.926483154296875, "learning_rate": 1.2872353445110757e-05, "loss": 1.1661, "step": 243300 }, { "epoch": 7.435693774057555, "grad_norm": 12.762822151184082, "learning_rate": 1.2857055439970629e-05, "loss": 1.211, "step": 243400 }, { "epoch": 7.43874870165577, "grad_norm": 6.0010085105896, "learning_rate": 1.2841757434830498e-05, "loss": 1.15, "step": 243500 }, { "epoch": 7.441803629253987, "grad_norm": 6.056092739105225, "learning_rate": 1.282645942969037e-05, "loss": 1.1764, "step": 243600 }, { "epoch": 7.4448585568522025, "grad_norm": 18.341081619262695, "learning_rate": 1.2811161424550239e-05, "loss": 0.924, "step": 243700 }, { "epoch": 7.447913484450418, "grad_norm": 
1.1248468160629272, "learning_rate": 1.279601639946151e-05, "loss": 1.6367, "step": 243800 }, { "epoch": 7.450968412048635, "grad_norm": 30.9728946685791, "learning_rate": 1.2780718394321381e-05, "loss": 1.2798, "step": 243900 }, { "epoch": 7.4540233396468505, "grad_norm": 9.996712684631348, "learning_rate": 1.276542038918125e-05, "loss": 1.017, "step": 244000 }, { "epoch": 7.457078267245066, "grad_norm": 11.12072467803955, "learning_rate": 1.2750122384041122e-05, "loss": 1.148, "step": 244100 }, { "epoch": 7.460133194843282, "grad_norm": 17.706918716430664, "learning_rate": 1.2734824378900991e-05, "loss": 1.2085, "step": 244200 }, { "epoch": 7.463188122441498, "grad_norm": 6.079930782318115, "learning_rate": 1.2719526373760862e-05, "loss": 1.168, "step": 244300 }, { "epoch": 7.466243050039714, "grad_norm": 6.82859468460083, "learning_rate": 1.2704228368620732e-05, "loss": 0.9845, "step": 244400 }, { "epoch": 7.46929797763793, "grad_norm": 3.8205902576446533, "learning_rate": 1.2688930363480603e-05, "loss": 1.064, "step": 244500 }, { "epoch": 7.472352905236146, "grad_norm": 4.047750473022461, "learning_rate": 1.2673632358340472e-05, "loss": 1.2488, "step": 244600 }, { "epoch": 7.475407832834362, "grad_norm": 7.491181373596191, "learning_rate": 1.2658334353200344e-05, "loss": 1.0114, "step": 244700 }, { "epoch": 7.478462760432578, "grad_norm": 6.972776889801025, "learning_rate": 1.2643036348060213e-05, "loss": 0.9721, "step": 244800 }, { "epoch": 7.481517688030793, "grad_norm": 6.191954135894775, "learning_rate": 1.2627738342920084e-05, "loss": 1.131, "step": 244900 }, { "epoch": 7.48457261562901, "grad_norm": 0.34253376722335815, "learning_rate": 1.2612440337779954e-05, "loss": 1.3406, "step": 245000 }, { "epoch": 7.487627543227226, "grad_norm": 4.320346832275391, "learning_rate": 1.2597142332639825e-05, "loss": 1.23, "step": 245100 }, { "epoch": 7.490682470825441, "grad_norm": 2.7404799461364746, "learning_rate": 1.2581844327499694e-05, "loss": 1.0078, "step": 
245200 }, { "epoch": 7.493737398423657, "grad_norm": 12.498672485351562, "learning_rate": 1.2566546322359565e-05, "loss": 1.0024, "step": 245300 }, { "epoch": 7.496792326021874, "grad_norm": 5.074945449829102, "learning_rate": 1.2551248317219435e-05, "loss": 1.1319, "step": 245400 }, { "epoch": 7.499847253620089, "grad_norm": 6.632750511169434, "learning_rate": 1.2535950312079306e-05, "loss": 1.0165, "step": 245500 }, { "epoch": 7.502902181218305, "grad_norm": 9.370561599731445, "learning_rate": 1.2520652306939176e-05, "loss": 1.0874, "step": 245600 }, { "epoch": 7.505957108816521, "grad_norm": 13.537225723266602, "learning_rate": 1.2505354301799047e-05, "loss": 1.1978, "step": 245700 }, { "epoch": 7.509012036414737, "grad_norm": 6.405066013336182, "learning_rate": 1.2490056296658916e-05, "loss": 0.9219, "step": 245800 }, { "epoch": 7.512066964012953, "grad_norm": 2.5054051876068115, "learning_rate": 1.2474758291518787e-05, "loss": 1.2472, "step": 245900 }, { "epoch": 7.515121891611169, "grad_norm": 4.03517484664917, "learning_rate": 1.2459460286378657e-05, "loss": 1.0767, "step": 246000 }, { "epoch": 7.518176819209385, "grad_norm": 1.487331509590149, "learning_rate": 1.2444315261289928e-05, "loss": 1.3574, "step": 246100 }, { "epoch": 7.521231746807601, "grad_norm": 11.857295989990234, "learning_rate": 1.2429017256149798e-05, "loss": 1.1426, "step": 246200 }, { "epoch": 7.5242866744058166, "grad_norm": 21.534475326538086, "learning_rate": 1.2413719251009669e-05, "loss": 1.1534, "step": 246300 }, { "epoch": 7.527341602004032, "grad_norm": 49.81202697753906, "learning_rate": 1.2398421245869538e-05, "loss": 1.2241, "step": 246400 }, { "epoch": 7.530396529602249, "grad_norm": 0.9042030572891235, "learning_rate": 1.238312324072941e-05, "loss": 1.0467, "step": 246500 }, { "epoch": 7.5334514572004645, "grad_norm": 6.449215412139893, "learning_rate": 1.2367825235589279e-05, "loss": 1.0632, "step": 246600 }, { "epoch": 7.53650638479868, "grad_norm": 5.2182841300964355, 
"learning_rate": 1.235252723044915e-05, "loss": 1.0664, "step": 246700 }, { "epoch": 7.539561312396896, "grad_norm": 3.7590441703796387, "learning_rate": 1.233722922530902e-05, "loss": 1.2511, "step": 246800 }, { "epoch": 7.542616239995112, "grad_norm": 6.888582229614258, "learning_rate": 1.232193122016889e-05, "loss": 1.1955, "step": 246900 }, { "epoch": 7.545671167593328, "grad_norm": 11.217452049255371, "learning_rate": 1.230663321502876e-05, "loss": 0.9752, "step": 247000 }, { "epoch": 7.548726095191544, "grad_norm": 0.6481238603591919, "learning_rate": 1.2291335209888631e-05, "loss": 1.1871, "step": 247100 }, { "epoch": 7.5517810227897595, "grad_norm": 12.974531173706055, "learning_rate": 1.22760372047485e-05, "loss": 1.0176, "step": 247200 }, { "epoch": 7.554835950387976, "grad_norm": 11.442163467407227, "learning_rate": 1.2260739199608372e-05, "loss": 1.2426, "step": 247300 }, { "epoch": 7.557890877986192, "grad_norm": 6.187389850616455, "learning_rate": 1.2245441194468241e-05, "loss": 1.1011, "step": 247400 }, { "epoch": 7.560945805584407, "grad_norm": 9.375640869140625, "learning_rate": 1.2230143189328112e-05, "loss": 1.1226, "step": 247500 }, { "epoch": 7.564000733182624, "grad_norm": 9.352968215942383, "learning_rate": 1.2214845184187982e-05, "loss": 1.0503, "step": 247600 }, { "epoch": 7.56705566078084, "grad_norm": 1.1412074565887451, "learning_rate": 1.2199547179047853e-05, "loss": 1.2557, "step": 247700 }, { "epoch": 7.570110588379055, "grad_norm": 4.671082019805908, "learning_rate": 1.2184249173907723e-05, "loss": 1.3057, "step": 247800 }, { "epoch": 7.573165515977271, "grad_norm": 3.711911916732788, "learning_rate": 1.2168951168767594e-05, "loss": 1.1333, "step": 247900 }, { "epoch": 7.576220443575488, "grad_norm": 2.88683819770813, "learning_rate": 1.2153653163627463e-05, "loss": 1.1266, "step": 248000 }, { "epoch": 7.579275371173703, "grad_norm": 5.689997673034668, "learning_rate": 1.2138355158487334e-05, "loss": 1.2176, "step": 248100 }, { 
"epoch": 7.582330298771919, "grad_norm": 19.876480102539062, "learning_rate": 1.2123057153347204e-05, "loss": 1.0021, "step": 248200 }, { "epoch": 7.585385226370135, "grad_norm": 14.01642894744873, "learning_rate": 1.2107759148207075e-05, "loss": 1.0996, "step": 248300 }, { "epoch": 7.588440153968351, "grad_norm": 7.023202896118164, "learning_rate": 1.2092461143066944e-05, "loss": 1.0367, "step": 248400 }, { "epoch": 7.591495081566567, "grad_norm": 2.8547801971435547, "learning_rate": 1.2077163137926816e-05, "loss": 1.1222, "step": 248500 }, { "epoch": 7.594550009164783, "grad_norm": 4.367480754852295, "learning_rate": 1.2061865132786685e-05, "loss": 1.0549, "step": 248600 }, { "epoch": 7.597604936762998, "grad_norm": 13.797048568725586, "learning_rate": 1.2046567127646556e-05, "loss": 1.085, "step": 248700 }, { "epoch": 7.600659864361215, "grad_norm": 4.761076927185059, "learning_rate": 1.2031269122506426e-05, "loss": 0.9709, "step": 248800 }, { "epoch": 7.603714791959431, "grad_norm": 9.693614959716797, "learning_rate": 1.2015971117366295e-05, "loss": 1.0974, "step": 248900 }, { "epoch": 7.606769719557646, "grad_norm": 9.48823356628418, "learning_rate": 1.2000673112226166e-05, "loss": 1.1598, "step": 249000 }, { "epoch": 7.609824647155863, "grad_norm": 2.078655481338501, "learning_rate": 1.1985375107086036e-05, "loss": 1.1203, "step": 249100 }, { "epoch": 7.6128795747540785, "grad_norm": 8.278289794921875, "learning_rate": 1.1970230081997307e-05, "loss": 1.0562, "step": 249200 }, { "epoch": 7.615934502352294, "grad_norm": 20.82103157043457, "learning_rate": 1.1954932076857178e-05, "loss": 1.0621, "step": 249300 }, { "epoch": 7.61898942995051, "grad_norm": 1.3670573234558105, "learning_rate": 1.1939634071717048e-05, "loss": 1.2731, "step": 249400 }, { "epoch": 7.6220443575487264, "grad_norm": 12.100311279296875, "learning_rate": 1.1924336066576919e-05, "loss": 1.0303, "step": 249500 }, { "epoch": 7.625099285146942, "grad_norm": 13.72838020324707, "learning_rate": 
1.1909038061436788e-05, "loss": 1.2989, "step": 249600 }, { "epoch": 7.628154212745158, "grad_norm": 8.885602951049805, "learning_rate": 1.189374005629666e-05, "loss": 0.9904, "step": 249700 }, { "epoch": 7.6312091403433735, "grad_norm": 12.419922828674316, "learning_rate": 1.1878442051156529e-05, "loss": 1.1415, "step": 249800 }, { "epoch": 7.63426406794159, "grad_norm": 3.4879326820373535, "learning_rate": 1.18631440460164e-05, "loss": 1.1963, "step": 249900 }, { "epoch": 7.637318995539806, "grad_norm": 10.463134765625, "learning_rate": 1.184784604087627e-05, "loss": 1.4307, "step": 250000 }, { "epoch": 7.640373923138021, "grad_norm": 8.788840293884277, "learning_rate": 1.183254803573614e-05, "loss": 1.2819, "step": 250100 }, { "epoch": 7.643428850736237, "grad_norm": 20.626136779785156, "learning_rate": 1.181725003059601e-05, "loss": 1.025, "step": 250200 }, { "epoch": 7.646483778334454, "grad_norm": 7.884489059448242, "learning_rate": 1.1801952025455881e-05, "loss": 0.9718, "step": 250300 }, { "epoch": 7.649538705932669, "grad_norm": 5.414434432983398, "learning_rate": 1.178665402031575e-05, "loss": 1.2849, "step": 250400 }, { "epoch": 7.652593633530885, "grad_norm": 6.032369613647461, "learning_rate": 1.1771356015175622e-05, "loss": 1.0759, "step": 250500 }, { "epoch": 7.655648561129102, "grad_norm": 4.3075642585754395, "learning_rate": 1.1756058010035491e-05, "loss": 1.1365, "step": 250600 }, { "epoch": 7.658703488727317, "grad_norm": 1.8966740369796753, "learning_rate": 1.1740760004895363e-05, "loss": 1.0646, "step": 250700 }, { "epoch": 7.661758416325533, "grad_norm": 2.841329574584961, "learning_rate": 1.1725461999755232e-05, "loss": 0.9975, "step": 250800 }, { "epoch": 7.664813343923749, "grad_norm": 11.211971282958984, "learning_rate": 1.1710163994615103e-05, "loss": 1.0907, "step": 250900 }, { "epoch": 7.667868271521965, "grad_norm": 6.564040660858154, "learning_rate": 1.1694865989474973e-05, "loss": 1.2923, "step": 251000 }, { "epoch": 
7.670923199120181, "grad_norm": 9.287890434265137, "learning_rate": 1.1679567984334844e-05, "loss": 0.9328, "step": 251100 }, { "epoch": 7.673978126718397, "grad_norm": 17.20326805114746, "learning_rate": 1.1664269979194713e-05, "loss": 1.2543, "step": 251200 }, { "epoch": 7.677033054316612, "grad_norm": 8.150099754333496, "learning_rate": 1.1648971974054584e-05, "loss": 0.9473, "step": 251300 }, { "epoch": 7.680087981914829, "grad_norm": 7.258596897125244, "learning_rate": 1.1633673968914454e-05, "loss": 1.3426, "step": 251400 }, { "epoch": 7.683142909513045, "grad_norm": 7.076548099517822, "learning_rate": 1.1618375963774325e-05, "loss": 1.2426, "step": 251500 }, { "epoch": 7.68619783711126, "grad_norm": 6.173647880554199, "learning_rate": 1.1603077958634195e-05, "loss": 1.1409, "step": 251600 }, { "epoch": 7.689252764709476, "grad_norm": 12.131219863891602, "learning_rate": 1.1587932933545466e-05, "loss": 1.1354, "step": 251700 }, { "epoch": 7.6923076923076925, "grad_norm": 6.685665607452393, "learning_rate": 1.1572634928405335e-05, "loss": 0.8743, "step": 251800 }, { "epoch": 7.695362619905908, "grad_norm": 8.49716854095459, "learning_rate": 1.1557336923265207e-05, "loss": 1.0753, "step": 251900 }, { "epoch": 7.698417547504124, "grad_norm": 5.741514682769775, "learning_rate": 1.1542038918125076e-05, "loss": 0.9022, "step": 252000 }, { "epoch": 7.7014724751023405, "grad_norm": 6.333673477172852, "learning_rate": 1.1526740912984947e-05, "loss": 1.1122, "step": 252100 }, { "epoch": 7.704527402700556, "grad_norm": 12.152358055114746, "learning_rate": 1.1511442907844817e-05, "loss": 1.2149, "step": 252200 }, { "epoch": 7.707582330298772, "grad_norm": 6.272040367126465, "learning_rate": 1.1496144902704688e-05, "loss": 1.0376, "step": 252300 }, { "epoch": 7.7106372578969875, "grad_norm": 9.917558670043945, "learning_rate": 1.1480846897564557e-05, "loss": 1.2105, "step": 252400 }, { "epoch": 7.713692185495204, "grad_norm": 5.992608070373535, "learning_rate": 
1.1465548892424428e-05, "loss": 1.2978, "step": 252500 }, { "epoch": 7.71674711309342, "grad_norm": 15.975558280944824, "learning_rate": 1.1450250887284298e-05, "loss": 1.0692, "step": 252600 }, { "epoch": 7.7198020406916354, "grad_norm": 4.722947597503662, "learning_rate": 1.1434952882144169e-05, "loss": 1.2287, "step": 252700 }, { "epoch": 7.722856968289852, "grad_norm": 11.227826118469238, "learning_rate": 1.1419654877004038e-05, "loss": 1.0139, "step": 252800 }, { "epoch": 7.725911895888068, "grad_norm": 2.9598705768585205, "learning_rate": 1.140435687186391e-05, "loss": 1.13, "step": 252900 }, { "epoch": 7.728966823486283, "grad_norm": 7.798380374908447, "learning_rate": 1.1389058866723779e-05, "loss": 1.2424, "step": 253000 }, { "epoch": 7.732021751084499, "grad_norm": 12.195430755615234, "learning_rate": 1.137376086158365e-05, "loss": 1.1667, "step": 253100 }, { "epoch": 7.735076678682715, "grad_norm": 1.4271444082260132, "learning_rate": 1.135846285644352e-05, "loss": 1.0682, "step": 253200 }, { "epoch": 7.738131606280931, "grad_norm": 7.447188377380371, "learning_rate": 1.1343164851303391e-05, "loss": 1.0512, "step": 253300 }, { "epoch": 7.741186533879147, "grad_norm": 7.641314506530762, "learning_rate": 1.132786684616326e-05, "loss": 1.2633, "step": 253400 }, { "epoch": 7.744241461477363, "grad_norm": 5.096051216125488, "learning_rate": 1.1312568841023131e-05, "loss": 0.9755, "step": 253500 }, { "epoch": 7.747296389075579, "grad_norm": 5.797981262207031, "learning_rate": 1.1297270835883001e-05, "loss": 1.0048, "step": 253600 }, { "epoch": 7.750351316673795, "grad_norm": 26.29532814025879, "learning_rate": 1.1281972830742872e-05, "loss": 1.0032, "step": 253700 }, { "epoch": 7.753406244272011, "grad_norm": 15.508411407470703, "learning_rate": 1.1266674825602742e-05, "loss": 1.1657, "step": 253800 }, { "epoch": 7.756461171870226, "grad_norm": 5.2701897621154785, "learning_rate": 1.1251376820462613e-05, "loss": 1.1883, "step": 253900 }, { "epoch": 
7.759516099468443, "grad_norm": 8.632824897766113, "learning_rate": 1.1236078815322484e-05, "loss": 1.0014, "step": 254000 }, { "epoch": 7.762571027066659, "grad_norm": 7.206718921661377, "learning_rate": 1.1220780810182353e-05, "loss": 1.0059, "step": 254100 }, { "epoch": 7.765625954664874, "grad_norm": 4.950286865234375, "learning_rate": 1.1205482805042224e-05, "loss": 0.9502, "step": 254200 }, { "epoch": 7.768680882263091, "grad_norm": 7.537136077880859, "learning_rate": 1.1190184799902094e-05, "loss": 0.9475, "step": 254300 }, { "epoch": 7.7717358098613065, "grad_norm": 8.627643585205078, "learning_rate": 1.1174886794761965e-05, "loss": 1.2275, "step": 254400 }, { "epoch": 7.774790737459522, "grad_norm": 4.86769962310791, "learning_rate": 1.1159588789621833e-05, "loss": 1.0626, "step": 254500 }, { "epoch": 7.777845665057738, "grad_norm": 2.8688037395477295, "learning_rate": 1.1144290784481704e-05, "loss": 1.2409, "step": 254600 }, { "epoch": 7.780900592655954, "grad_norm": 3.97426700592041, "learning_rate": 1.1129145759392975e-05, "loss": 1.1648, "step": 254700 }, { "epoch": 7.78395552025417, "grad_norm": 12.675214767456055, "learning_rate": 1.1113847754252847e-05, "loss": 0.9781, "step": 254800 }, { "epoch": 7.787010447852386, "grad_norm": 5.294600486755371, "learning_rate": 1.1098549749112716e-05, "loss": 1.2876, "step": 254900 }, { "epoch": 7.7900653754506015, "grad_norm": 18.957189559936523, "learning_rate": 1.1083251743972587e-05, "loss": 1.2355, "step": 255000 }, { "epoch": 7.793120303048818, "grad_norm": 18.206850051879883, "learning_rate": 1.1067953738832457e-05, "loss": 1.1789, "step": 255100 }, { "epoch": 7.796175230647034, "grad_norm": 5.690242290496826, "learning_rate": 1.1052655733692328e-05, "loss": 1.2387, "step": 255200 }, { "epoch": 7.7992301582452495, "grad_norm": 9.694096565246582, "learning_rate": 1.1037357728552197e-05, "loss": 1.1317, "step": 255300 }, { "epoch": 7.802285085843465, "grad_norm": 14.475813865661621, "learning_rate": 
1.1022059723412068e-05, "loss": 1.0649, "step": 255400 }, { "epoch": 7.805340013441682, "grad_norm": 3.3793346881866455, "learning_rate": 1.1006761718271938e-05, "loss": 1.1786, "step": 255500 }, { "epoch": 7.808394941039897, "grad_norm": 7.643477916717529, "learning_rate": 1.0991463713131809e-05, "loss": 1.0217, "step": 255600 }, { "epoch": 7.811449868638113, "grad_norm": 11.50840950012207, "learning_rate": 1.0976165707991678e-05, "loss": 1.2502, "step": 255700 }, { "epoch": 7.81450479623633, "grad_norm": 3.7972185611724854, "learning_rate": 1.096086770285155e-05, "loss": 1.024, "step": 255800 }, { "epoch": 7.817559723834545, "grad_norm": 6.405685901641846, "learning_rate": 1.0945569697711419e-05, "loss": 1.0386, "step": 255900 }, { "epoch": 7.820614651432761, "grad_norm": 6.837638854980469, "learning_rate": 1.093027169257129e-05, "loss": 1.1037, "step": 256000 }, { "epoch": 7.823669579030977, "grad_norm": 0.05840953066945076, "learning_rate": 1.091497368743116e-05, "loss": 1.1804, "step": 256100 }, { "epoch": 7.826724506629192, "grad_norm": 3.761370897293091, "learning_rate": 1.0899675682291031e-05, "loss": 0.9695, "step": 256200 }, { "epoch": 7.829779434227409, "grad_norm": 1.4047656059265137, "learning_rate": 1.08843776771509e-05, "loss": 1.0739, "step": 256300 }, { "epoch": 7.832834361825625, "grad_norm": 13.112373352050781, "learning_rate": 1.0869079672010772e-05, "loss": 1.071, "step": 256400 }, { "epoch": 7.83588928942384, "grad_norm": 9.92342472076416, "learning_rate": 1.0853781666870641e-05, "loss": 1.4552, "step": 256500 }, { "epoch": 7.838944217022057, "grad_norm": 3.913094997406006, "learning_rate": 1.0838483661730512e-05, "loss": 1.2118, "step": 256600 }, { "epoch": 7.841999144620273, "grad_norm": 3.993881940841675, "learning_rate": 1.0823185656590382e-05, "loss": 1.2099, "step": 256700 }, { "epoch": 7.845054072218488, "grad_norm": 11.119633674621582, "learning_rate": 1.0807887651450253e-05, "loss": 1.0744, "step": 256800 }, { "epoch": 
7.848108999816704, "grad_norm": 1.991754412651062, "learning_rate": 1.0792589646310122e-05, "loss": 1.0599, "step": 256900 }, { "epoch": 7.8511639274149205, "grad_norm": 3.5789756774902344, "learning_rate": 1.0777444621221394e-05, "loss": 1.2062, "step": 257000 }, { "epoch": 7.854218855013136, "grad_norm": 3.4579782485961914, "learning_rate": 1.0762146616081263e-05, "loss": 1.4146, "step": 257100 }, { "epoch": 7.857273782611352, "grad_norm": 6.728309631347656, "learning_rate": 1.0746848610941134e-05, "loss": 1.0293, "step": 257200 }, { "epoch": 7.8603287102095685, "grad_norm": 8.280560493469238, "learning_rate": 1.0731550605801004e-05, "loss": 0.9475, "step": 257300 }, { "epoch": 7.863383637807784, "grad_norm": 7.0217366218566895, "learning_rate": 1.0716252600660875e-05, "loss": 1.1673, "step": 257400 }, { "epoch": 7.866438565406, "grad_norm": 4.446942329406738, "learning_rate": 1.0700954595520744e-05, "loss": 1.3345, "step": 257500 }, { "epoch": 7.8694934930042155, "grad_norm": 2.9662723541259766, "learning_rate": 1.0685656590380615e-05, "loss": 1.1369, "step": 257600 }, { "epoch": 7.872548420602432, "grad_norm": 7.454762935638428, "learning_rate": 1.0670358585240485e-05, "loss": 0.892, "step": 257700 }, { "epoch": 7.875603348200648, "grad_norm": 1.9147623777389526, "learning_rate": 1.0655060580100356e-05, "loss": 1.2211, "step": 257800 }, { "epoch": 7.8786582757988635, "grad_norm": 2.9092390537261963, "learning_rate": 1.0639762574960226e-05, "loss": 1.0067, "step": 257900 }, { "epoch": 7.881713203397079, "grad_norm": 22.353246688842773, "learning_rate": 1.0624464569820097e-05, "loss": 1.1166, "step": 258000 }, { "epoch": 7.884768130995296, "grad_norm": 4.585965156555176, "learning_rate": 1.0609166564679966e-05, "loss": 1.1157, "step": 258100 }, { "epoch": 7.887823058593511, "grad_norm": 10.530981063842773, "learning_rate": 1.0593868559539837e-05, "loss": 1.1327, "step": 258200 }, { "epoch": 7.890877986191727, "grad_norm": 6.027432918548584, "learning_rate": 
1.0578570554399707e-05, "loss": 1.0564, "step": 258300 }, { "epoch": 7.893932913789943, "grad_norm": 10.110967636108398, "learning_rate": 1.0563272549259578e-05, "loss": 1.0226, "step": 258400 }, { "epoch": 7.896987841388159, "grad_norm": 6.894031047821045, "learning_rate": 1.0547974544119447e-05, "loss": 1.0349, "step": 258500 }, { "epoch": 7.900042768986375, "grad_norm": 7.420821666717529, "learning_rate": 1.0532676538979319e-05, "loss": 1.3148, "step": 258600 }, { "epoch": 7.903097696584591, "grad_norm": 7.061910629272461, "learning_rate": 1.0517378533839188e-05, "loss": 1.1127, "step": 258700 }, { "epoch": 7.906152624182807, "grad_norm": 5.673000335693359, "learning_rate": 1.0502080528699059e-05, "loss": 1.0278, "step": 258800 }, { "epoch": 7.909207551781023, "grad_norm": 6.4721574783325195, "learning_rate": 1.0486782523558929e-05, "loss": 1.0613, "step": 258900 }, { "epoch": 7.912262479379239, "grad_norm": 2.630610942840576, "learning_rate": 1.04716374984702e-05, "loss": 1.3414, "step": 259000 }, { "epoch": 7.915317406977454, "grad_norm": 3.0736887454986572, "learning_rate": 1.045633949333007e-05, "loss": 1.0752, "step": 259100 }, { "epoch": 7.918372334575671, "grad_norm": 17.174854278564453, "learning_rate": 1.044104148818994e-05, "loss": 1.0473, "step": 259200 }, { "epoch": 7.921427262173887, "grad_norm": 5.889415740966797, "learning_rate": 1.042574348304981e-05, "loss": 1.0409, "step": 259300 }, { "epoch": 7.924482189772102, "grad_norm": 8.806005477905273, "learning_rate": 1.0410445477909681e-05, "loss": 1.2461, "step": 259400 }, { "epoch": 7.927537117370318, "grad_norm": 5.232207775115967, "learning_rate": 1.039514747276955e-05, "loss": 1.1434, "step": 259500 }, { "epoch": 7.9305920449685345, "grad_norm": 6.207265853881836, "learning_rate": 1.0379849467629422e-05, "loss": 1.1761, "step": 259600 }, { "epoch": 7.93364697256675, "grad_norm": 3.6977267265319824, "learning_rate": 1.0364551462489291e-05, "loss": 1.1148, "step": 259700 }, { "epoch": 
7.936701900164966, "grad_norm": 9.098258018493652, "learning_rate": 1.0349253457349162e-05, "loss": 1.0661, "step": 259800 }, { "epoch": 7.939756827763182, "grad_norm": 13.900864601135254, "learning_rate": 1.0333955452209032e-05, "loss": 1.0208, "step": 259900 }, { "epoch": 7.942811755361398, "grad_norm": 3.2146785259246826, "learning_rate": 1.0318657447068903e-05, "loss": 1.31, "step": 260000 }, { "epoch": 7.945866682959614, "grad_norm": 5.783294200897217, "learning_rate": 1.0303359441928773e-05, "loss": 1.357, "step": 260100 }, { "epoch": 7.9489216105578295, "grad_norm": 6.471368789672852, "learning_rate": 1.0288061436788644e-05, "loss": 1.0479, "step": 260200 }, { "epoch": 7.951976538156046, "grad_norm": 5.971426486968994, "learning_rate": 1.0272763431648513e-05, "loss": 1.0926, "step": 260300 }, { "epoch": 7.955031465754262, "grad_norm": 2.5386502742767334, "learning_rate": 1.0257465426508384e-05, "loss": 0.9909, "step": 260400 }, { "epoch": 7.9580863933524775, "grad_norm": 13.469215393066406, "learning_rate": 1.0242167421368254e-05, "loss": 1.0925, "step": 260500 }, { "epoch": 7.961141320950693, "grad_norm": 8.153322219848633, "learning_rate": 1.0226869416228125e-05, "loss": 1.0247, "step": 260600 }, { "epoch": 7.96419624854891, "grad_norm": 9.48679256439209, "learning_rate": 1.0211571411087994e-05, "loss": 1.1772, "step": 260700 }, { "epoch": 7.967251176147125, "grad_norm": 3.1017658710479736, "learning_rate": 1.0196273405947866e-05, "loss": 1.2221, "step": 260800 }, { "epoch": 7.970306103745341, "grad_norm": 2.5709776878356934, "learning_rate": 1.0180975400807735e-05, "loss": 1.5175, "step": 260900 }, { "epoch": 7.973361031343557, "grad_norm": 13.61955738067627, "learning_rate": 1.0165677395667606e-05, "loss": 1.1054, "step": 261000 }, { "epoch": 7.976415958941773, "grad_norm": 8.353294372558594, "learning_rate": 1.0150379390527476e-05, "loss": 1.0566, "step": 261100 }, { "epoch": 7.979470886539989, "grad_norm": 8.776516914367676, "learning_rate": 
1.0135234365438747e-05, "loss": 1.3163, "step": 261200 }, { "epoch": 7.982525814138205, "grad_norm": 14.955212593078613, "learning_rate": 1.0119936360298618e-05, "loss": 1.1869, "step": 261300 }, { "epoch": 7.98558074173642, "grad_norm": 10.493465423583984, "learning_rate": 1.0104638355158488e-05, "loss": 1.1279, "step": 261400 }, { "epoch": 7.988635669334637, "grad_norm": 25.103595733642578, "learning_rate": 1.0089340350018359e-05, "loss": 1.26, "step": 261500 }, { "epoch": 7.991690596932853, "grad_norm": 6.875821113586426, "learning_rate": 1.0074042344878228e-05, "loss": 1.021, "step": 261600 }, { "epoch": 7.994745524531068, "grad_norm": 1.227331519126892, "learning_rate": 1.00587443397381e-05, "loss": 1.2149, "step": 261700 }, { "epoch": 7.997800452129285, "grad_norm": 6.490436553955078, "learning_rate": 1.0043446334597969e-05, "loss": 1.1207, "step": 261800 }, { "epoch": 8.0, "eval_accuracy": 0.7329687786399463, "eval_loss": 0.7225077748298645, "eval_runtime": 1807.7404, "eval_samples_per_second": 18.108, "eval_steps_per_second": 4.527, "step": 261872 }, { "epoch": 8.0008553797275, "grad_norm": 4.015980243682861, "learning_rate": 1.0028148329457838e-05, "loss": 1.074, "step": 261900 }, { "epoch": 8.003910307325716, "grad_norm": 5.590630054473877, "learning_rate": 1.001285032431771e-05, "loss": 1.3674, "step": 262000 }, { "epoch": 8.006965234923932, "grad_norm": 4.984447002410889, "learning_rate": 9.997552319177579e-06, "loss": 1.1473, "step": 262100 }, { "epoch": 8.010020162522148, "grad_norm": 18.93263816833496, "learning_rate": 9.98225431403745e-06, "loss": 1.5645, "step": 262200 }, { "epoch": 8.013075090120363, "grad_norm": 11.363048553466797, "learning_rate": 9.96695630889732e-06, "loss": 1.2486, "step": 262300 }, { "epoch": 8.01613001771858, "grad_norm": 18.34299659729004, "learning_rate": 9.95165830375719e-06, "loss": 1.0042, "step": 262400 }, { "epoch": 8.019184945316796, "grad_norm": 14.327021598815918, "learning_rate": 9.93636029861706e-06, "loss": 
1.0541, "step": 262500 }, { "epoch": 8.022239872915012, "grad_norm": 8.86902141571045, "learning_rate": 9.921062293476931e-06, "loss": 1.3444, "step": 262600 }, { "epoch": 8.025294800513228, "grad_norm": 20.985706329345703, "learning_rate": 9.9057642883368e-06, "loss": 1.2337, "step": 262700 }, { "epoch": 8.028349728111444, "grad_norm": 23.785953521728516, "learning_rate": 9.890466283196672e-06, "loss": 1.2492, "step": 262800 }, { "epoch": 8.03140465570966, "grad_norm": 19.114500045776367, "learning_rate": 9.875168278056541e-06, "loss": 1.3941, "step": 262900 }, { "epoch": 8.034459583307875, "grad_norm": 6.7520833015441895, "learning_rate": 9.859870272916413e-06, "loss": 0.9711, "step": 263000 }, { "epoch": 8.037514510906092, "grad_norm": 2.405261516571045, "learning_rate": 9.844572267776282e-06, "loss": 0.9566, "step": 263100 }, { "epoch": 8.040569438504308, "grad_norm": 3.6269869804382324, "learning_rate": 9.829274262636153e-06, "loss": 1.2396, "step": 263200 }, { "epoch": 8.043624366102524, "grad_norm": 4.379708766937256, "learning_rate": 9.813976257496023e-06, "loss": 1.0622, "step": 263300 }, { "epoch": 8.04667929370074, "grad_norm": 4.235758304595947, "learning_rate": 9.798678252355894e-06, "loss": 0.9627, "step": 263400 }, { "epoch": 8.049734221298955, "grad_norm": 5.870180606842041, "learning_rate": 9.783380247215763e-06, "loss": 0.9076, "step": 263500 }, { "epoch": 8.05278914889717, "grad_norm": 5.316646575927734, "learning_rate": 9.768082242075634e-06, "loss": 1.1855, "step": 263600 }, { "epoch": 8.055844076495386, "grad_norm": 5.945902347564697, "learning_rate": 9.752784236935504e-06, "loss": 1.0746, "step": 263700 }, { "epoch": 8.058899004093602, "grad_norm": 5.88398551940918, "learning_rate": 9.737486231795375e-06, "loss": 1.0877, "step": 263800 }, { "epoch": 8.06195393169182, "grad_norm": 5.979194641113281, "learning_rate": 9.722188226655244e-06, "loss": 1.061, "step": 263900 }, { "epoch": 8.065008859290035, "grad_norm": 7.003350734710693, 
"learning_rate": 9.706890221515116e-06, "loss": 1.2109, "step": 264000 }, { "epoch": 8.068063786888251, "grad_norm": 5.546398162841797, "learning_rate": 9.691592216374985e-06, "loss": 1.3, "step": 264100 }, { "epoch": 8.071118714486467, "grad_norm": 7.2468743324279785, "learning_rate": 9.676294211234856e-06, "loss": 1.1125, "step": 264200 }, { "epoch": 8.074173642084682, "grad_norm": 4.5122575759887695, "learning_rate": 9.660996206094726e-06, "loss": 0.9328, "step": 264300 }, { "epoch": 8.077228569682898, "grad_norm": 5.045060634613037, "learning_rate": 9.645698200954597e-06, "loss": 1.0791, "step": 264400 }, { "epoch": 8.080283497281114, "grad_norm": 6.919245719909668, "learning_rate": 9.630400195814466e-06, "loss": 1.0238, "step": 264500 }, { "epoch": 8.083338424879331, "grad_norm": 2.2628605365753174, "learning_rate": 9.615102190674338e-06, "loss": 0.9938, "step": 264600 }, { "epoch": 8.086393352477547, "grad_norm": 4.537121295928955, "learning_rate": 9.599804185534207e-06, "loss": 1.0183, "step": 264700 }, { "epoch": 8.089448280075763, "grad_norm": 19.846834182739258, "learning_rate": 9.584506180394078e-06, "loss": 1.1355, "step": 264800 }, { "epoch": 8.092503207673978, "grad_norm": 9.19655990600586, "learning_rate": 9.569208175253948e-06, "loss": 1.3802, "step": 264900 }, { "epoch": 8.095558135272194, "grad_norm": 5.800466060638428, "learning_rate": 9.553910170113819e-06, "loss": 1.1175, "step": 265000 }, { "epoch": 8.09861306287041, "grad_norm": 0.6195187568664551, "learning_rate": 9.538612164973688e-06, "loss": 1.0464, "step": 265100 }, { "epoch": 8.101667990468625, "grad_norm": 3.0343387126922607, "learning_rate": 9.52346713988496e-06, "loss": 0.9321, "step": 265200 }, { "epoch": 8.104722918066841, "grad_norm": 19.209545135498047, "learning_rate": 9.508169134744829e-06, "loss": 1.1553, "step": 265300 }, { "epoch": 8.107777845665058, "grad_norm": 3.6193349361419678, "learning_rate": 9.4930241096561e-06, "loss": 1.1593, "step": 265400 }, { "epoch": 
8.110832773263274, "grad_norm": 7.876428127288818, "learning_rate": 9.477726104515972e-06, "loss": 1.1373, "step": 265500 }, { "epoch": 8.11388770086149, "grad_norm": 6.429228782653809, "learning_rate": 9.462428099375841e-06, "loss": 1.0856, "step": 265600 }, { "epoch": 8.116942628459705, "grad_norm": 15.394742012023926, "learning_rate": 9.447130094235712e-06, "loss": 1.1688, "step": 265700 }, { "epoch": 8.119997556057921, "grad_norm": 3.984320878982544, "learning_rate": 9.431832089095582e-06, "loss": 1.2846, "step": 265800 }, { "epoch": 8.123052483656137, "grad_norm": 23.03862762451172, "learning_rate": 9.416534083955453e-06, "loss": 1.3795, "step": 265900 }, { "epoch": 8.126107411254353, "grad_norm": 5.847175598144531, "learning_rate": 9.401236078815322e-06, "loss": 0.988, "step": 266000 }, { "epoch": 8.12916233885257, "grad_norm": 4.800594806671143, "learning_rate": 9.385938073675193e-06, "loss": 1.0304, "step": 266100 }, { "epoch": 8.132217266450786, "grad_norm": 13.866425514221191, "learning_rate": 9.370640068535063e-06, "loss": 1.1209, "step": 266200 }, { "epoch": 8.135272194049001, "grad_norm": 4.881516456604004, "learning_rate": 9.355342063394934e-06, "loss": 1.3845, "step": 266300 }, { "epoch": 8.138327121647217, "grad_norm": 8.563227653503418, "learning_rate": 9.340044058254803e-06, "loss": 1.2194, "step": 266400 }, { "epoch": 8.141382049245433, "grad_norm": 6.669600486755371, "learning_rate": 9.324746053114675e-06, "loss": 0.9601, "step": 266500 }, { "epoch": 8.144436976843648, "grad_norm": 6.998952388763428, "learning_rate": 9.309448047974546e-06, "loss": 1.1143, "step": 266600 }, { "epoch": 8.147491904441864, "grad_norm": 6.582401752471924, "learning_rate": 9.294150042834415e-06, "loss": 0.8779, "step": 266700 }, { "epoch": 8.15054683204008, "grad_norm": 7.145859718322754, "learning_rate": 9.278852037694286e-06, "loss": 1.1478, "step": 266800 }, { "epoch": 8.153601759638297, "grad_norm": 4.021243572235107, "learning_rate": 9.263554032554156e-06, 
"loss": 1.0431, "step": 266900 }, { "epoch": 8.156656687236513, "grad_norm": 8.807994842529297, "learning_rate": 9.248256027414027e-06, "loss": 1.3477, "step": 267000 }, { "epoch": 8.159711614834729, "grad_norm": 14.921202659606934, "learning_rate": 9.232958022273896e-06, "loss": 1.0948, "step": 267100 }, { "epoch": 8.162766542432944, "grad_norm": 12.610527992248535, "learning_rate": 9.217660017133768e-06, "loss": 1.2238, "step": 267200 }, { "epoch": 8.16582147003116, "grad_norm": 6.394305229187012, "learning_rate": 9.202362011993637e-06, "loss": 1.0813, "step": 267300 }, { "epoch": 8.168876397629376, "grad_norm": 3.418588161468506, "learning_rate": 9.187064006853508e-06, "loss": 1.1551, "step": 267400 }, { "epoch": 8.171931325227591, "grad_norm": 10.825216293334961, "learning_rate": 9.171766001713376e-06, "loss": 1.2308, "step": 267500 }, { "epoch": 8.174986252825809, "grad_norm": 14.792623519897461, "learning_rate": 9.156467996573247e-06, "loss": 0.9937, "step": 267600 }, { "epoch": 8.178041180424025, "grad_norm": 4.262557029724121, "learning_rate": 9.141169991433117e-06, "loss": 1.0925, "step": 267700 }, { "epoch": 8.18109610802224, "grad_norm": 7.162014007568359, "learning_rate": 9.125871986292988e-06, "loss": 1.0426, "step": 267800 }, { "epoch": 8.184151035620456, "grad_norm": 4.955055236816406, "learning_rate": 9.110573981152857e-06, "loss": 1.0415, "step": 267900 }, { "epoch": 8.187205963218672, "grad_norm": 8.408870697021484, "learning_rate": 9.095275976012728e-06, "loss": 1.1049, "step": 268000 }, { "epoch": 8.190260890816887, "grad_norm": 3.665238857269287, "learning_rate": 9.079977970872598e-06, "loss": 1.0314, "step": 268100 }, { "epoch": 8.193315818415103, "grad_norm": 11.393309593200684, "learning_rate": 9.064679965732469e-06, "loss": 1.388, "step": 268200 }, { "epoch": 8.196370746013319, "grad_norm": 3.749567747116089, "learning_rate": 9.04953494064374e-06, "loss": 1.1032, "step": 268300 }, { "epoch": 8.199425673611536, "grad_norm": 
7.865976333618164, "learning_rate": 9.034236935503612e-06, "loss": 1.0042, "step": 268400 }, { "epoch": 8.202480601209752, "grad_norm": 7.517879962921143, "learning_rate": 9.018938930363481e-06, "loss": 1.0302, "step": 268500 }, { "epoch": 8.205535528807967, "grad_norm": 11.752159118652344, "learning_rate": 9.003640925223352e-06, "loss": 1.0179, "step": 268600 }, { "epoch": 8.208590456406183, "grad_norm": 19.774866104125977, "learning_rate": 8.988342920083222e-06, "loss": 1.079, "step": 268700 }, { "epoch": 8.211645384004399, "grad_norm": 4.879484176635742, "learning_rate": 8.973044914943093e-06, "loss": 1.2598, "step": 268800 }, { "epoch": 8.214700311602614, "grad_norm": 16.126920700073242, "learning_rate": 8.957746909802962e-06, "loss": 1.0387, "step": 268900 }, { "epoch": 8.21775523920083, "grad_norm": 8.293771743774414, "learning_rate": 8.942448904662833e-06, "loss": 0.9176, "step": 269000 }, { "epoch": 8.220810166799048, "grad_norm": 15.987733840942383, "learning_rate": 8.927150899522703e-06, "loss": 1.2638, "step": 269100 }, { "epoch": 8.223865094397263, "grad_norm": 0.636125385761261, "learning_rate": 8.911852894382574e-06, "loss": 1.1503, "step": 269200 }, { "epoch": 8.226920021995479, "grad_norm": 10.927716255187988, "learning_rate": 8.896554889242444e-06, "loss": 1.2616, "step": 269300 }, { "epoch": 8.229974949593695, "grad_norm": 5.758749961853027, "learning_rate": 8.881256884102315e-06, "loss": 1.031, "step": 269400 }, { "epoch": 8.23302987719191, "grad_norm": 3.450516700744629, "learning_rate": 8.865958878962184e-06, "loss": 1.3088, "step": 269500 }, { "epoch": 8.236084804790126, "grad_norm": 3.5513436794281006, "learning_rate": 8.850660873822055e-06, "loss": 1.1002, "step": 269600 }, { "epoch": 8.239139732388342, "grad_norm": 2.624598979949951, "learning_rate": 8.835362868681925e-06, "loss": 1.2626, "step": 269700 }, { "epoch": 8.24219465998656, "grad_norm": 14.29702091217041, "learning_rate": 8.820064863541796e-06, "loss": 0.9916, "step": 269800 }, { 
"epoch": 8.245249587584775, "grad_norm": 8.308228492736816, "learning_rate": 8.804766858401665e-06, "loss": 1.0105, "step": 269900 }, { "epoch": 8.24830451518299, "grad_norm": 16.899057388305664, "learning_rate": 8.789468853261537e-06, "loss": 1.0939, "step": 270000 }, { "epoch": 8.251359442781206, "grad_norm": 1.783301591873169, "learning_rate": 8.774170848121406e-06, "loss": 1.0611, "step": 270100 }, { "epoch": 8.254414370379422, "grad_norm": 9.797531127929688, "learning_rate": 8.758872842981275e-06, "loss": 1.3537, "step": 270200 }, { "epoch": 8.257469297977638, "grad_norm": 11.86088752746582, "learning_rate": 8.743574837841145e-06, "loss": 1.1028, "step": 270300 }, { "epoch": 8.260524225575853, "grad_norm": 9.216573715209961, "learning_rate": 8.728429812752418e-06, "loss": 1.0754, "step": 270400 }, { "epoch": 8.263579153174069, "grad_norm": 6.141629219055176, "learning_rate": 8.713131807612287e-06, "loss": 0.928, "step": 270500 }, { "epoch": 8.266634080772286, "grad_norm": 2.7297372817993164, "learning_rate": 8.697833802472159e-06, "loss": 1.2339, "step": 270600 }, { "epoch": 8.269689008370502, "grad_norm": 7.393041610717773, "learning_rate": 8.682535797332028e-06, "loss": 0.9285, "step": 270700 }, { "epoch": 8.272743935968718, "grad_norm": 4.140475273132324, "learning_rate": 8.6672377921919e-06, "loss": 0.9032, "step": 270800 }, { "epoch": 8.275798863566934, "grad_norm": 7.7887773513793945, "learning_rate": 8.651939787051769e-06, "loss": 0.9346, "step": 270900 }, { "epoch": 8.27885379116515, "grad_norm": 5.817099571228027, "learning_rate": 8.63664178191164e-06, "loss": 1.2652, "step": 271000 }, { "epoch": 8.281908718763365, "grad_norm": 6.46265983581543, "learning_rate": 8.62134377677151e-06, "loss": 1.1182, "step": 271100 }, { "epoch": 8.28496364636158, "grad_norm": 8.130231857299805, "learning_rate": 8.60604577163138e-06, "loss": 1.213, "step": 271200 }, { "epoch": 8.288018573959796, "grad_norm": 5.918152332305908, "learning_rate": 8.59074776649125e-06, 
"loss": 1.1042, "step": 271300 }, { "epoch": 8.291073501558014, "grad_norm": 7.691157341003418, "learning_rate": 8.575449761351121e-06, "loss": 1.1968, "step": 271400 }, { "epoch": 8.29412842915623, "grad_norm": 5.404082298278809, "learning_rate": 8.56015175621099e-06, "loss": 0.9836, "step": 271500 }, { "epoch": 8.297183356754445, "grad_norm": 6.673370361328125, "learning_rate": 8.544853751070862e-06, "loss": 0.969, "step": 271600 }, { "epoch": 8.30023828435266, "grad_norm": 1.4831641912460327, "learning_rate": 8.529555745930731e-06, "loss": 1.2201, "step": 271700 }, { "epoch": 8.303293211950876, "grad_norm": 9.45586109161377, "learning_rate": 8.514257740790602e-06, "loss": 0.9741, "step": 271800 }, { "epoch": 8.306348139549092, "grad_norm": 3.7599129676818848, "learning_rate": 8.498959735650472e-06, "loss": 1.2079, "step": 271900 }, { "epoch": 8.309403067147308, "grad_norm": 2.7845051288604736, "learning_rate": 8.483661730510343e-06, "loss": 1.0476, "step": 272000 }, { "epoch": 8.312457994745525, "grad_norm": 9.171859741210938, "learning_rate": 8.468363725370212e-06, "loss": 1.0662, "step": 272100 }, { "epoch": 8.315512922343741, "grad_norm": 3.499755859375, "learning_rate": 8.453065720230084e-06, "loss": 1.2587, "step": 272200 }, { "epoch": 8.318567849941957, "grad_norm": 23.11410140991211, "learning_rate": 8.437767715089953e-06, "loss": 0.9943, "step": 272300 }, { "epoch": 8.321622777540172, "grad_norm": 5.886302471160889, "learning_rate": 8.422622690001224e-06, "loss": 1.2263, "step": 272400 }, { "epoch": 8.324677705138388, "grad_norm": 30.187957763671875, "learning_rate": 8.407324684861094e-06, "loss": 1.2, "step": 272500 }, { "epoch": 8.327732632736604, "grad_norm": 21.372512817382812, "learning_rate": 8.392026679720965e-06, "loss": 0.9827, "step": 272600 }, { "epoch": 8.33078756033482, "grad_norm": 2.0684807300567627, "learning_rate": 8.376728674580834e-06, "loss": 1.0877, "step": 272700 }, { "epoch": 8.333842487933037, "grad_norm": 4.702449798583984, 
"learning_rate": 8.361430669440706e-06, "loss": 1.0267, "step": 272800 }, { "epoch": 8.336897415531253, "grad_norm": 11.90904712677002, "learning_rate": 8.346132664300575e-06, "loss": 1.1004, "step": 272900 }, { "epoch": 8.339952343129468, "grad_norm": 7.533595561981201, "learning_rate": 8.330834659160446e-06, "loss": 1.0142, "step": 273000 }, { "epoch": 8.343007270727684, "grad_norm": 5.445414066314697, "learning_rate": 8.315536654020316e-06, "loss": 0.9771, "step": 273100 }, { "epoch": 8.3460621983259, "grad_norm": 20.57103729248047, "learning_rate": 8.300238648880187e-06, "loss": 1.1579, "step": 273200 }, { "epoch": 8.349117125924115, "grad_norm": 1.6556389331817627, "learning_rate": 8.284940643740056e-06, "loss": 1.1873, "step": 273300 }, { "epoch": 8.352172053522331, "grad_norm": 8.812055587768555, "learning_rate": 8.269642638599927e-06, "loss": 1.1352, "step": 273400 }, { "epoch": 8.355226981120547, "grad_norm": 2.017615795135498, "learning_rate": 8.254344633459797e-06, "loss": 1.0793, "step": 273500 }, { "epoch": 8.358281908718764, "grad_norm": 15.955278396606445, "learning_rate": 8.239046628319668e-06, "loss": 1.0959, "step": 273600 }, { "epoch": 8.36133683631698, "grad_norm": 6.15826416015625, "learning_rate": 8.223748623179538e-06, "loss": 0.9902, "step": 273700 }, { "epoch": 8.364391763915195, "grad_norm": 3.7233998775482178, "learning_rate": 8.208450618039409e-06, "loss": 0.9586, "step": 273800 }, { "epoch": 8.367446691513411, "grad_norm": 13.683627128601074, "learning_rate": 8.193152612899278e-06, "loss": 0.9191, "step": 273900 }, { "epoch": 8.370501619111627, "grad_norm": 8.237982749938965, "learning_rate": 8.17785460775915e-06, "loss": 1.2411, "step": 274000 }, { "epoch": 8.373556546709843, "grad_norm": 20.673309326171875, "learning_rate": 8.162556602619019e-06, "loss": 0.9853, "step": 274100 }, { "epoch": 8.376611474308058, "grad_norm": 3.522181749343872, "learning_rate": 8.14725859747889e-06, "loss": 1.0354, "step": 274200 }, { "epoch": 
8.379666401906276, "grad_norm": 3.1122303009033203, "learning_rate": 8.13196059233876e-06, "loss": 1.2159, "step": 274300 }, { "epoch": 8.382721329504491, "grad_norm": 4.235086917877197, "learning_rate": 8.11666258719863e-06, "loss": 1.2349, "step": 274400 }, { "epoch": 8.385776257102707, "grad_norm": 9.490307807922363, "learning_rate": 8.101517562109902e-06, "loss": 1.3173, "step": 274500 }, { "epoch": 8.388831184700923, "grad_norm": 9.003594398498535, "learning_rate": 8.086219556969771e-06, "loss": 1.2762, "step": 274600 }, { "epoch": 8.391886112299138, "grad_norm": 23.088111877441406, "learning_rate": 8.070921551829643e-06, "loss": 1.2825, "step": 274700 }, { "epoch": 8.394941039897354, "grad_norm": 4.005666732788086, "learning_rate": 8.055623546689512e-06, "loss": 1.1964, "step": 274800 }, { "epoch": 8.39799596749557, "grad_norm": 8.891443252563477, "learning_rate": 8.040325541549381e-06, "loss": 1.0961, "step": 274900 }, { "epoch": 8.401050895093785, "grad_norm": 2.082915782928467, "learning_rate": 8.025027536409253e-06, "loss": 0.9843, "step": 275000 }, { "epoch": 8.404105822692003, "grad_norm": 3.974708080291748, "learning_rate": 8.009729531269122e-06, "loss": 1.0311, "step": 275100 }, { "epoch": 8.407160750290219, "grad_norm": 4.6099982261657715, "learning_rate": 7.994431526128993e-06, "loss": 1.0286, "step": 275200 }, { "epoch": 8.410215677888434, "grad_norm": 8.777955055236816, "learning_rate": 7.979133520988863e-06, "loss": 1.0127, "step": 275300 }, { "epoch": 8.41327060548665, "grad_norm": 9.076045036315918, "learning_rate": 7.963835515848734e-06, "loss": 1.284, "step": 275400 }, { "epoch": 8.416325533084866, "grad_norm": 7.679018974304199, "learning_rate": 7.948537510708603e-06, "loss": 1.2001, "step": 275500 }, { "epoch": 8.419380460683081, "grad_norm": 8.124194145202637, "learning_rate": 7.933239505568474e-06, "loss": 1.0003, "step": 275600 }, { "epoch": 8.422435388281297, "grad_norm": 5.4354729652404785, "learning_rate": 7.917941500428344e-06, 
"loss": 1.1908, "step": 275700 }, { "epoch": 8.425490315879514, "grad_norm": 8.532306671142578, "learning_rate": 7.902643495288215e-06, "loss": 0.9961, "step": 275800 }, { "epoch": 8.42854524347773, "grad_norm": 9.226902961730957, "learning_rate": 7.887345490148085e-06, "loss": 1.1599, "step": 275900 }, { "epoch": 8.431600171075946, "grad_norm": 16.286670684814453, "learning_rate": 7.872047485007956e-06, "loss": 1.1251, "step": 276000 }, { "epoch": 8.434655098674162, "grad_norm": 11.915355682373047, "learning_rate": 7.856749479867825e-06, "loss": 1.2525, "step": 276100 }, { "epoch": 8.437710026272377, "grad_norm": 2.382817506790161, "learning_rate": 7.841451474727696e-06, "loss": 1.1368, "step": 276200 }, { "epoch": 8.440764953870593, "grad_norm": 6.8670854568481445, "learning_rate": 7.826153469587566e-06, "loss": 1.1651, "step": 276300 }, { "epoch": 8.443819881468809, "grad_norm": 5.160354137420654, "learning_rate": 7.810855464447437e-06, "loss": 1.1812, "step": 276400 }, { "epoch": 8.446874809067024, "grad_norm": 6.945872783660889, "learning_rate": 7.795557459307306e-06, "loss": 1.1228, "step": 276500 }, { "epoch": 8.449929736665242, "grad_norm": 5.746914863586426, "learning_rate": 7.780259454167178e-06, "loss": 1.4192, "step": 276600 }, { "epoch": 8.452984664263457, "grad_norm": 12.10163402557373, "learning_rate": 7.764961449027047e-06, "loss": 1.0688, "step": 276700 }, { "epoch": 8.456039591861673, "grad_norm": 10.079050064086914, "learning_rate": 7.749663443886918e-06, "loss": 1.3006, "step": 276800 }, { "epoch": 8.459094519459889, "grad_norm": 3.420541524887085, "learning_rate": 7.734365438746788e-06, "loss": 1.1754, "step": 276900 }, { "epoch": 8.462149447058104, "grad_norm": 9.938535690307617, "learning_rate": 7.719067433606659e-06, "loss": 1.1401, "step": 277000 }, { "epoch": 8.46520437465632, "grad_norm": 5.8871378898620605, "learning_rate": 7.703769428466528e-06, "loss": 1.1469, "step": 277100 }, { "epoch": 8.468259302254536, "grad_norm": 
2.248385190963745, "learning_rate": 7.6884714233264e-06, "loss": 1.0529, "step": 277200 }, { "epoch": 8.471314229852753, "grad_norm": 3.6512601375579834, "learning_rate": 7.673173418186269e-06, "loss": 0.9395, "step": 277300 }, { "epoch": 8.474369157450969, "grad_norm": 17.915828704833984, "learning_rate": 7.65787541304614e-06, "loss": 0.9789, "step": 277400 }, { "epoch": 8.477424085049185, "grad_norm": 11.305119514465332, "learning_rate": 7.64257740790601e-06, "loss": 1.3882, "step": 277500 }, { "epoch": 8.4804790126474, "grad_norm": 21.870290756225586, "learning_rate": 7.627432382817282e-06, "loss": 1.1529, "step": 277600 }, { "epoch": 8.483533940245616, "grad_norm": 1.5844974517822266, "learning_rate": 7.61213437767715e-06, "loss": 1.2412, "step": 277700 }, { "epoch": 8.486588867843832, "grad_norm": 10.673868179321289, "learning_rate": 7.596836372537021e-06, "loss": 1.2452, "step": 277800 }, { "epoch": 8.489643795442047, "grad_norm": 5.95991849899292, "learning_rate": 7.581538367396891e-06, "loss": 1.2224, "step": 277900 }, { "epoch": 8.492698723040263, "grad_norm": 2.6692888736724854, "learning_rate": 7.566240362256761e-06, "loss": 0.9876, "step": 278000 }, { "epoch": 8.49575365063848, "grad_norm": 5.3354363441467285, "learning_rate": 7.5509423571166316e-06, "loss": 1.0738, "step": 278100 }, { "epoch": 8.498808578236696, "grad_norm": 10.510762214660645, "learning_rate": 7.535644351976502e-06, "loss": 1.056, "step": 278200 }, { "epoch": 8.501863505834912, "grad_norm": 3.5407023429870605, "learning_rate": 7.520346346836372e-06, "loss": 1.1578, "step": 278300 }, { "epoch": 8.504918433433128, "grad_norm": 17.953824996948242, "learning_rate": 7.5050483416962425e-06, "loss": 0.9995, "step": 278400 }, { "epoch": 8.507973361031343, "grad_norm": 8.821012496948242, "learning_rate": 7.489750336556113e-06, "loss": 1.2082, "step": 278500 }, { "epoch": 8.511028288629559, "grad_norm": 11.681960105895996, "learning_rate": 7.474452331415983e-06, "loss": 1.2717, "step": 278600 
}, { "epoch": 8.514083216227775, "grad_norm": 8.691248893737793, "learning_rate": 7.4591543262758534e-06, "loss": 1.3788, "step": 278700 }, { "epoch": 8.517138143825992, "grad_norm": 7.976315498352051, "learning_rate": 7.443856321135724e-06, "loss": 1.043, "step": 278800 }, { "epoch": 8.520193071424208, "grad_norm": 16.34596061706543, "learning_rate": 7.428558315995594e-06, "loss": 1.0461, "step": 278900 }, { "epoch": 8.523247999022423, "grad_norm": 4.934373378753662, "learning_rate": 7.413260310855464e-06, "loss": 0.9906, "step": 279000 }, { "epoch": 8.52630292662064, "grad_norm": 10.014142990112305, "learning_rate": 7.397962305715335e-06, "loss": 1.0192, "step": 279100 }, { "epoch": 8.529357854218855, "grad_norm": 2.8271279335021973, "learning_rate": 7.382664300575205e-06, "loss": 0.9847, "step": 279200 }, { "epoch": 8.53241278181707, "grad_norm": 3.4308598041534424, "learning_rate": 7.367366295435075e-06, "loss": 1.1499, "step": 279300 }, { "epoch": 8.535467709415286, "grad_norm": 4.892797946929932, "learning_rate": 7.352068290294946e-06, "loss": 1.1534, "step": 279400 }, { "epoch": 8.538522637013504, "grad_norm": 6.5739922523498535, "learning_rate": 7.336770285154816e-06, "loss": 1.0535, "step": 279500 }, { "epoch": 8.54157756461172, "grad_norm": 16.024093627929688, "learning_rate": 7.321472280014686e-06, "loss": 1.2655, "step": 279600 }, { "epoch": 8.544632492209935, "grad_norm": 4.712281703948975, "learning_rate": 7.3061742748745565e-06, "loss": 1.0642, "step": 279700 }, { "epoch": 8.54768741980815, "grad_norm": 2.757760524749756, "learning_rate": 7.290876269734427e-06, "loss": 1.302, "step": 279800 }, { "epoch": 8.550742347406366, "grad_norm": 6.04436731338501, "learning_rate": 7.275578264594297e-06, "loss": 1.0924, "step": 279900 }, { "epoch": 8.553797275004582, "grad_norm": 4.666031837463379, "learning_rate": 7.2602802594541675e-06, "loss": 1.2949, "step": 280000 }, { "epoch": 8.556852202602798, "grad_norm": 4.954720497131348, "learning_rate": 
7.244982254314038e-06, "loss": 1.1814, "step": 280100 }, { "epoch": 8.559907130201013, "grad_norm": 11.050684928894043, "learning_rate": 7.229684249173908e-06, "loss": 1.175, "step": 280200 }, { "epoch": 8.562962057799231, "grad_norm": 24.601699829101562, "learning_rate": 7.21453922408518e-06, "loss": 1.2731, "step": 280300 }, { "epoch": 8.566016985397447, "grad_norm": 8.36152172088623, "learning_rate": 7.199241218945051e-06, "loss": 1.2436, "step": 280400 }, { "epoch": 8.569071912995662, "grad_norm": 6.849078178405762, "learning_rate": 7.183943213804919e-06, "loss": 1.0235, "step": 280500 }, { "epoch": 8.572126840593878, "grad_norm": 5.327115058898926, "learning_rate": 7.1686452086647895e-06, "loss": 0.8993, "step": 280600 }, { "epoch": 8.575181768192094, "grad_norm": 5.185020446777344, "learning_rate": 7.15334720352466e-06, "loss": 0.9513, "step": 280700 }, { "epoch": 8.57823669579031, "grad_norm": 5.5773444175720215, "learning_rate": 7.13804919838453e-06, "loss": 0.9164, "step": 280800 }, { "epoch": 8.581291623388525, "grad_norm": 6.858551025390625, "learning_rate": 7.1227511932444004e-06, "loss": 0.9887, "step": 280900 }, { "epoch": 8.58434655098674, "grad_norm": 7.5612616539001465, "learning_rate": 7.107453188104271e-06, "loss": 1.022, "step": 281000 }, { "epoch": 8.587401478584958, "grad_norm": 10.310321807861328, "learning_rate": 7.092155182964141e-06, "loss": 1.2092, "step": 281100 }, { "epoch": 8.590456406183174, "grad_norm": 1.7474365234375, "learning_rate": 7.076857177824011e-06, "loss": 0.9727, "step": 281200 }, { "epoch": 8.59351133378139, "grad_norm": 8.458216667175293, "learning_rate": 7.061559172683882e-06, "loss": 1.3668, "step": 281300 }, { "epoch": 8.596566261379605, "grad_norm": 8.884044647216797, "learning_rate": 7.046261167543752e-06, "loss": 1.0134, "step": 281400 }, { "epoch": 8.599621188977821, "grad_norm": 8.570650100708008, "learning_rate": 7.030963162403622e-06, "loss": 1.2935, "step": 281500 }, { "epoch": 8.602676116576037, "grad_norm": 
2.1662871837615967, "learning_rate": 7.015665157263493e-06, "loss": 1.5576, "step": 281600 }, { "epoch": 8.605731044174252, "grad_norm": 2.5884835720062256, "learning_rate": 7.000367152123363e-06, "loss": 1.1192, "step": 281700 }, { "epoch": 8.60878597177247, "grad_norm": 3.6408133506774902, "learning_rate": 6.985069146983233e-06, "loss": 1.1488, "step": 281800 }, { "epoch": 8.611840899370685, "grad_norm": 11.819531440734863, "learning_rate": 6.9697711418431035e-06, "loss": 1.1942, "step": 281900 }, { "epoch": 8.614895826968901, "grad_norm": 3.195770263671875, "learning_rate": 6.954473136702974e-06, "loss": 0.9802, "step": 282000 }, { "epoch": 8.617950754567117, "grad_norm": 1.4171463251113892, "learning_rate": 6.939175131562844e-06, "loss": 1.0642, "step": 282100 }, { "epoch": 8.621005682165332, "grad_norm": 5.6690802574157715, "learning_rate": 6.9238771264227145e-06, "loss": 1.0642, "step": 282200 }, { "epoch": 8.624060609763548, "grad_norm": 13.222604751586914, "learning_rate": 6.908579121282585e-06, "loss": 1.0442, "step": 282300 }, { "epoch": 8.627115537361764, "grad_norm": 9.634571075439453, "learning_rate": 6.893281116142455e-06, "loss": 1.315, "step": 282400 }, { "epoch": 8.630170464959981, "grad_norm": 10.725183486938477, "learning_rate": 6.877983111002325e-06, "loss": 1.2638, "step": 282500 }, { "epoch": 8.633225392558197, "grad_norm": 12.106181144714355, "learning_rate": 6.862685105862196e-06, "loss": 1.2009, "step": 282600 }, { "epoch": 8.636280320156413, "grad_norm": 8.581542015075684, "learning_rate": 6.847387100722066e-06, "loss": 0.9775, "step": 282700 }, { "epoch": 8.639335247754628, "grad_norm": 7.633388996124268, "learning_rate": 6.832089095581936e-06, "loss": 1.227, "step": 282800 }, { "epoch": 8.642390175352844, "grad_norm": 5.2652130126953125, "learning_rate": 6.816791090441807e-06, "loss": 1.2009, "step": 282900 }, { "epoch": 8.64544510295106, "grad_norm": 2.624347448348999, "learning_rate": 6.801493085301677e-06, "loss": 1.2672, "step": 
283000 }, { "epoch": 8.648500030549275, "grad_norm": 4.857085227966309, "learning_rate": 6.786195080161547e-06, "loss": 1.18, "step": 283100 }, { "epoch": 8.651554958147491, "grad_norm": 4.37367582321167, "learning_rate": 6.7708970750214176e-06, "loss": 1.0433, "step": 283200 }, { "epoch": 8.654609885745709, "grad_norm": 8.585063934326172, "learning_rate": 6.755599069881288e-06, "loss": 1.3093, "step": 283300 }, { "epoch": 8.657664813343924, "grad_norm": 9.964705467224121, "learning_rate": 6.740301064741158e-06, "loss": 1.3228, "step": 283400 }, { "epoch": 8.66071974094214, "grad_norm": 7.176522731781006, "learning_rate": 6.7250030596010285e-06, "loss": 1.131, "step": 283500 }, { "epoch": 8.663774668540356, "grad_norm": 7.689813613891602, "learning_rate": 6.709705054460899e-06, "loss": 1.1918, "step": 283600 }, { "epoch": 8.666829596138571, "grad_norm": 6.8202714920043945, "learning_rate": 6.694407049320769e-06, "loss": 0.9765, "step": 283700 }, { "epoch": 8.669884523736787, "grad_norm": 6.130142688751221, "learning_rate": 6.6791090441806394e-06, "loss": 1.1884, "step": 283800 }, { "epoch": 8.672939451335003, "grad_norm": 7.813923358917236, "learning_rate": 6.66381103904051e-06, "loss": 0.9992, "step": 283900 }, { "epoch": 8.675994378933218, "grad_norm": 3.493478775024414, "learning_rate": 6.64866601395178e-06, "loss": 1.1072, "step": 284000 }, { "epoch": 8.679049306531436, "grad_norm": 9.310433387756348, "learning_rate": 6.6333680088116506e-06, "loss": 1.118, "step": 284100 }, { "epoch": 8.682104234129651, "grad_norm": 20.439428329467773, "learning_rate": 6.618070003671521e-06, "loss": 0.9944, "step": 284200 }, { "epoch": 8.685159161727867, "grad_norm": 6.449985504150391, "learning_rate": 6.602771998531391e-06, "loss": 1.0403, "step": 284300 }, { "epoch": 8.688214089326083, "grad_norm": 1.44782292842865, "learning_rate": 6.5874739933912615e-06, "loss": 1.0567, "step": 284400 }, { "epoch": 8.691269016924299, "grad_norm": 9.718864440917969, "learning_rate": 
6.572175988251132e-06, "loss": 0.9943, "step": 284500 }, { "epoch": 8.694323944522514, "grad_norm": 3.4997923374176025, "learning_rate": 6.556877983111002e-06, "loss": 1.2147, "step": 284600 }, { "epoch": 8.69737887212073, "grad_norm": 3.804351568222046, "learning_rate": 6.541579977970872e-06, "loss": 0.8873, "step": 284700 }, { "epoch": 8.700433799718947, "grad_norm": 8.257146835327148, "learning_rate": 6.526281972830743e-06, "loss": 1.3033, "step": 284800 }, { "epoch": 8.703488727317163, "grad_norm": 5.3684821128845215, "learning_rate": 6.510983967690613e-06, "loss": 0.9764, "step": 284900 }, { "epoch": 8.706543654915379, "grad_norm": 4.762368202209473, "learning_rate": 6.495685962550483e-06, "loss": 1.0703, "step": 285000 }, { "epoch": 8.709598582513594, "grad_norm": 4.3477678298950195, "learning_rate": 6.480387957410354e-06, "loss": 1.1059, "step": 285100 }, { "epoch": 8.71265351011181, "grad_norm": 10.474966049194336, "learning_rate": 6.465089952270224e-06, "loss": 1.0836, "step": 285200 }, { "epoch": 8.715708437710026, "grad_norm": 5.3630194664001465, "learning_rate": 6.449791947130094e-06, "loss": 1.01, "step": 285300 }, { "epoch": 8.718763365308241, "grad_norm": 2.3403940200805664, "learning_rate": 6.434493941989965e-06, "loss": 1.0378, "step": 285400 }, { "epoch": 8.721818292906459, "grad_norm": 5.546512603759766, "learning_rate": 6.419195936849835e-06, "loss": 1.1563, "step": 285500 }, { "epoch": 8.724873220504675, "grad_norm": 39.056983947753906, "learning_rate": 6.403897931709705e-06, "loss": 1.2946, "step": 285600 }, { "epoch": 8.72792814810289, "grad_norm": 10.053462028503418, "learning_rate": 6.3885999265695755e-06, "loss": 1.1364, "step": 285700 }, { "epoch": 8.730983075701106, "grad_norm": 2.9167656898498535, "learning_rate": 6.373301921429446e-06, "loss": 1.3461, "step": 285800 }, { "epoch": 8.734038003299322, "grad_norm": 4.460560321807861, "learning_rate": 6.358003916289316e-06, "loss": 1.0223, "step": 285900 }, { "epoch": 8.737092930897537, 
"grad_norm": 7.659598350524902, "learning_rate": 6.3428588912005875e-06, "loss": 1.1359, "step": 286000 }, { "epoch": 8.740147858495753, "grad_norm": 8.358567237854004, "learning_rate": 6.327560886060458e-06, "loss": 1.039, "step": 286100 }, { "epoch": 8.74320278609397, "grad_norm": 7.288459300994873, "learning_rate": 6.312262880920328e-06, "loss": 1.3169, "step": 286200 }, { "epoch": 8.746257713692186, "grad_norm": 7.670612812042236, "learning_rate": 6.2969648757801984e-06, "loss": 1.0615, "step": 286300 }, { "epoch": 8.749312641290402, "grad_norm": 4.944478988647461, "learning_rate": 6.281666870640069e-06, "loss": 0.9396, "step": 286400 }, { "epoch": 8.752367568888618, "grad_norm": 14.047966957092285, "learning_rate": 6.266368865499939e-06, "loss": 1.0738, "step": 286500 }, { "epoch": 8.755422496486833, "grad_norm": 2.8102049827575684, "learning_rate": 6.251070860359809e-06, "loss": 1.0472, "step": 286600 }, { "epoch": 8.758477424085049, "grad_norm": 8.91782283782959, "learning_rate": 6.23577285521968e-06, "loss": 1.2268, "step": 286700 }, { "epoch": 8.761532351683265, "grad_norm": 13.246162414550781, "learning_rate": 6.22047485007955e-06, "loss": 1.2976, "step": 286800 }, { "epoch": 8.76458727928148, "grad_norm": 3.5406272411346436, "learning_rate": 6.20517684493942e-06, "loss": 1.4642, "step": 286900 }, { "epoch": 8.767642206879698, "grad_norm": 0.011519878171384335, "learning_rate": 6.189878839799291e-06, "loss": 1.1179, "step": 287000 }, { "epoch": 8.770697134477913, "grad_norm": 3.2644083499908447, "learning_rate": 6.174580834659161e-06, "loss": 1.1199, "step": 287100 }, { "epoch": 8.77375206207613, "grad_norm": 5.073540210723877, "learning_rate": 6.159282829519031e-06, "loss": 1.1659, "step": 287200 }, { "epoch": 8.776806989674345, "grad_norm": 6.0912089347839355, "learning_rate": 6.1439848243789015e-06, "loss": 1.286, "step": 287300 }, { "epoch": 8.77986191727256, "grad_norm": 7.724213600158691, "learning_rate": 6.128686819238772e-06, "loss": 1.0862, 
"step": 287400 }, { "epoch": 8.782916844870776, "grad_norm": 14.391536712646484, "learning_rate": 6.113388814098642e-06, "loss": 1.1989, "step": 287500 }, { "epoch": 8.785971772468992, "grad_norm": 4.909726619720459, "learning_rate": 6.0980908089585124e-06, "loss": 0.9811, "step": 287600 }, { "epoch": 8.789026700067208, "grad_norm": 3.266512870788574, "learning_rate": 6.082792803818383e-06, "loss": 1.0923, "step": 287700 }, { "epoch": 8.792081627665425, "grad_norm": 17.347148895263672, "learning_rate": 6.067494798678253e-06, "loss": 1.1496, "step": 287800 }, { "epoch": 8.79513655526364, "grad_norm": 13.67123031616211, "learning_rate": 6.052196793538123e-06, "loss": 1.0582, "step": 287900 }, { "epoch": 8.798191482861856, "grad_norm": 13.52348804473877, "learning_rate": 6.036898788397994e-06, "loss": 1.0022, "step": 288000 }, { "epoch": 8.801246410460072, "grad_norm": 6.078239440917969, "learning_rate": 6.021600783257864e-06, "loss": 1.2463, "step": 288100 }, { "epoch": 8.804301338058288, "grad_norm": 2.679036855697632, "learning_rate": 6.006302778117734e-06, "loss": 1.0169, "step": 288200 }, { "epoch": 8.807356265656503, "grad_norm": 4.248081684112549, "learning_rate": 5.991004772977605e-06, "loss": 0.9784, "step": 288300 }, { "epoch": 8.810411193254719, "grad_norm": 0.44658416509628296, "learning_rate": 5.975706767837475e-06, "loss": 1.0276, "step": 288400 }, { "epoch": 8.813466120852937, "grad_norm": 3.2473037242889404, "learning_rate": 5.960408762697345e-06, "loss": 0.8594, "step": 288500 }, { "epoch": 8.816521048451152, "grad_norm": 8.046907424926758, "learning_rate": 5.9451107575572155e-06, "loss": 1.0015, "step": 288600 }, { "epoch": 8.819575976049368, "grad_norm": 5.975189208984375, "learning_rate": 5.929812752417085e-06, "loss": 1.2663, "step": 288700 }, { "epoch": 8.822630903647584, "grad_norm": 6.2016215324401855, "learning_rate": 5.914514747276955e-06, "loss": 1.1905, "step": 288800 }, { "epoch": 8.8256858312458, "grad_norm": 5.386969089508057, 
"learning_rate": 5.899216742136826e-06, "loss": 1.1232, "step": 288900 }, { "epoch": 8.828740758844015, "grad_norm": 3.4425082206726074, "learning_rate": 5.883918736996696e-06, "loss": 1.226, "step": 289000 }, { "epoch": 8.83179568644223, "grad_norm": 7.528200626373291, "learning_rate": 5.868620731856566e-06, "loss": 1.1668, "step": 289100 }, { "epoch": 8.834850614040448, "grad_norm": 17.77597427368164, "learning_rate": 5.8533227267164366e-06, "loss": 1.0165, "step": 289200 }, { "epoch": 8.837905541638664, "grad_norm": 9.727446556091309, "learning_rate": 5.838024721576307e-06, "loss": 1.0466, "step": 289300 }, { "epoch": 8.84096046923688, "grad_norm": 4.185564994812012, "learning_rate": 5.822726716436177e-06, "loss": 1.2831, "step": 289400 }, { "epoch": 8.844015396835095, "grad_norm": 13.54357624053955, "learning_rate": 5.8075816913474485e-06, "loss": 1.2904, "step": 289500 }, { "epoch": 8.84707032443331, "grad_norm": 3.099177598953247, "learning_rate": 5.792283686207319e-06, "loss": 1.1769, "step": 289600 }, { "epoch": 8.850125252031527, "grad_norm": 18.087329864501953, "learning_rate": 5.776985681067189e-06, "loss": 1.0588, "step": 289700 }, { "epoch": 8.853180179629742, "grad_norm": 4.369546413421631, "learning_rate": 5.7616876759270595e-06, "loss": 0.9452, "step": 289800 }, { "epoch": 8.856235107227958, "grad_norm": 6.6351447105407715, "learning_rate": 5.74638967078693e-06, "loss": 1.1907, "step": 289900 }, { "epoch": 8.859290034826175, "grad_norm": 13.804483413696289, "learning_rate": 5.7310916656468e-06, "loss": 1.4949, "step": 290000 }, { "epoch": 8.862344962424391, "grad_norm": 14.235151290893555, "learning_rate": 5.71579366050667e-06, "loss": 1.0575, "step": 290100 }, { "epoch": 8.865399890022607, "grad_norm": 5.2210235595703125, "learning_rate": 5.700495655366541e-06, "loss": 1.2209, "step": 290200 }, { "epoch": 8.868454817620822, "grad_norm": 4.419956684112549, "learning_rate": 5.685197650226411e-06, "loss": 1.1516, "step": 290300 }, { "epoch": 
8.871509745219038, "grad_norm": 11.630651473999023, "learning_rate": 5.669899645086281e-06, "loss": 1.1808, "step": 290400 }, { "epoch": 8.874564672817254, "grad_norm": 4.662689685821533, "learning_rate": 5.654601639946152e-06, "loss": 1.5103, "step": 290500 }, { "epoch": 8.87761960041547, "grad_norm": 5.733534812927246, "learning_rate": 5.639303634806022e-06, "loss": 1.1499, "step": 290600 }, { "epoch": 8.880674528013685, "grad_norm": 4.516679763793945, "learning_rate": 5.624005629665892e-06, "loss": 1.0164, "step": 290700 }, { "epoch": 8.883729455611903, "grad_norm": 9.872199058532715, "learning_rate": 5.6087076245257626e-06, "loss": 1.0991, "step": 290800 }, { "epoch": 8.886784383210118, "grad_norm": 5.59382963180542, "learning_rate": 5.593409619385633e-06, "loss": 1.1131, "step": 290900 }, { "epoch": 8.889839310808334, "grad_norm": 4.885242938995361, "learning_rate": 5.578111614245503e-06, "loss": 1.1361, "step": 291000 }, { "epoch": 8.89289423840655, "grad_norm": 9.7994966506958, "learning_rate": 5.5628136091053735e-06, "loss": 1.0083, "step": 291100 }, { "epoch": 8.895949166004765, "grad_norm": 6.640185356140137, "learning_rate": 5.547515603965244e-06, "loss": 1.1433, "step": 291200 }, { "epoch": 8.899004093602981, "grad_norm": 8.89209270477295, "learning_rate": 5.532217598825114e-06, "loss": 0.9466, "step": 291300 }, { "epoch": 8.902059021201197, "grad_norm": 14.11497974395752, "learning_rate": 5.516919593684984e-06, "loss": 1.0704, "step": 291400 }, { "epoch": 8.905113948799414, "grad_norm": 4.109190940856934, "learning_rate": 5.501621588544854e-06, "loss": 1.2348, "step": 291500 }, { "epoch": 8.90816887639763, "grad_norm": 4.684236526489258, "learning_rate": 5.486323583404724e-06, "loss": 1.1173, "step": 291600 }, { "epoch": 8.911223803995846, "grad_norm": 5.410808563232422, "learning_rate": 5.4710255782645945e-06, "loss": 1.1324, "step": 291700 }, { "epoch": 8.914278731594061, "grad_norm": 7.469236373901367, "learning_rate": 5.455727573124465e-06, "loss": 
1.0794, "step": 291800 }, { "epoch": 8.917333659192277, "grad_norm": 7.113474369049072, "learning_rate": 5.440429567984335e-06, "loss": 1.1584, "step": 291900 }, { "epoch": 8.920388586790493, "grad_norm": 7.708322048187256, "learning_rate": 5.4251315628442054e-06, "loss": 1.0252, "step": 292000 }, { "epoch": 8.923443514388708, "grad_norm": 12.872169494628906, "learning_rate": 5.409833557704076e-06, "loss": 1.3283, "step": 292100 }, { "epoch": 8.926498441986926, "grad_norm": 12.395779609680176, "learning_rate": 5.394535552563946e-06, "loss": 1.3083, "step": 292200 }, { "epoch": 8.929553369585141, "grad_norm": 0.006554835010319948, "learning_rate": 5.379237547423816e-06, "loss": 1.2929, "step": 292300 }, { "epoch": 8.932608297183357, "grad_norm": 15.197608947753906, "learning_rate": 5.363939542283687e-06, "loss": 1.0421, "step": 292400 }, { "epoch": 8.935663224781573, "grad_norm": 4.074192047119141, "learning_rate": 5.348641537143557e-06, "loss": 1.1509, "step": 292500 }, { "epoch": 8.938718152379789, "grad_norm": 10.62905502319336, "learning_rate": 5.333496512054828e-06, "loss": 0.9111, "step": 292600 }, { "epoch": 8.941773079978004, "grad_norm": 5.3739118576049805, "learning_rate": 5.318198506914699e-06, "loss": 1.1221, "step": 292700 }, { "epoch": 8.94482800757622, "grad_norm": 7.762158393859863, "learning_rate": 5.302900501774569e-06, "loss": 1.1094, "step": 292800 }, { "epoch": 8.947882935174436, "grad_norm": 2.196645736694336, "learning_rate": 5.287602496634439e-06, "loss": 1.1351, "step": 292900 }, { "epoch": 8.950937862772653, "grad_norm": 2.2527201175689697, "learning_rate": 5.2723044914943096e-06, "loss": 0.873, "step": 293000 }, { "epoch": 8.953992790370869, "grad_norm": 4.806405544281006, "learning_rate": 5.25700648635418e-06, "loss": 1.2991, "step": 293100 }, { "epoch": 8.957047717969084, "grad_norm": 3.205721855163574, "learning_rate": 5.24170848121405e-06, "loss": 1.3393, "step": 293200 }, { "epoch": 8.9601026455673, "grad_norm": 2.415811777114868, 
"learning_rate": 5.2264104760739205e-06, "loss": 1.1942, "step": 293300 }, { "epoch": 8.963157573165516, "grad_norm": 12.766294479370117, "learning_rate": 5.211112470933791e-06, "loss": 1.0837, "step": 293400 }, { "epoch": 8.966212500763731, "grad_norm": 6.507505416870117, "learning_rate": 5.195814465793661e-06, "loss": 1.0663, "step": 293500 }, { "epoch": 8.969267428361947, "grad_norm": 1.3235666751861572, "learning_rate": 5.1805164606535314e-06, "loss": 0.9236, "step": 293600 }, { "epoch": 8.972322355960163, "grad_norm": 3.646758794784546, "learning_rate": 5.165218455513402e-06, "loss": 1.2117, "step": 293700 }, { "epoch": 8.97537728355838, "grad_norm": 5.10009241104126, "learning_rate": 5.149920450373272e-06, "loss": 1.0119, "step": 293800 }, { "epoch": 8.978432211156596, "grad_norm": 4.952513217926025, "learning_rate": 5.134622445233142e-06, "loss": 1.0933, "step": 293900 }, { "epoch": 8.981487138754812, "grad_norm": 3.1457278728485107, "learning_rate": 5.119324440093013e-06, "loss": 1.1274, "step": 294000 }, { "epoch": 8.984542066353027, "grad_norm": 10.905735969543457, "learning_rate": 5.104026434952883e-06, "loss": 1.1653, "step": 294100 }, { "epoch": 8.987596993951243, "grad_norm": 6.256532669067383, "learning_rate": 5.088728429812753e-06, "loss": 1.3448, "step": 294200 }, { "epoch": 8.990651921549459, "grad_norm": 2.832413673400879, "learning_rate": 5.073430424672623e-06, "loss": 1.0589, "step": 294300 }, { "epoch": 8.993706849147674, "grad_norm": 5.614567279815674, "learning_rate": 5.058132419532493e-06, "loss": 1.2088, "step": 294400 }, { "epoch": 8.996761776745892, "grad_norm": 2.566842555999756, "learning_rate": 5.042834414392363e-06, "loss": 1.171, "step": 294500 }, { "epoch": 8.999816704344108, "grad_norm": 3.9589226245880127, "learning_rate": 5.027536409252234e-06, "loss": 1.3417, "step": 294600 }, { "epoch": 9.0, "eval_accuracy": 0.7450357426528992, "eval_loss": 0.6936118602752686, "eval_runtime": 1891.6602, "eval_samples_per_second": 17.304, 
"eval_steps_per_second": 4.326, "step": 294606 }, { "epoch": 9.002871631942323, "grad_norm": 3.6633384227752686, "learning_rate": 5.012238404112104e-06, "loss": 1.1321, "step": 294700 }, { "epoch": 9.005926559540539, "grad_norm": 3.5542895793914795, "learning_rate": 4.996940398971974e-06, "loss": 1.0508, "step": 294800 }, { "epoch": 9.008981487138755, "grad_norm": 14.556180000305176, "learning_rate": 4.981642393831845e-06, "loss": 0.9371, "step": 294900 }, { "epoch": 9.01203641473697, "grad_norm": 7.045968055725098, "learning_rate": 4.966344388691715e-06, "loss": 0.9754, "step": 295000 }, { "epoch": 9.015091342335186, "grad_norm": 9.45659351348877, "learning_rate": 4.951046383551585e-06, "loss": 1.1034, "step": 295100 }, { "epoch": 9.018146269933403, "grad_norm": 0.298971951007843, "learning_rate": 4.9357483784114555e-06, "loss": 1.2968, "step": 295200 }, { "epoch": 9.021201197531619, "grad_norm": 8.019341468811035, "learning_rate": 4.920450373271326e-06, "loss": 0.9928, "step": 295300 }, { "epoch": 9.024256125129835, "grad_norm": 7.284595966339111, "learning_rate": 4.905152368131196e-06, "loss": 1.3505, "step": 295400 }, { "epoch": 9.02731105272805, "grad_norm": 5.352906227111816, "learning_rate": 4.8900073430424675e-06, "loss": 0.9553, "step": 295500 }, { "epoch": 9.030365980326266, "grad_norm": 1.7522146701812744, "learning_rate": 4.874709337902338e-06, "loss": 1.1125, "step": 295600 }, { "epoch": 9.033420907924482, "grad_norm": 3.4803643226623535, "learning_rate": 4.859411332762208e-06, "loss": 1.3101, "step": 295700 }, { "epoch": 9.036475835522698, "grad_norm": 3.04394793510437, "learning_rate": 4.8441133276220784e-06, "loss": 1.0202, "step": 295800 }, { "epoch": 9.039530763120913, "grad_norm": 2.7401368618011475, "learning_rate": 4.828815322481949e-06, "loss": 0.9919, "step": 295900 }, { "epoch": 9.04258569071913, "grad_norm": 8.470032691955566, "learning_rate": 4.813517317341819e-06, "loss": 1.2767, "step": 296000 }, { "epoch": 9.045640618317346, 
"grad_norm": 4.937151908874512, "learning_rate": 4.798219312201689e-06, "loss": 1.078, "step": 296100 }, { "epoch": 9.048695545915562, "grad_norm": 5.262059211730957, "learning_rate": 4.78292130706156e-06, "loss": 1.1927, "step": 296200 }, { "epoch": 9.051750473513778, "grad_norm": 5.695133209228516, "learning_rate": 4.76762330192143e-06, "loss": 0.99, "step": 296300 }, { "epoch": 9.054805401111993, "grad_norm": 6.795228004455566, "learning_rate": 4.7523252967813e-06, "loss": 1.2354, "step": 296400 }, { "epoch": 9.057860328710209, "grad_norm": 3.628720760345459, "learning_rate": 4.737027291641171e-06, "loss": 1.2754, "step": 296500 }, { "epoch": 9.060915256308425, "grad_norm": 13.540319442749023, "learning_rate": 4.721729286501041e-06, "loss": 0.9649, "step": 296600 }, { "epoch": 9.063970183906642, "grad_norm": 6.967558860778809, "learning_rate": 4.706431281360911e-06, "loss": 1.1276, "step": 296700 }, { "epoch": 9.067025111504858, "grad_norm": 8.045135498046875, "learning_rate": 4.6911332762207815e-06, "loss": 1.2847, "step": 296800 }, { "epoch": 9.070080039103074, "grad_norm": 11.819404602050781, "learning_rate": 4.675835271080652e-06, "loss": 1.0835, "step": 296900 }, { "epoch": 9.07313496670129, "grad_norm": 4.425651550292969, "learning_rate": 4.660537265940522e-06, "loss": 1.1954, "step": 297000 }, { "epoch": 9.076189894299505, "grad_norm": 5.550767421722412, "learning_rate": 4.645239260800392e-06, "loss": 1.053, "step": 297100 }, { "epoch": 9.07924482189772, "grad_norm": 3.0203421115875244, "learning_rate": 4.629941255660262e-06, "loss": 1.034, "step": 297200 }, { "epoch": 9.082299749495936, "grad_norm": 17.99982261657715, "learning_rate": 4.614643250520132e-06, "loss": 1.1257, "step": 297300 }, { "epoch": 9.085354677094152, "grad_norm": 12.948797225952148, "learning_rate": 4.5993452453800026e-06, "loss": 1.0862, "step": 297400 }, { "epoch": 9.08840960469237, "grad_norm": 4.156719207763672, "learning_rate": 4.584047240239873e-06, "loss": 1.0115, "step": 
297500 }, { "epoch": 9.091464532290585, "grad_norm": 7.684045791625977, "learning_rate": 4.568749235099743e-06, "loss": 1.1726, "step": 297600 }, { "epoch": 9.0945194598888, "grad_norm": 3.3296148777008057, "learning_rate": 4.5534512299596135e-06, "loss": 1.0634, "step": 297700 }, { "epoch": 9.097574387487017, "grad_norm": 18.09479331970215, "learning_rate": 4.538306204870886e-06, "loss": 1.1221, "step": 297800 }, { "epoch": 9.100629315085232, "grad_norm": 12.1429443359375, "learning_rate": 4.523008199730756e-06, "loss": 1.4943, "step": 297900 }, { "epoch": 9.103684242683448, "grad_norm": 9.859488487243652, "learning_rate": 4.5077101945906255e-06, "loss": 0.9249, "step": 298000 }, { "epoch": 9.106739170281664, "grad_norm": 3.7430107593536377, "learning_rate": 4.492412189450496e-06, "loss": 1.0232, "step": 298100 }, { "epoch": 9.109794097879881, "grad_norm": 4.004197120666504, "learning_rate": 4.477114184310366e-06, "loss": 1.4043, "step": 298200 }, { "epoch": 9.112849025478097, "grad_norm": 8.506260871887207, "learning_rate": 4.461816179170236e-06, "loss": 1.0303, "step": 298300 }, { "epoch": 9.115903953076312, "grad_norm": 6.506143093109131, "learning_rate": 4.446518174030107e-06, "loss": 1.2924, "step": 298400 }, { "epoch": 9.118958880674528, "grad_norm": 3.8566203117370605, "learning_rate": 4.431220168889977e-06, "loss": 1.1308, "step": 298500 }, { "epoch": 9.122013808272744, "grad_norm": 9.278569221496582, "learning_rate": 4.415922163749847e-06, "loss": 1.3331, "step": 298600 }, { "epoch": 9.12506873587096, "grad_norm": 3.4998819828033447, "learning_rate": 4.400624158609718e-06, "loss": 1.1288, "step": 298700 }, { "epoch": 9.128123663469175, "grad_norm": 5.379426956176758, "learning_rate": 4.385326153469588e-06, "loss": 1.1651, "step": 298800 }, { "epoch": 9.131178591067393, "grad_norm": 4.583761215209961, "learning_rate": 4.370028148329458e-06, "loss": 1.0494, "step": 298900 }, { "epoch": 9.134233518665608, "grad_norm": 6.520390033721924, "learning_rate": 
4.3547301431893286e-06, "loss": 1.0415, "step": 299000 }, { "epoch": 9.137288446263824, "grad_norm": 11.750895500183105, "learning_rate": 4.339432138049199e-06, "loss": 0.9235, "step": 299100 }, { "epoch": 9.14034337386204, "grad_norm": 6.232056617736816, "learning_rate": 4.324134132909069e-06, "loss": 0.9507, "step": 299200 }, { "epoch": 9.143398301460255, "grad_norm": 4.731393337249756, "learning_rate": 4.3088361277689395e-06, "loss": 1.1672, "step": 299300 }, { "epoch": 9.146453229058471, "grad_norm": 6.675495624542236, "learning_rate": 4.29353812262881e-06, "loss": 1.4035, "step": 299400 }, { "epoch": 9.149508156656687, "grad_norm": 8.32022476196289, "learning_rate": 4.27824011748868e-06, "loss": 0.8868, "step": 299500 }, { "epoch": 9.152563084254902, "grad_norm": 10.285120964050293, "learning_rate": 4.26294211234855e-06, "loss": 1.1058, "step": 299600 }, { "epoch": 9.15561801185312, "grad_norm": 10.524968147277832, "learning_rate": 4.247644107208421e-06, "loss": 1.064, "step": 299700 }, { "epoch": 9.158672939451336, "grad_norm": 8.351820945739746, "learning_rate": 4.23234610206829e-06, "loss": 1.0773, "step": 299800 }, { "epoch": 9.161727867049551, "grad_norm": 8.429049491882324, "learning_rate": 4.2170480969281605e-06, "loss": 1.3182, "step": 299900 }, { "epoch": 9.164782794647767, "grad_norm": 8.29355525970459, "learning_rate": 4.201750091788031e-06, "loss": 1.1658, "step": 300000 }, { "epoch": 9.167837722245983, "grad_norm": 1.961021900177002, "learning_rate": 4.186452086647901e-06, "loss": 1.1509, "step": 300100 }, { "epoch": 9.170892649844198, "grad_norm": 7.757428169250488, "learning_rate": 4.1711540815077714e-06, "loss": 1.1293, "step": 300200 }, { "epoch": 9.173947577442414, "grad_norm": 3.276634693145752, "learning_rate": 4.155856076367642e-06, "loss": 1.2559, "step": 300300 }, { "epoch": 9.17700250504063, "grad_norm": 3.877615213394165, "learning_rate": 4.140558071227512e-06, "loss": 1.6137, "step": 300400 }, { "epoch": 9.180057432638847, 
"grad_norm": 17.469383239746094, "learning_rate": 4.125413046138784e-06, "loss": 1.1948, "step": 300500 }, { "epoch": 9.183112360237063, "grad_norm": 8.345824241638184, "learning_rate": 4.1101150409986546e-06, "loss": 1.2168, "step": 300600 }, { "epoch": 9.186167287835278, "grad_norm": 2.92280650138855, "learning_rate": 4.094817035858525e-06, "loss": 1.0463, "step": 300700 }, { "epoch": 9.189222215433494, "grad_norm": 1.5842794179916382, "learning_rate": 4.079519030718394e-06, "loss": 1.1139, "step": 300800 }, { "epoch": 9.19227714303171, "grad_norm": 6.409753322601318, "learning_rate": 4.064221025578265e-06, "loss": 1.2472, "step": 300900 }, { "epoch": 9.195332070629926, "grad_norm": 8.010771751403809, "learning_rate": 4.048923020438135e-06, "loss": 1.619, "step": 301000 }, { "epoch": 9.198386998228141, "grad_norm": 4.597021579742432, "learning_rate": 4.033625015298005e-06, "loss": 0.9576, "step": 301100 }, { "epoch": 9.201441925826359, "grad_norm": 10.441827774047852, "learning_rate": 4.0183270101578756e-06, "loss": 1.0894, "step": 301200 }, { "epoch": 9.204496853424574, "grad_norm": 3.910430431365967, "learning_rate": 4.003029005017746e-06, "loss": 1.1124, "step": 301300 }, { "epoch": 9.20755178102279, "grad_norm": 4.471078395843506, "learning_rate": 3.987730999877616e-06, "loss": 1.0833, "step": 301400 }, { "epoch": 9.210606708621006, "grad_norm": 12.565783500671387, "learning_rate": 3.9724329947374865e-06, "loss": 1.3408, "step": 301500 }, { "epoch": 9.213661636219221, "grad_norm": 4.613448619842529, "learning_rate": 3.957134989597357e-06, "loss": 1.0302, "step": 301600 }, { "epoch": 9.216716563817437, "grad_norm": 12.828263282775879, "learning_rate": 3.941836984457227e-06, "loss": 1.2364, "step": 301700 }, { "epoch": 9.219771491415653, "grad_norm": 17.703357696533203, "learning_rate": 3.9265389793170974e-06, "loss": 1.1193, "step": 301800 }, { "epoch": 9.22282641901387, "grad_norm": 4.079858303070068, "learning_rate": 3.911240974176968e-06, "loss": 1.165, 
"step": 301900 }, { "epoch": 9.225881346612086, "grad_norm": 5.560415744781494, "learning_rate": 3.895942969036838e-06, "loss": 1.4166, "step": 302000 }, { "epoch": 9.228936274210302, "grad_norm": 4.343051910400391, "learning_rate": 3.880644963896708e-06, "loss": 1.0804, "step": 302100 }, { "epoch": 9.231991201808517, "grad_norm": 11.356123924255371, "learning_rate": 3.865346958756579e-06, "loss": 0.9958, "step": 302200 }, { "epoch": 9.235046129406733, "grad_norm": 8.211952209472656, "learning_rate": 3.850048953616449e-06, "loss": 0.9514, "step": 302300 }, { "epoch": 9.238101057004949, "grad_norm": 6.394819736480713, "learning_rate": 3.834750948476319e-06, "loss": 0.9585, "step": 302400 }, { "epoch": 9.241155984603164, "grad_norm": 7.153532028198242, "learning_rate": 3.81945294333619e-06, "loss": 1.3609, "step": 302500 }, { "epoch": 9.24421091220138, "grad_norm": 3.2434120178222656, "learning_rate": 3.804154938196059e-06, "loss": 1.1877, "step": 302600 }, { "epoch": 9.247265839799597, "grad_norm": 9.158300399780273, "learning_rate": 3.7888569330559294e-06, "loss": 1.0076, "step": 302700 }, { "epoch": 9.250320767397813, "grad_norm": 7.51426362991333, "learning_rate": 3.7735589279157997e-06, "loss": 1.0217, "step": 302800 }, { "epoch": 9.253375694996029, "grad_norm": 3.0722098350524902, "learning_rate": 3.75826092277567e-06, "loss": 1.121, "step": 302900 }, { "epoch": 9.256430622594245, "grad_norm": 2.5164806842803955, "learning_rate": 3.7431158976869418e-06, "loss": 1.4275, "step": 303000 }, { "epoch": 9.25948555019246, "grad_norm": 8.31301212310791, "learning_rate": 3.727817892546812e-06, "loss": 1.0477, "step": 303100 }, { "epoch": 9.262540477790676, "grad_norm": 2.260798215866089, "learning_rate": 3.7125198874066824e-06, "loss": 1.1562, "step": 303200 }, { "epoch": 9.265595405388892, "grad_norm": 3.4062960147857666, "learning_rate": 3.6972218822665527e-06, "loss": 1.1084, "step": 303300 }, { "epoch": 9.268650332987109, "grad_norm": 6.769488334655762, 
"learning_rate": 3.681923877126423e-06, "loss": 1.0529, "step": 303400 }, { "epoch": 9.271705260585325, "grad_norm": 10.702885627746582, "learning_rate": 3.6666258719862933e-06, "loss": 1.2395, "step": 303500 }, { "epoch": 9.27476018818354, "grad_norm": 5.437123775482178, "learning_rate": 3.651327866846163e-06, "loss": 1.0823, "step": 303600 }, { "epoch": 9.277815115781756, "grad_norm": 8.948333740234375, "learning_rate": 3.6360298617060335e-06, "loss": 1.1325, "step": 303700 }, { "epoch": 9.280870043379972, "grad_norm": 6.607554912567139, "learning_rate": 3.620731856565904e-06, "loss": 1.3233, "step": 303800 }, { "epoch": 9.283924970978187, "grad_norm": 7.056314468383789, "learning_rate": 3.605433851425774e-06, "loss": 1.1637, "step": 303900 }, { "epoch": 9.286979898576403, "grad_norm": 3.6274731159210205, "learning_rate": 3.5901358462856444e-06, "loss": 1.2798, "step": 304000 }, { "epoch": 9.290034826174619, "grad_norm": 3.242490768432617, "learning_rate": 3.5748378411455148e-06, "loss": 1.1217, "step": 304100 }, { "epoch": 9.293089753772836, "grad_norm": 25.473543167114258, "learning_rate": 3.559539836005385e-06, "loss": 1.0237, "step": 304200 }, { "epoch": 9.296144681371052, "grad_norm": 6.082782745361328, "learning_rate": 3.5442418308652554e-06, "loss": 1.3431, "step": 304300 }, { "epoch": 9.299199608969268, "grad_norm": 6.279635429382324, "learning_rate": 3.5289438257251257e-06, "loss": 1.1487, "step": 304400 }, { "epoch": 9.302254536567483, "grad_norm": 6.003155708312988, "learning_rate": 3.513645820584996e-06, "loss": 1.1343, "step": 304500 }, { "epoch": 9.305309464165699, "grad_norm": 4.19450044631958, "learning_rate": 3.4983478154448663e-06, "loss": 1.1376, "step": 304600 }, { "epoch": 9.308364391763915, "grad_norm": 3.5574228763580322, "learning_rate": 3.4830498103047366e-06, "loss": 1.089, "step": 304700 }, { "epoch": 9.31141931936213, "grad_norm": 8.025125503540039, "learning_rate": 3.467751805164607e-06, "loss": 1.0255, "step": 304800 }, { "epoch": 
9.314474246960348, "grad_norm": 3.3775525093078613, "learning_rate": 3.4524538000244772e-06, "loss": 1.0984, "step": 304900 }, { "epoch": 9.317529174558564, "grad_norm": 4.80861234664917, "learning_rate": 3.4371557948843475e-06, "loss": 1.1859, "step": 305000 }, { "epoch": 9.32058410215678, "grad_norm": 16.27497673034668, "learning_rate": 3.421857789744218e-06, "loss": 1.3898, "step": 305100 }, { "epoch": 9.323639029754995, "grad_norm": 21.027254104614258, "learning_rate": 3.406559784604088e-06, "loss": 1.0316, "step": 305200 }, { "epoch": 9.32669395735321, "grad_norm": 4.368397235870361, "learning_rate": 3.3912617794639585e-06, "loss": 1.0431, "step": 305300 }, { "epoch": 9.329748884951426, "grad_norm": 3.5449934005737305, "learning_rate": 3.375963774323828e-06, "loss": 1.0531, "step": 305400 }, { "epoch": 9.332803812549642, "grad_norm": 5.919412612915039, "learning_rate": 3.3606657691836982e-06, "loss": 1.1873, "step": 305500 }, { "epoch": 9.335858740147858, "grad_norm": 2.567575693130493, "learning_rate": 3.3455207440949705e-06, "loss": 1.085, "step": 305600 }, { "epoch": 9.338913667746075, "grad_norm": 12.758691787719727, "learning_rate": 3.3302227389548408e-06, "loss": 1.0725, "step": 305700 }, { "epoch": 9.34196859534429, "grad_norm": 3.886571168899536, "learning_rate": 3.314924733814711e-06, "loss": 1.1712, "step": 305800 }, { "epoch": 9.345023522942506, "grad_norm": 8.74039077758789, "learning_rate": 3.2996267286745814e-06, "loss": 1.5482, "step": 305900 }, { "epoch": 9.348078450540722, "grad_norm": 5.288308143615723, "learning_rate": 3.2843287235344517e-06, "loss": 1.2712, "step": 306000 }, { "epoch": 9.351133378138938, "grad_norm": 9.838608741760254, "learning_rate": 3.269030718394322e-06, "loss": 1.143, "step": 306100 }, { "epoch": 9.354188305737154, "grad_norm": 5.224956035614014, "learning_rate": 3.2537327132541923e-06, "loss": 1.01, "step": 306200 }, { "epoch": 9.35724323333537, "grad_norm": 5.820688724517822, "learning_rate": 3.2384347081140618e-06, 
"loss": 1.1268, "step": 306300 }, { "epoch": 9.360298160933587, "grad_norm": 2.071725368499756, "learning_rate": 3.223136702973932e-06, "loss": 0.8517, "step": 306400 }, { "epoch": 9.363353088531802, "grad_norm": 17.318157196044922, "learning_rate": 3.2078386978338024e-06, "loss": 1.4184, "step": 306500 }, { "epoch": 9.366408016130018, "grad_norm": 8.712560653686523, "learning_rate": 3.1925406926936727e-06, "loss": 1.0402, "step": 306600 }, { "epoch": 9.369462943728234, "grad_norm": 9.98942756652832, "learning_rate": 3.177242687553543e-06, "loss": 1.0045, "step": 306700 }, { "epoch": 9.37251787132645, "grad_norm": 42.62532043457031, "learning_rate": 3.1619446824134133e-06, "loss": 1.188, "step": 306800 }, { "epoch": 9.375572798924665, "grad_norm": 7.075395584106445, "learning_rate": 3.1466466772732836e-06, "loss": 1.1637, "step": 306900 }, { "epoch": 9.37862772652288, "grad_norm": 3.448315382003784, "learning_rate": 3.131348672133154e-06, "loss": 0.9102, "step": 307000 }, { "epoch": 9.381682654121096, "grad_norm": 10.89814281463623, "learning_rate": 3.1160506669930243e-06, "loss": 1.0922, "step": 307100 }, { "epoch": 9.384737581719314, "grad_norm": 25.21414566040039, "learning_rate": 3.1007526618528946e-06, "loss": 1.1627, "step": 307200 }, { "epoch": 9.38779250931753, "grad_norm": 9.04411506652832, "learning_rate": 3.085454656712765e-06, "loss": 1.1222, "step": 307300 }, { "epoch": 9.390847436915745, "grad_norm": 7.841292381286621, "learning_rate": 3.070156651572635e-06, "loss": 0.9225, "step": 307400 }, { "epoch": 9.393902364513961, "grad_norm": 9.054701805114746, "learning_rate": 3.0548586464325055e-06, "loss": 1.1585, "step": 307500 }, { "epoch": 9.396957292112177, "grad_norm": 4.471860408782959, "learning_rate": 3.039560641292376e-06, "loss": 1.3753, "step": 307600 }, { "epoch": 9.400012219710392, "grad_norm": 4.296054363250732, "learning_rate": 3.0242626361522457e-06, "loss": 1.1125, "step": 307700 }, { "epoch": 9.403067147308608, "grad_norm": 
4.867402076721191, "learning_rate": 3.008964631012116e-06, "loss": 0.8846, "step": 307800 }, { "epoch": 9.406122074906826, "grad_norm": 5.357305526733398, "learning_rate": 2.9938196059233878e-06, "loss": 1.1692, "step": 307900 }, { "epoch": 9.409177002505041, "grad_norm": 5.116623878479004, "learning_rate": 2.978521600783258e-06, "loss": 0.8862, "step": 308000 }, { "epoch": 9.412231930103257, "grad_norm": 2.731386661529541, "learning_rate": 2.9632235956431284e-06, "loss": 0.8937, "step": 308100 }, { "epoch": 9.415286857701473, "grad_norm": 4.1633501052856445, "learning_rate": 2.9479255905029987e-06, "loss": 1.3928, "step": 308200 }, { "epoch": 9.418341785299688, "grad_norm": 6.098063945770264, "learning_rate": 2.932627585362869e-06, "loss": 1.1213, "step": 308300 }, { "epoch": 9.421396712897904, "grad_norm": 9.61737060546875, "learning_rate": 2.9173295802227393e-06, "loss": 1.0766, "step": 308400 }, { "epoch": 9.42445164049612, "grad_norm": 6.344211578369141, "learning_rate": 2.9020315750826096e-06, "loss": 1.0052, "step": 308500 }, { "epoch": 9.427506568094337, "grad_norm": 5.4332170486450195, "learning_rate": 2.8867335699424795e-06, "loss": 1.1181, "step": 308600 }, { "epoch": 9.430561495692553, "grad_norm": 3.6471481323242188, "learning_rate": 2.87143556480235e-06, "loss": 1.3035, "step": 308700 }, { "epoch": 9.433616423290768, "grad_norm": 7.321690082550049, "learning_rate": 2.85613755966222e-06, "loss": 1.0789, "step": 308800 }, { "epoch": 9.436671350888984, "grad_norm": 6.397017955780029, "learning_rate": 2.8408395545220905e-06, "loss": 1.3215, "step": 308900 }, { "epoch": 9.4397262784872, "grad_norm": 7.486538887023926, "learning_rate": 2.8255415493819608e-06, "loss": 1.0702, "step": 309000 }, { "epoch": 9.442781206085415, "grad_norm": 13.288386344909668, "learning_rate": 2.810243544241831e-06, "loss": 1.0518, "step": 309100 }, { "epoch": 9.445836133683631, "grad_norm": 4.601072788238525, "learning_rate": 2.7949455391017014e-06, "loss": 1.1671, "step": 
309200 }, { "epoch": 9.448891061281847, "grad_norm": 10.149903297424316, "learning_rate": 2.7796475339615717e-06, "loss": 1.16, "step": 309300 }, { "epoch": 9.451945988880064, "grad_norm": 6.56465482711792, "learning_rate": 2.764349528821442e-06, "loss": 0.9742, "step": 309400 }, { "epoch": 9.45500091647828, "grad_norm": 11.361266136169434, "learning_rate": 2.7490515236813123e-06, "loss": 1.1496, "step": 309500 }, { "epoch": 9.458055844076496, "grad_norm": 6.141317844390869, "learning_rate": 2.733753518541182e-06, "loss": 1.4959, "step": 309600 }, { "epoch": 9.461110771674711, "grad_norm": 10.651205062866211, "learning_rate": 2.7184555134010525e-06, "loss": 1.0295, "step": 309700 }, { "epoch": 9.464165699272927, "grad_norm": 4.727423191070557, "learning_rate": 2.703157508260923e-06, "loss": 1.2795, "step": 309800 }, { "epoch": 9.467220626871143, "grad_norm": 10.940389633178711, "learning_rate": 2.687859503120793e-06, "loss": 1.1406, "step": 309900 }, { "epoch": 9.470275554469358, "grad_norm": 3.7946255207061768, "learning_rate": 2.6725614979806634e-06, "loss": 1.1332, "step": 310000 }, { "epoch": 9.473330482067574, "grad_norm": 7.23513650894165, "learning_rate": 2.6572634928405337e-06, "loss": 0.9592, "step": 310100 }, { "epoch": 9.476385409665792, "grad_norm": 7.274559020996094, "learning_rate": 2.641965487700404e-06, "loss": 1.1219, "step": 310200 }, { "epoch": 9.479440337264007, "grad_norm": 4.508282661437988, "learning_rate": 2.6266674825602744e-06, "loss": 1.164, "step": 310300 }, { "epoch": 9.482495264862223, "grad_norm": 18.602556228637695, "learning_rate": 2.6113694774201447e-06, "loss": 1.0263, "step": 310400 }, { "epoch": 9.485550192460439, "grad_norm": 3.0879483222961426, "learning_rate": 2.5960714722800146e-06, "loss": 1.0635, "step": 310500 }, { "epoch": 9.488605120058654, "grad_norm": 12.606521606445312, "learning_rate": 2.580773467139885e-06, "loss": 1.1495, "step": 310600 }, { "epoch": 9.49166004765687, "grad_norm": 17.32771110534668, 
"learning_rate": 2.565475461999755e-06, "loss": 1.5242, "step": 310700 }, { "epoch": 9.494714975255086, "grad_norm": 14.84314250946045, "learning_rate": 2.550330436911027e-06, "loss": 1.0958, "step": 310800 }, { "epoch": 9.497769902853303, "grad_norm": 21.077354431152344, "learning_rate": 2.5350324317708973e-06, "loss": 1.1022, "step": 310900 }, { "epoch": 9.500824830451519, "grad_norm": 6.4609246253967285, "learning_rate": 2.5197344266307676e-06, "loss": 1.3823, "step": 311000 }, { "epoch": 9.503879758049735, "grad_norm": 3.544327974319458, "learning_rate": 2.504436421490638e-06, "loss": 1.1492, "step": 311100 }, { "epoch": 9.50693468564795, "grad_norm": 14.659440994262695, "learning_rate": 2.489138416350508e-06, "loss": 1.0902, "step": 311200 }, { "epoch": 9.509989613246166, "grad_norm": 6.885101318359375, "learning_rate": 2.4738404112103785e-06, "loss": 1.0379, "step": 311300 }, { "epoch": 9.513044540844382, "grad_norm": 19.84307098388672, "learning_rate": 2.4585424060702484e-06, "loss": 1.0196, "step": 311400 }, { "epoch": 9.516099468442597, "grad_norm": 15.895182609558105, "learning_rate": 2.4432444009301187e-06, "loss": 1.2111, "step": 311500 }, { "epoch": 9.519154396040815, "grad_norm": 9.044354438781738, "learning_rate": 2.427946395789989e-06, "loss": 1.1352, "step": 311600 }, { "epoch": 9.52220932363903, "grad_norm": 5.041273593902588, "learning_rate": 2.4126483906498593e-06, "loss": 1.1897, "step": 311700 }, { "epoch": 9.525264251237246, "grad_norm": 6.682356357574463, "learning_rate": 2.3973503855097296e-06, "loss": 1.2135, "step": 311800 }, { "epoch": 9.528319178835462, "grad_norm": 10.315226554870605, "learning_rate": 2.3820523803696e-06, "loss": 1.0691, "step": 311900 }, { "epoch": 9.531374106433677, "grad_norm": 6.702560901641846, "learning_rate": 2.3667543752294703e-06, "loss": 1.0155, "step": 312000 }, { "epoch": 9.534429034031893, "grad_norm": 3.5035879611968994, "learning_rate": 2.3514563700893406e-06, "loss": 1.126, "step": 312100 }, { "epoch": 
9.537483961630109, "grad_norm": 9.319344520568848, "learning_rate": 2.336158364949211e-06, "loss": 1.0382, "step": 312200 }, { "epoch": 9.540538889228324, "grad_norm": 8.65913200378418, "learning_rate": 2.3208603598090808e-06, "loss": 0.9698, "step": 312300 }, { "epoch": 9.543593816826542, "grad_norm": 11.004213333129883, "learning_rate": 2.305562354668951e-06, "loss": 1.2223, "step": 312400 }, { "epoch": 9.546648744424758, "grad_norm": 6.043313503265381, "learning_rate": 2.2902643495288214e-06, "loss": 0.9833, "step": 312500 }, { "epoch": 9.549703672022973, "grad_norm": 2.2580814361572266, "learning_rate": 2.2749663443886917e-06, "loss": 1.1875, "step": 312600 }, { "epoch": 9.552758599621189, "grad_norm": 6.7491278648376465, "learning_rate": 2.259668339248562e-06, "loss": 1.0376, "step": 312700 }, { "epoch": 9.555813527219405, "grad_norm": 3.858389377593994, "learning_rate": 2.2443703341084323e-06, "loss": 1.4318, "step": 312800 }, { "epoch": 9.55886845481762, "grad_norm": 4.029982089996338, "learning_rate": 2.2290723289683026e-06, "loss": 0.9203, "step": 312900 }, { "epoch": 9.561923382415836, "grad_norm": 10.59288215637207, "learning_rate": 2.213774323828173e-06, "loss": 1.0497, "step": 313000 }, { "epoch": 9.564978310014052, "grad_norm": 6.483738422393799, "learning_rate": 2.1984763186880432e-06, "loss": 1.3017, "step": 313100 }, { "epoch": 9.56803323761227, "grad_norm": 8.11201286315918, "learning_rate": 2.183178313547913e-06, "loss": 1.0831, "step": 313200 }, { "epoch": 9.571088165210485, "grad_norm": 6.691319465637207, "learning_rate": 2.168033288459185e-06, "loss": 1.1194, "step": 313300 }, { "epoch": 9.5741430928087, "grad_norm": 17.671133041381836, "learning_rate": 2.1527352833190552e-06, "loss": 1.0706, "step": 313400 }, { "epoch": 9.577198020406916, "grad_norm": 4.4608869552612305, "learning_rate": 2.1374372781789255e-06, "loss": 1.1263, "step": 313500 }, { "epoch": 9.580252948005132, "grad_norm": 5.7190446853637695, "learning_rate": 
2.122139273038796e-06, "loss": 1.0572, "step": 313600 }, { "epoch": 9.583307875603348, "grad_norm": 13.810622215270996, "learning_rate": 2.106841267898666e-06, "loss": 1.1233, "step": 313700 }, { "epoch": 9.586362803201563, "grad_norm": 3.3503191471099854, "learning_rate": 2.0915432627585365e-06, "loss": 1.2015, "step": 313800 }, { "epoch": 9.58941773079978, "grad_norm": 19.035480499267578, "learning_rate": 2.0762452576184068e-06, "loss": 1.5642, "step": 313900 }, { "epoch": 9.592472658397996, "grad_norm": 3.9842231273651123, "learning_rate": 2.060947252478277e-06, "loss": 0.9717, "step": 314000 }, { "epoch": 9.595527585996212, "grad_norm": 1.611772894859314, "learning_rate": 2.0456492473381474e-06, "loss": 0.9999, "step": 314100 }, { "epoch": 9.598582513594428, "grad_norm": 8.103382110595703, "learning_rate": 2.0303512421980173e-06, "loss": 1.092, "step": 314200 }, { "epoch": 9.601637441192644, "grad_norm": 3.4380149841308594, "learning_rate": 2.0150532370578876e-06, "loss": 1.1504, "step": 314300 }, { "epoch": 9.60469236879086, "grad_norm": 11.057697296142578, "learning_rate": 1.999755231917758e-06, "loss": 1.1114, "step": 314400 }, { "epoch": 9.607747296389075, "grad_norm": 3.9668221473693848, "learning_rate": 1.984457226777628e-06, "loss": 1.0464, "step": 314500 }, { "epoch": 9.610802223987292, "grad_norm": 3.590650796890259, "learning_rate": 1.9691592216374985e-06, "loss": 1.231, "step": 314600 }, { "epoch": 9.613857151585508, "grad_norm": 6.6729817390441895, "learning_rate": 1.953861216497369e-06, "loss": 1.1776, "step": 314700 }, { "epoch": 9.616912079183724, "grad_norm": 13.663302421569824, "learning_rate": 1.938563211357239e-06, "loss": 1.1555, "step": 314800 }, { "epoch": 9.61996700678194, "grad_norm": 16.635059356689453, "learning_rate": 1.9232652062171094e-06, "loss": 1.199, "step": 314900 }, { "epoch": 9.623021934380155, "grad_norm": 5.7954230308532715, "learning_rate": 1.9079672010769797e-06, "loss": 1.1143, "step": 315000 }, { "epoch": 
9.62607686197837, "grad_norm": 4.609525203704834, "learning_rate": 1.8926691959368498e-06, "loss": 1.0792, "step": 315100 }, { "epoch": 9.629131789576586, "grad_norm": 3.5899178981781006, "learning_rate": 1.8773711907967202e-06, "loss": 1.2464, "step": 315200 }, { "epoch": 9.632186717174802, "grad_norm": 4.332881927490234, "learning_rate": 1.8620731856565905e-06, "loss": 1.0524, "step": 315300 }, { "epoch": 9.63524164477302, "grad_norm": 11.647631645202637, "learning_rate": 1.8467751805164608e-06, "loss": 1.1577, "step": 315400 }, { "epoch": 9.638296572371235, "grad_norm": 4.526301383972168, "learning_rate": 1.831477175376331e-06, "loss": 1.3823, "step": 315500 }, { "epoch": 9.641351499969451, "grad_norm": 6.170322895050049, "learning_rate": 1.8161791702362014e-06, "loss": 1.2213, "step": 315600 }, { "epoch": 9.644406427567667, "grad_norm": 16.08941650390625, "learning_rate": 1.8008811650960717e-06, "loss": 1.2398, "step": 315700 }, { "epoch": 9.647461355165882, "grad_norm": 13.574227333068848, "learning_rate": 1.785583159955942e-06, "loss": 1.1134, "step": 315800 }, { "epoch": 9.650516282764098, "grad_norm": 17.764299392700195, "learning_rate": 1.7702851548158123e-06, "loss": 1.2985, "step": 315900 }, { "epoch": 9.653571210362314, "grad_norm": 1.291345477104187, "learning_rate": 1.7549871496756822e-06, "loss": 1.1614, "step": 316000 }, { "epoch": 9.656626137960531, "grad_norm": 11.866973876953125, "learning_rate": 1.7396891445355525e-06, "loss": 0.9506, "step": 316100 }, { "epoch": 9.659681065558747, "grad_norm": 5.622418403625488, "learning_rate": 1.7243911393954228e-06, "loss": 1.0569, "step": 316200 }, { "epoch": 9.662735993156963, "grad_norm": 24.80992317199707, "learning_rate": 1.7090931342552931e-06, "loss": 1.3718, "step": 316300 }, { "epoch": 9.665790920755178, "grad_norm": 1.4948173761367798, "learning_rate": 1.6937951291151634e-06, "loss": 1.2264, "step": 316400 }, { "epoch": 9.668845848353394, "grad_norm": 0.5281773805618286, "learning_rate": 
1.6784971239750338e-06, "loss": 1.0053, "step": 316500 }, { "epoch": 9.67190077595161, "grad_norm": 0.00047525722766295075, "learning_rate": 1.663199118834904e-06, "loss": 0.9378, "step": 316600 }, { "epoch": 9.674955703549825, "grad_norm": 9.675890922546387, "learning_rate": 1.6479011136947744e-06, "loss": 1.0745, "step": 316700 }, { "epoch": 9.678010631148041, "grad_norm": 9.780930519104004, "learning_rate": 1.6326031085546447e-06, "loss": 1.0422, "step": 316800 }, { "epoch": 9.681065558746258, "grad_norm": 10.785592079162598, "learning_rate": 1.6173051034145146e-06, "loss": 1.0482, "step": 316900 }, { "epoch": 9.684120486344474, "grad_norm": 12.213613510131836, "learning_rate": 1.6020070982743849e-06, "loss": 1.1848, "step": 317000 }, { "epoch": 9.68717541394269, "grad_norm": 3.4983580112457275, "learning_rate": 1.5867090931342552e-06, "loss": 1.0039, "step": 317100 }, { "epoch": 9.690230341540905, "grad_norm": 3.852393865585327, "learning_rate": 1.5714110879941255e-06, "loss": 1.2497, "step": 317200 }, { "epoch": 9.693285269139121, "grad_norm": 6.902023792266846, "learning_rate": 1.5562660629053973e-06, "loss": 1.1257, "step": 317300 }, { "epoch": 9.696340196737337, "grad_norm": 19.849651336669922, "learning_rate": 1.5409680577652676e-06, "loss": 1.1279, "step": 317400 }, { "epoch": 9.699395124335553, "grad_norm": 2.910151720046997, "learning_rate": 1.525670052625138e-06, "loss": 1.6378, "step": 317500 }, { "epoch": 9.70245005193377, "grad_norm": 5.355201244354248, "learning_rate": 1.510372047485008e-06, "loss": 0.9853, "step": 317600 }, { "epoch": 9.705504979531986, "grad_norm": 5.7815775871276855, "learning_rate": 1.4950740423448783e-06, "loss": 1.1443, "step": 317700 }, { "epoch": 9.708559907130201, "grad_norm": 10.988548278808594, "learning_rate": 1.4797760372047486e-06, "loss": 1.0067, "step": 317800 }, { "epoch": 9.711614834728417, "grad_norm": 7.885775089263916, "learning_rate": 1.464478032064619e-06, "loss": 1.126, "step": 317900 }, { "epoch": 
9.714669762326633, "grad_norm": 5.713939666748047, "learning_rate": 1.4491800269244892e-06, "loss": 1.1281, "step": 318000 }, { "epoch": 9.717724689924848, "grad_norm": 39.32889175415039, "learning_rate": 1.4338820217843593e-06, "loss": 1.2703, "step": 318100 }, { "epoch": 9.720779617523064, "grad_norm": 10.617399215698242, "learning_rate": 1.4185840166442296e-06, "loss": 1.0991, "step": 318200 }, { "epoch": 9.723834545121282, "grad_norm": 2.650892734527588, "learning_rate": 1.4032860115041e-06, "loss": 1.1306, "step": 318300 }, { "epoch": 9.726889472719497, "grad_norm": 5.076470375061035, "learning_rate": 1.3879880063639703e-06, "loss": 1.4098, "step": 318400 }, { "epoch": 9.729944400317713, "grad_norm": 7.810786724090576, "learning_rate": 1.3728429812752418e-06, "loss": 1.4563, "step": 318500 }, { "epoch": 9.732999327915929, "grad_norm": 30.487869262695312, "learning_rate": 1.357544976135112e-06, "loss": 1.1686, "step": 318600 }, { "epoch": 9.736054255514144, "grad_norm": 15.498873710632324, "learning_rate": 1.3422469709949822e-06, "loss": 1.0622, "step": 318700 }, { "epoch": 9.73910918311236, "grad_norm": 12.014664649963379, "learning_rate": 1.3269489658548526e-06, "loss": 1.0155, "step": 318800 }, { "epoch": 9.742164110710576, "grad_norm": 3.6885623931884766, "learning_rate": 1.3116509607147229e-06, "loss": 1.0831, "step": 318900 }, { "epoch": 9.745219038308791, "grad_norm": 11.527379035949707, "learning_rate": 1.2963529555745932e-06, "loss": 1.2619, "step": 319000 }, { "epoch": 9.748273965907009, "grad_norm": 3.429348945617676, "learning_rate": 1.2810549504344633e-06, "loss": 1.1128, "step": 319100 }, { "epoch": 9.751328893505224, "grad_norm": 5.2909064292907715, "learning_rate": 1.2657569452943336e-06, "loss": 1.2422, "step": 319200 }, { "epoch": 9.75438382110344, "grad_norm": 11.643526077270508, "learning_rate": 1.2504589401542039e-06, "loss": 1.235, "step": 319300 }, { "epoch": 9.757438748701656, "grad_norm": 8.330674171447754, "learning_rate": 
1.2351609350140742e-06, "loss": 1.0515, "step": 319400 }, { "epoch": 9.760493676299872, "grad_norm": 6.047513484954834, "learning_rate": 1.2198629298739445e-06, "loss": 1.1808, "step": 319500 }, { "epoch": 9.763548603898087, "grad_norm": 3.790055990219116, "learning_rate": 1.2045649247338148e-06, "loss": 1.1685, "step": 319600 }, { "epoch": 9.766603531496303, "grad_norm": 11.13185977935791, "learning_rate": 1.1892669195936851e-06, "loss": 1.0524, "step": 319700 }, { "epoch": 9.769658459094519, "grad_norm": 3.2141988277435303, "learning_rate": 1.1739689144535554e-06, "loss": 1.2405, "step": 319800 }, { "epoch": 9.772713386692736, "grad_norm": 3.883542060852051, "learning_rate": 1.1586709093134257e-06, "loss": 1.1421, "step": 319900 }, { "epoch": 9.775768314290952, "grad_norm": 3.9310715198516846, "learning_rate": 1.1433729041732958e-06, "loss": 0.9411, "step": 320000 }, { "epoch": 9.778823241889167, "grad_norm": 4.970428943634033, "learning_rate": 1.1280748990331662e-06, "loss": 1.0685, "step": 320100 }, { "epoch": 9.781878169487383, "grad_norm": 9.02895736694336, "learning_rate": 1.1127768938930365e-06, "loss": 1.5052, "step": 320200 }, { "epoch": 9.784933097085599, "grad_norm": 7.479128837585449, "learning_rate": 1.0974788887529068e-06, "loss": 1.251, "step": 320300 }, { "epoch": 9.787988024683814, "grad_norm": 9.539288520812988, "learning_rate": 1.0821808836127769e-06, "loss": 1.0766, "step": 320400 }, { "epoch": 9.79104295228203, "grad_norm": 4.297487258911133, "learning_rate": 1.0668828784726472e-06, "loss": 1.1509, "step": 320500 }, { "epoch": 9.794097879880248, "grad_norm": 8.104453086853027, "learning_rate": 1.0515848733325175e-06, "loss": 1.1988, "step": 320600 }, { "epoch": 9.797152807478463, "grad_norm": 5.961874485015869, "learning_rate": 1.0362868681923878e-06, "loss": 1.2145, "step": 320700 }, { "epoch": 9.800207735076679, "grad_norm": 13.597023010253906, "learning_rate": 1.0209888630522581e-06, "loss": 1.0281, "step": 320800 }, { "epoch": 
9.803262662674895, "grad_norm": 5.224933624267578, "learning_rate": 1.0056908579121282e-06, "loss": 1.0692, "step": 320900 }, { "epoch": 9.80631759027311, "grad_norm": 6.424704074859619, "learning_rate": 9.905458328233998e-07, "loss": 1.1236, "step": 321000 }, { "epoch": 9.809372517871326, "grad_norm": 2.5682260990142822, "learning_rate": 9.7524782768327e-07, "loss": 1.1231, "step": 321100 }, { "epoch": 9.812427445469542, "grad_norm": 11.884291648864746, "learning_rate": 9.599498225431404e-07, "loss": 1.4074, "step": 321200 }, { "epoch": 9.81548237306776, "grad_norm": 5.21375846862793, "learning_rate": 9.446518174030108e-07, "loss": 1.0686, "step": 321300 }, { "epoch": 9.818537300665975, "grad_norm": 4.836977481842041, "learning_rate": 9.293538122628809e-07, "loss": 1.1199, "step": 321400 }, { "epoch": 9.82159222826419, "grad_norm": 3.2210793495178223, "learning_rate": 9.140558071227512e-07, "loss": 1.0152, "step": 321500 }, { "epoch": 9.824647155862406, "grad_norm": 2.9414494037628174, "learning_rate": 8.987578019826215e-07, "loss": 1.08, "step": 321600 }, { "epoch": 9.827702083460622, "grad_norm": 3.7541918754577637, "learning_rate": 8.834597968424918e-07, "loss": 1.1733, "step": 321700 }, { "epoch": 9.830757011058838, "grad_norm": 3.0762391090393066, "learning_rate": 8.681617917023619e-07, "loss": 1.2142, "step": 321800 }, { "epoch": 9.833811938657053, "grad_norm": 6.95817232131958, "learning_rate": 8.528637865622323e-07, "loss": 1.3079, "step": 321900 }, { "epoch": 9.836866866255269, "grad_norm": 5.202836990356445, "learning_rate": 8.375657814221026e-07, "loss": 1.2138, "step": 322000 }, { "epoch": 9.839921793853486, "grad_norm": 4.930724620819092, "learning_rate": 8.222677762819729e-07, "loss": 1.1406, "step": 322100 }, { "epoch": 9.842976721451702, "grad_norm": 5.462831974029541, "learning_rate": 8.069697711418432e-07, "loss": 1.0949, "step": 322200 }, { "epoch": 9.846031649049918, "grad_norm": 6.223543167114258, "learning_rate": 7.916717660017134e-07, 
"loss": 1.1979, "step": 322300 }, { "epoch": 9.849086576648133, "grad_norm": 3.8657338619232178, "learning_rate": 7.763737608615837e-07, "loss": 0.9746, "step": 322400 }, { "epoch": 9.85214150424635, "grad_norm": 4.022277355194092, "learning_rate": 7.61075755721454e-07, "loss": 1.2531, "step": 322500 }, { "epoch": 9.855196431844565, "grad_norm": 12.097512245178223, "learning_rate": 7.457777505813242e-07, "loss": 1.2117, "step": 322600 }, { "epoch": 9.85825135944278, "grad_norm": 20.343971252441406, "learning_rate": 7.304797454411945e-07, "loss": 1.1413, "step": 322700 }, { "epoch": 9.861306287040996, "grad_norm": 20.783205032348633, "learning_rate": 7.151817403010648e-07, "loss": 1.0459, "step": 322800 }, { "epoch": 9.864361214639214, "grad_norm": 18.140941619873047, "learning_rate": 6.99883735160935e-07, "loss": 1.0668, "step": 322900 }, { "epoch": 9.86741614223743, "grad_norm": 3.446868419647217, "learning_rate": 6.845857300208053e-07, "loss": 1.0923, "step": 323000 }, { "epoch": 9.870471069835645, "grad_norm": 4.770709991455078, "learning_rate": 6.692877248806755e-07, "loss": 1.0852, "step": 323100 }, { "epoch": 9.87352599743386, "grad_norm": 12.75534439086914, "learning_rate": 6.539897197405459e-07, "loss": 1.0167, "step": 323200 }, { "epoch": 9.876580925032076, "grad_norm": 5.473841667175293, "learning_rate": 6.386917146004162e-07, "loss": 1.1701, "step": 323300 }, { "epoch": 9.879635852630292, "grad_norm": 2.295653820037842, "learning_rate": 6.233937094602865e-07, "loss": 1.2056, "step": 323400 }, { "epoch": 9.882690780228508, "grad_norm": 3.8608434200286865, "learning_rate": 6.080957043201567e-07, "loss": 1.1849, "step": 323500 }, { "epoch": 9.885745707826725, "grad_norm": 0.010462761856615543, "learning_rate": 5.92797699180027e-07, "loss": 1.1015, "step": 323600 }, { "epoch": 9.888800635424941, "grad_norm": 12.851978302001953, "learning_rate": 5.774996940398973e-07, "loss": 1.0735, "step": 323700 }, { "epoch": 9.891855563023157, "grad_norm": 
7.366413116455078, "learning_rate": 5.622016888997675e-07, "loss": 1.0405, "step": 323800 }, { "epoch": 9.894910490621372, "grad_norm": 6.953218936920166, "learning_rate": 5.469036837596378e-07, "loss": 0.9401, "step": 323900 }, { "epoch": 9.897965418219588, "grad_norm": 2.4097061157226562, "learning_rate": 5.31605678619508e-07, "loss": 1.2281, "step": 324000 }, { "epoch": 9.901020345817804, "grad_norm": 6.127590179443359, "learning_rate": 5.164606535307796e-07, "loss": 1.069, "step": 324100 }, { "epoch": 9.90407527341602, "grad_norm": 8.705816268920898, "learning_rate": 5.011626483906499e-07, "loss": 1.1442, "step": 324200 }, { "epoch": 9.907130201014237, "grad_norm": 2.994471311569214, "learning_rate": 4.858646432505201e-07, "loss": 1.0269, "step": 324300 }, { "epoch": 9.910185128612452, "grad_norm": 1.9631928205490112, "learning_rate": 4.7056663811039046e-07, "loss": 1.3986, "step": 324400 }, { "epoch": 9.913240056210668, "grad_norm": 1.6768262386322021, "learning_rate": 4.5526863297026066e-07, "loss": 1.1931, "step": 324500 }, { "epoch": 9.916294983808884, "grad_norm": 5.2185235023498535, "learning_rate": 4.39970627830131e-07, "loss": 1.1374, "step": 324600 }, { "epoch": 9.9193499114071, "grad_norm": 5.210101127624512, "learning_rate": 4.2467262269000123e-07, "loss": 1.2098, "step": 324700 }, { "epoch": 9.922404839005315, "grad_norm": 6.5714545249938965, "learning_rate": 4.0937461754987154e-07, "loss": 0.9999, "step": 324800 }, { "epoch": 9.925459766603531, "grad_norm": 3.536860942840576, "learning_rate": 3.9407661240974174e-07, "loss": 1.1187, "step": 324900 }, { "epoch": 9.928514694201747, "grad_norm": 9.010876655578613, "learning_rate": 3.7877860726961205e-07, "loss": 1.3612, "step": 325000 }, { "epoch": 9.931569621799964, "grad_norm": 18.201263427734375, "learning_rate": 3.634806021294823e-07, "loss": 1.1017, "step": 325100 }, { "epoch": 9.93462454939818, "grad_norm": 10.483688354492188, "learning_rate": 3.4818259698935257e-07, "loss": 1.0799, "step": 
325200 }, { "epoch": 9.937679476996395, "grad_norm": 3.2840802669525146, "learning_rate": 3.328845918492229e-07, "loss": 1.2754, "step": 325300 }, { "epoch": 9.940734404594611, "grad_norm": 3.8806185722351074, "learning_rate": 3.175865867090932e-07, "loss": 1.0955, "step": 325400 }, { "epoch": 9.943789332192827, "grad_norm": 10.518257141113281, "learning_rate": 3.0228858156896344e-07, "loss": 1.3088, "step": 325500 }, { "epoch": 9.946844259791042, "grad_norm": 10.4476957321167, "learning_rate": 2.869905764288337e-07, "loss": 1.0003, "step": 325600 }, { "epoch": 9.949899187389258, "grad_norm": 10.631278991699219, "learning_rate": 2.7169257128870396e-07, "loss": 1.2052, "step": 325700 }, { "epoch": 9.952954114987476, "grad_norm": 15.547146797180176, "learning_rate": 2.563945661485742e-07, "loss": 0.9217, "step": 325800 }, { "epoch": 9.956009042585691, "grad_norm": 8.524794578552246, "learning_rate": 2.410965610084445e-07, "loss": 1.2376, "step": 325900 }, { "epoch": 9.959063970183907, "grad_norm": 10.924880981445312, "learning_rate": 2.2579855586831478e-07, "loss": 0.891, "step": 326000 }, { "epoch": 9.962118897782123, "grad_norm": 6.899311542510986, "learning_rate": 2.1050055072818504e-07, "loss": 1.0654, "step": 326100 }, { "epoch": 9.965173825380338, "grad_norm": 9.488914489746094, "learning_rate": 1.9520254558805532e-07, "loss": 1.3211, "step": 326200 }, { "epoch": 9.968228752978554, "grad_norm": 4.252932071685791, "learning_rate": 1.799045404479256e-07, "loss": 1.0891, "step": 326300 }, { "epoch": 9.97128368057677, "grad_norm": 2.436544418334961, "learning_rate": 1.6460653530779588e-07, "loss": 1.1258, "step": 326400 }, { "epoch": 9.974338608174985, "grad_norm": 10.499905586242676, "learning_rate": 1.4930853016766614e-07, "loss": 1.2108, "step": 326500 }, { "epoch": 9.977393535773203, "grad_norm": 6.818589687347412, "learning_rate": 1.3401052502753642e-07, "loss": 1.131, "step": 326600 }, { "epoch": 9.980448463371419, "grad_norm": 12.36090087890625, 
"learning_rate": 1.1871251988740668e-07, "loss": 1.1388, "step": 326700 }, { "epoch": 9.983503390969634, "grad_norm": 15.297159194946289, "learning_rate": 1.0341451474727696e-07, "loss": 1.1518, "step": 326800 }, { "epoch": 9.98655831856785, "grad_norm": 3.3843600749969482, "learning_rate": 8.811650960714722e-08, "loss": 0.97, "step": 326900 }, { "epoch": 9.989613246166066, "grad_norm": 0.6283736228942871, "learning_rate": 7.28185044670175e-08, "loss": 1.2331, "step": 327000 }, { "epoch": 9.992668173764281, "grad_norm": 12.6235933303833, "learning_rate": 5.7520499326887774e-08, "loss": 1.2373, "step": 327100 }, { "epoch": 9.995723101362497, "grad_norm": 9.48670768737793, "learning_rate": 4.222249418675805e-08, "loss": 0.9996, "step": 327200 }, { "epoch": 9.998778028960714, "grad_norm": 10.663695335388184, "learning_rate": 2.707746909802962e-08, "loss": 1.1021, "step": 327300 }, { "epoch": 10.0, "eval_accuracy": 0.7415225759149509, "eval_loss": 0.7014397382736206, "eval_runtime": 1867.5396, "eval_samples_per_second": 17.528, "eval_steps_per_second": 4.382, "step": 327340 }, { "epoch": 10.0, "step": 327340, "total_flos": 2.8536683814220615e+20, "train_loss": 1.1817914221248462, "train_runtime": 92460.3356, "train_samples_per_second": 14.161, "train_steps_per_second": 3.54 } ], "logging_steps": 100, "max_steps": 327340, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8536683814220615e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }