{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 3000, "global_step": 34068, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005870611717740989, "grad_norm": 5.2924604415893555, "learning_rate": 1e-05, "loss": 0.0386, "step": 10 }, { "epoch": 0.0011741223435481978, "grad_norm": 4.440995216369629, "learning_rate": 2e-05, "loss": 0.0194, "step": 20 }, { "epoch": 0.0017611835153222965, "grad_norm": 5.551347255706787, "learning_rate": 3e-05, "loss": 0.0188, "step": 30 }, { "epoch": 0.0023482446870963956, "grad_norm": 5.97813606262207, "learning_rate": 4e-05, "loss": 0.0513, "step": 40 }, { "epoch": 0.0029353058588704943, "grad_norm": 1.9274829626083374, "learning_rate": 5e-05, "loss": 0.022, "step": 50 }, { "epoch": 0.003522367030644593, "grad_norm": 6.143263816833496, "learning_rate": 6e-05, "loss": 0.0487, "step": 60 }, { "epoch": 0.004109428202418692, "grad_norm": 1.7340521812438965, "learning_rate": 7e-05, "loss": 0.0194, "step": 70 }, { "epoch": 0.004696489374192791, "grad_norm": 0.8576582074165344, "learning_rate": 8e-05, "loss": 0.0094, "step": 80 }, { "epoch": 0.0052835505459668895, "grad_norm": 0.2858774662017822, "learning_rate": 9e-05, "loss": 0.0148, "step": 90 }, { "epoch": 0.005870611717740989, "grad_norm": 1.6273711919784546, "learning_rate": 0.0001, "loss": 0.0168, "step": 100 }, { "epoch": 0.006457672889515088, "grad_norm": 2.489337682723999, "learning_rate": 9.999997861546707e-05, "loss": 0.0111, "step": 110 }, { "epoch": 0.007044734061289186, "grad_norm": 1.9543702602386475, "learning_rate": 9.99999144618865e-05, "loss": 0.0223, "step": 120 }, { "epoch": 0.007631795233063285, "grad_norm": 1.2999861240386963, "learning_rate": 9.999980753931321e-05, "loss": 0.0223, "step": 130 }, { "epoch": 0.008218856404837384, "grad_norm": 1.1619627475738525, "learning_rate": 9.999965784783865e-05, "loss": 0.012, "step": 140 }, { "epoch": 0.008805917576611482, "grad_norm": 1.8998370170593262, "learning_rate": 9.999946538759087e-05, "loss": 0.008, "step": 150 }, { "epoch": 0.009392978748385582, "grad_norm": 0.3024078905582428, "learning_rate": 9.999923015873447e-05, "loss": 0.0194, "step": 160 }, { "epoch": 0.00998003992015968, "grad_norm": 0.44685640931129456, "learning_rate": 9.999895216147068e-05, "loss": 0.0108, "step": 170 }, { "epoch": 0.010567101091933779, "grad_norm": 0.9512900114059448, "learning_rate": 9.99986313960373e-05, "loss": 0.0082, "step": 180 }, { "epoch": 0.011154162263707879, "grad_norm": 2.5597662925720215, "learning_rate": 9.99982678627087e-05, "loss": 0.0237, "step": 190 }, { "epoch": 0.011741223435481977, "grad_norm": 0.7417121529579163, "learning_rate": 9.999786156179584e-05, "loss": 0.0248, "step": 200 }, { "epoch": 0.012328284607256075, "grad_norm": 2.8113763332366943, "learning_rate": 9.999741249364625e-05, "loss": 0.0234, "step": 210 }, { "epoch": 0.012915345779030175, "grad_norm": 2.2599666118621826, "learning_rate": 9.999692065864407e-05, "loss": 0.0222, "step": 220 }, { "epoch": 0.013502406950804274, "grad_norm": 3.0797460079193115, "learning_rate": 9.999638605721e-05, "loss": 0.0278, "step": 230 }, { "epoch": 0.014089468122578372, "grad_norm": 5.824965953826904, "learning_rate": 9.999580868980134e-05, "loss": 0.0263, "step": 240 }, { "epoch": 0.014676529294352472, "grad_norm": 0.3678293526172638, "learning_rate": 9.999518855691194e-05, "loss": 0.0118, "step": 250 }, { "epoch": 0.01526359046612657, "grad_norm": 1.307920217514038, "learning_rate": 9.999452565907225e-05, "loss": 0.036, "step": 260 }, { "epoch": 0.01585065163790067, "grad_norm": 3.1141858100891113, "learning_rate": 9.999381999684934e-05, "loss": 0.0113, "step": 270 }, { "epoch": 0.01643771280967477, "grad_norm": 2.5242245197296143, "learning_rate": 9.999307157084676e-05, "loss": 0.0289, "step": 280 }, { "epoch": 0.017024773981448867, "grad_norm": 0.8906269669532776, "learning_rate": 9.999228038170475e-05, "loss": 0.0391, "step": 290 }, { "epoch": 0.017611835153222965, "grad_norm": 1.5113784074783325, "learning_rate": 9.999144643010004e-05, "loss": 0.0195, "step": 300 }, { "epoch": 0.018198896324997063, "grad_norm": 1.154229998588562, "learning_rate": 9.999056971674601e-05, "loss": 0.0123, "step": 310 }, { "epoch": 0.018785957496771165, "grad_norm": 0.10093630105257034, "learning_rate": 9.998965024239256e-05, "loss": 0.0244, "step": 320 }, { "epoch": 0.019373018668545263, "grad_norm": 1.6571542024612427, "learning_rate": 9.99886880078262e-05, "loss": 0.0317, "step": 330 }, { "epoch": 0.01996007984031936, "grad_norm": 1.0903353691101074, "learning_rate": 9.998768301387001e-05, "loss": 0.0325, "step": 340 }, { "epoch": 0.02054714101209346, "grad_norm": 3.5400142669677734, "learning_rate": 9.998663526138365e-05, "loss": 0.0164, "step": 350 }, { "epoch": 0.021134202183867558, "grad_norm": 0.31252965331077576, "learning_rate": 9.998554475126332e-05, "loss": 0.0464, "step": 360 }, { "epoch": 0.02172126335564166, "grad_norm": 2.111171245574951, "learning_rate": 9.998441148444184e-05, "loss": 0.0366, "step": 370 }, { "epoch": 0.022308324527415758, "grad_norm": 1.4933933019638062, "learning_rate": 9.99832354618886e-05, "loss": 0.0595, "step": 380 }, { "epoch": 0.022895385699189856, "grad_norm": 1.3113383054733276, "learning_rate": 9.998201668460952e-05, "loss": 0.0237, "step": 390 }, { "epoch": 0.023482446870963954, "grad_norm": 2.266380786895752, "learning_rate": 9.998075515364715e-05, "loss": 0.0261, "step": 400 }, { "epoch": 0.024069508042738053, "grad_norm": 3.4844086170196533, "learning_rate": 9.997945087008055e-05, "loss": 0.0303, "step": 410 }, { "epoch": 0.02465656921451215, "grad_norm": 1.4794069528579712, "learning_rate": 9.99781038350254e-05, "loss": 0.0139, "step": 420 }, { "epoch": 0.025243630386286253, "grad_norm": 1.4122917652130127, "learning_rate": 9.997671404963391e-05, "loss": 0.0305, "step": 430 }, { "epoch": 0.02583069155806035, "grad_norm": 2.149915933609009, "learning_rate": 9.99752815150949e-05, "loss": 0.016, "step": 440 }, { "epoch": 0.02641775272983445, "grad_norm": 2.181758165359497, "learning_rate": 9.99738062326337e-05, "loss": 0.046, "step": 450 }, { "epoch": 0.027004813901608547, "grad_norm": 0.137978196144104, "learning_rate": 9.997228820351228e-05, "loss": 0.0335, "step": 460 }, { "epoch": 0.027591875073382646, "grad_norm": 1.9188355207443237, "learning_rate": 9.997072742902912e-05, "loss": 0.0401, "step": 470 }, { "epoch": 0.028178936245156744, "grad_norm": 0.17180809378623962, "learning_rate": 9.996912391051925e-05, "loss": 0.0182, "step": 480 }, { "epoch": 0.028765997416930845, "grad_norm": 2.3665249347686768, "learning_rate": 9.996747764935431e-05, "loss": 0.0272, "step": 490 }, { "epoch": 0.029353058588704944, "grad_norm": 1.9356086254119873, "learning_rate": 9.996578864694249e-05, "loss": 0.0418, "step": 500 }, { "epoch": 0.029940119760479042, "grad_norm": 2.297760009765625, "learning_rate": 9.996405690472852e-05, "loss": 0.0493, "step": 510 }, { "epoch": 0.03052718093225314, "grad_norm": 1.4701766967773438, "learning_rate": 9.996228242419372e-05, "loss": 0.0372, "step": 520 }, { "epoch": 0.03111424210402724, "grad_norm": 0.682854175567627, "learning_rate": 9.996046520685592e-05, "loss": 0.0212, "step": 530 }, { "epoch": 0.03170130327580134, "grad_norm": 1.8583916425704956, "learning_rate": 9.995860525426954e-05, "loss": 0.0327, "step": 540 }, { "epoch": 0.032288364447575435, "grad_norm": 0.3811361491680145, "learning_rate": 9.995670256802554e-05, "loss": 0.0392, "step": 550 }, { "epoch": 0.03287542561934954, "grad_norm": 2.1470518112182617, "learning_rate": 9.995475714975146e-05, "loss": 0.0272, "step": 560 }, { "epoch": 0.03346248679112364, "grad_norm": 3.5489206314086914, "learning_rate": 9.995276900111139e-05, "loss": 0.043, "step": 570 }, { "epoch": 0.03404954796289773, "grad_norm": 1.1328600645065308, "learning_rate": 9.995073812380594e-05, "loss": 0.0266, "step": 580 }, { "epoch": 0.034636609134671835, "grad_norm": 2.3162460327148438, "learning_rate": 9.994866451957225e-05, "loss": 0.0196, "step": 590 }, { "epoch": 0.03522367030644593, "grad_norm": 1.2759737968444824, "learning_rate": 9.994654819018408e-05, "loss": 0.0459, "step": 600 }, { "epoch": 0.03581073147822003, "grad_norm": 5.27681303024292, "learning_rate": 9.99443891374517e-05, "loss": 0.0207, "step": 610 }, { "epoch": 0.036397792649994126, "grad_norm": 0.07440024614334106, "learning_rate": 9.99421873632219e-05, "loss": 0.0409, "step": 620 }, { "epoch": 0.03698485382176823, "grad_norm": 2.167147636413574, "learning_rate": 9.993994286937805e-05, "loss": 0.0327, "step": 630 }, { "epoch": 0.03757191499354233, "grad_norm": 1.7458330392837524, "learning_rate": 9.993765565784006e-05, "loss": 0.0354, "step": 640 }, { "epoch": 0.038158976165316424, "grad_norm": 2.8717799186706543, "learning_rate": 9.993532573056436e-05, "loss": 0.0221, "step": 650 }, { "epoch": 0.038746037337090526, "grad_norm": 3.0224597454071045, "learning_rate": 9.993295308954391e-05, "loss": 0.0353, "step": 660 }, { "epoch": 0.03933309850886462, "grad_norm": 2.830904960632324, "learning_rate": 9.993053773680823e-05, "loss": 0.0428, "step": 670 }, { "epoch": 0.03992015968063872, "grad_norm": 0.8768723011016846, "learning_rate": 9.992807967442339e-05, "loss": 0.0204, "step": 680 }, { "epoch": 0.040507220852412824, "grad_norm": 1.538102626800537, "learning_rate": 9.992557890449195e-05, "loss": 0.0353, "step": 690 }, { "epoch": 0.04109428202418692, "grad_norm": 1.5642238855361938, "learning_rate": 9.992303542915302e-05, "loss": 0.0253, "step": 700 }, { "epoch": 0.04168134319596102, "grad_norm": 1.8776806592941284, "learning_rate": 9.992044925058224e-05, "loss": 0.0279, "step": 710 }, { "epoch": 0.042268404367735116, "grad_norm": 1.9298291206359863, "learning_rate": 9.99178203709918e-05, "loss": 0.0366, "step": 720 }, { "epoch": 0.04285546553950922, "grad_norm": 1.1513307094573975, "learning_rate": 9.991514879263038e-05, "loss": 0.0207, "step": 730 }, { "epoch": 0.04344252671128332, "grad_norm": 1.3171567916870117, "learning_rate": 9.991243451778318e-05, "loss": 0.0389, "step": 740 }, { "epoch": 0.044029587883057414, "grad_norm": 6.66058874130249, "learning_rate": 9.990967754877197e-05, "loss": 0.0263, "step": 750 }, { "epoch": 0.044616649054831516, "grad_norm": 1.9210799932479858, "learning_rate": 9.9906877887955e-05, "loss": 0.0424, "step": 760 }, { "epoch": 0.04520371022660561, "grad_norm": 4.144294738769531, "learning_rate": 9.990403553772704e-05, "loss": 0.0382, "step": 770 }, { "epoch": 0.04579077139837971, "grad_norm": 2.204535722732544, "learning_rate": 9.990115050051939e-05, "loss": 0.0266, "step": 780 }, { "epoch": 0.04637783257015381, "grad_norm": 2.062601089477539, "learning_rate": 9.989822277879985e-05, "loss": 0.0211, "step": 790 }, { "epoch": 0.04696489374192791, "grad_norm": 2.904456377029419, "learning_rate": 9.989525237507276e-05, "loss": 0.0311, "step": 800 }, { "epoch": 0.04755195491370201, "grad_norm": 2.250447988510132, "learning_rate": 9.989223929187893e-05, "loss": 0.033, "step": 810 }, { "epoch": 0.048139016085476105, "grad_norm": 0.9330112338066101, "learning_rate": 9.988918353179568e-05, "loss": 0.0361, "step": 820 }, { "epoch": 0.04872607725725021, "grad_norm": 1.713057518005371, "learning_rate": 9.988608509743688e-05, "loss": 0.0267, "step": 830 }, { "epoch": 0.0493131384290243, "grad_norm": 1.5273679494857788, "learning_rate": 9.988294399145285e-05, "loss": 0.03, "step": 840 }, { "epoch": 0.0499001996007984, "grad_norm": 2.2448716163635254, "learning_rate": 9.987976021653046e-05, "loss": 0.0215, "step": 850 }, { "epoch": 0.050487260772572505, "grad_norm": 1.0452334880828857, "learning_rate": 9.987653377539303e-05, "loss": 0.0273, "step": 860 }, { "epoch": 0.0510743219443466, "grad_norm": 1.7774131298065186, "learning_rate": 9.987326467080041e-05, "loss": 0.043, "step": 870 }, { "epoch": 0.0516613831161207, "grad_norm": 4.27858829498291, "learning_rate": 9.98699529055489e-05, "loss": 0.0451, "step": 880 }, { "epoch": 0.052248444287894796, "grad_norm": 2.2619693279266357, "learning_rate": 9.986659848247135e-05, "loss": 0.0499, "step": 890 }, { "epoch": 0.0528355054596689, "grad_norm": 1.6962707042694092, "learning_rate": 9.986320140443708e-05, "loss": 0.0243, "step": 900 }, { "epoch": 0.053422566631443, "grad_norm": 7.003340244293213, "learning_rate": 9.985976167435187e-05, "loss": 0.0397, "step": 910 }, { "epoch": 0.054009627803217095, "grad_norm": 3.3854308128356934, "learning_rate": 9.9856279295158e-05, "loss": 0.0439, "step": 920 }, { "epoch": 0.054596688974991196, "grad_norm": 1.678689956665039, "learning_rate": 9.985275426983425e-05, "loss": 0.0311, "step": 930 }, { "epoch": 0.05518375014676529, "grad_norm": 3.6396162509918213, "learning_rate": 9.984918660139583e-05, "loss": 0.0326, "step": 940 }, { "epoch": 0.05577081131853939, "grad_norm": 0.7963114380836487, "learning_rate": 9.984557629289449e-05, "loss": 0.0229, "step": 950 }, { "epoch": 0.05635787249031349, "grad_norm": 0.7552520632743835, "learning_rate": 9.984192334741839e-05, "loss": 0.0124, "step": 960 }, { "epoch": 0.05694493366208759, "grad_norm": 1.0268884897232056, "learning_rate": 9.98382277680922e-05, "loss": 0.0254, "step": 970 }, { "epoch": 0.05753199483386169, "grad_norm": 0.2784561216831207, "learning_rate": 9.983448955807708e-05, "loss": 0.0427, "step": 980 }, { "epoch": 0.058119056005635786, "grad_norm": 3.4149084091186523, "learning_rate": 9.983070872057059e-05, "loss": 0.0236, "step": 990 }, { "epoch": 0.05870611717740989, "grad_norm": 2.656593084335327, "learning_rate": 9.982688525880679e-05, "loss": 0.0451, "step": 1000 }, { "epoch": 0.05929317834918398, "grad_norm": 1.5874123573303223, "learning_rate": 9.98230191760562e-05, "loss": 0.0279, "step": 1010 }, { "epoch": 0.059880239520958084, "grad_norm": 1.9736193418502808, "learning_rate": 9.981911047562583e-05, "loss": 0.0276, "step": 1020 }, { "epoch": 0.060467300692732186, "grad_norm": 1.7418795824050903, "learning_rate": 9.981515916085906e-05, "loss": 0.0346, "step": 1030 }, { "epoch": 0.06105436186450628, "grad_norm": 0.9295284748077393, "learning_rate": 9.981116523513579e-05, "loss": 0.0397, "step": 1040 }, { "epoch": 0.06164142303628038, "grad_norm": 1.0048223733901978, "learning_rate": 9.980712870187236e-05, "loss": 0.0387, "step": 1050 }, { "epoch": 0.06222848420805448, "grad_norm": 2.988097667694092, "learning_rate": 9.980304956452153e-05, "loss": 0.0375, "step": 1060 }, { "epoch": 0.06281554537982857, "grad_norm": 2.827054500579834, "learning_rate": 9.979892782657253e-05, "loss": 0.0321, "step": 1070 }, { "epoch": 0.06340260655160268, "grad_norm": 2.4515411853790283, "learning_rate": 9.9794763491551e-05, "loss": 0.0378, "step": 1080 }, { "epoch": 0.06398966772337678, "grad_norm": 0.5734860897064209, "learning_rate": 9.979055656301905e-05, "loss": 0.0483, "step": 1090 }, { "epoch": 0.06457672889515087, "grad_norm": 5.403695106506348, "learning_rate": 9.978630704457521e-05, "loss": 0.0204, "step": 1100 }, { "epoch": 0.06516379006692498, "grad_norm": 1.776674509048462, "learning_rate": 9.978201493985444e-05, "loss": 0.0555, "step": 1110 }, { "epoch": 0.06575085123869907, "grad_norm": 1.797873616218567, "learning_rate": 9.97776802525281e-05, "loss": 0.0452, "step": 1120 }, { "epoch": 0.06633791241047317, "grad_norm": 0.5363721251487732, "learning_rate": 9.977330298630402e-05, "loss": 0.0593, "step": 1130 }, { "epoch": 0.06692497358224728, "grad_norm": 1.8825099468231201, "learning_rate": 9.976888314492644e-05, "loss": 0.0433, "step": 1140 }, { "epoch": 0.06751203475402137, "grad_norm": 2.1495919227600098, "learning_rate": 9.9764420732176e-05, "loss": 0.0448, "step": 1150 }, { "epoch": 0.06809909592579547, "grad_norm": 0.7973845601081848, "learning_rate": 9.975991575186977e-05, "loss": 0.0228, "step": 1160 }, { "epoch": 0.06868615709756956, "grad_norm": 2.296572208404541, "learning_rate": 9.97553682078612e-05, "loss": 0.0329, "step": 1170 }, { "epoch": 0.06927321826934367, "grad_norm": 1.6185338497161865, "learning_rate": 9.975077810404021e-05, "loss": 0.0204, "step": 1180 }, { "epoch": 0.06986027944111776, "grad_norm": 5.656839847564697, "learning_rate": 9.974614544433307e-05, "loss": 0.0311, "step": 1190 }, { "epoch": 0.07044734061289186, "grad_norm": 7.7276201248168945, "learning_rate": 9.974147023270249e-05, "loss": 0.0601, "step": 1200 }, { "epoch": 0.07103440178466597, "grad_norm": 1.1774202585220337, "learning_rate": 9.973675247314753e-05, "loss": 0.0264, "step": 1210 }, { "epoch": 0.07162146295644006, "grad_norm": 2.7271509170532227, "learning_rate": 9.973199216970368e-05, "loss": 0.0543, "step": 1220 }, { "epoch": 0.07220852412821416, "grad_norm": 0.4096977114677429, "learning_rate": 9.972718932644283e-05, "loss": 0.0362, "step": 1230 }, { "epoch": 0.07279558529998825, "grad_norm": 1.7209502458572388, "learning_rate": 9.972234394747324e-05, "loss": 0.0249, "step": 1240 }, { "epoch": 0.07338264647176236, "grad_norm": 2.486680746078491, "learning_rate": 9.971745603693956e-05, "loss": 0.0282, "step": 1250 }, { "epoch": 0.07396970764353646, "grad_norm": 1.500947117805481, "learning_rate": 9.971252559902277e-05, "loss": 0.0368, "step": 1260 }, { "epoch": 0.07455676881531055, "grad_norm": 1.1559159755706787, "learning_rate": 9.970755263794035e-05, "loss": 0.0182, "step": 1270 }, { "epoch": 0.07514382998708466, "grad_norm": 1.5027871131896973, "learning_rate": 9.970253715794603e-05, "loss": 0.0675, "step": 1280 }, { "epoch": 0.07573089115885875, "grad_norm": 3.4849696159362793, "learning_rate": 9.969747916332996e-05, "loss": 0.0535, "step": 1290 }, { "epoch": 0.07631795233063285, "grad_norm": 3.05766224861145, "learning_rate": 9.969237865841867e-05, "loss": 0.0312, "step": 1300 }, { "epoch": 0.07690501350240696, "grad_norm": 1.5578389167785645, "learning_rate": 9.968723564757503e-05, "loss": 0.0529, "step": 1310 }, { "epoch": 0.07749207467418105, "grad_norm": 2.0051987171173096, "learning_rate": 9.968205013519826e-05, "loss": 0.0385, "step": 1320 }, { "epoch": 0.07807913584595515, "grad_norm": 5.790458679199219, "learning_rate": 9.967682212572398e-05, "loss": 0.0809, "step": 1330 }, { "epoch": 0.07866619701772924, "grad_norm": 3.024461269378662, "learning_rate": 9.967155162362413e-05, "loss": 0.049, "step": 1340 }, { "epoch": 0.07925325818950335, "grad_norm": 2.9698665142059326, "learning_rate": 9.966623863340696e-05, "loss": 0.0572, "step": 1350 }, { "epoch": 0.07984031936127745, "grad_norm": 3.04648494720459, "learning_rate": 9.966088315961715e-05, "loss": 0.0347, "step": 1360 }, { "epoch": 0.08042738053305154, "grad_norm": 3.894293785095215, "learning_rate": 9.965548520683563e-05, "loss": 0.037, "step": 1370 }, { "epoch": 0.08101444170482565, "grad_norm": 1.584315299987793, "learning_rate": 9.965004477967974e-05, "loss": 0.0264, "step": 1380 }, { "epoch": 0.08160150287659974, "grad_norm": 3.116762161254883, "learning_rate": 9.964456188280311e-05, "loss": 0.0578, "step": 1390 }, { "epoch": 0.08218856404837384, "grad_norm": 1.2944003343582153, "learning_rate": 9.96390365208957e-05, "loss": 0.0467, "step": 1400 }, { "epoch": 0.08277562522014793, "grad_norm": 3.4657671451568604, "learning_rate": 9.96334686986838e-05, "loss": 0.0424, "step": 1410 }, { "epoch": 0.08336268639192204, "grad_norm": 1.4936097860336304, "learning_rate": 9.962785842093003e-05, "loss": 0.0591, "step": 1420 }, { "epoch": 0.08394974756369614, "grad_norm": 2.914396047592163, "learning_rate": 9.962220569243332e-05, "loss": 0.0809, "step": 1430 }, { "epoch": 0.08453680873547023, "grad_norm": 1.9370061159133911, "learning_rate": 9.961651051802891e-05, "loss": 0.0477, "step": 1440 }, { "epoch": 0.08512386990724434, "grad_norm": 2.7822482585906982, "learning_rate": 9.961077290258833e-05, "loss": 0.0577, "step": 1450 }, { "epoch": 0.08571093107901843, "grad_norm": 2.286750555038452, "learning_rate": 9.960499285101945e-05, "loss": 0.0275, "step": 1460 }, { "epoch": 0.08629799225079253, "grad_norm": 4.1236186027526855, "learning_rate": 9.95991703682664e-05, "loss": 0.0332, "step": 1470 }, { "epoch": 0.08688505342256664, "grad_norm": 2.3775007724761963, "learning_rate": 9.959330545930963e-05, "loss": 0.0347, "step": 1480 }, { "epoch": 0.08747211459434073, "grad_norm": 2.8342068195343018, "learning_rate": 9.958739812916586e-05, "loss": 0.059, "step": 1490 }, { "epoch": 0.08805917576611483, "grad_norm": 1.1539396047592163, "learning_rate": 9.958144838288814e-05, "loss": 0.0251, "step": 1500 }, { "epoch": 0.08864623693788892, "grad_norm": 1.3793189525604248, "learning_rate": 9.957545622556574e-05, "loss": 0.0301, "step": 1510 }, { "epoch": 0.08923329810966303, "grad_norm": 3.5822372436523438, "learning_rate": 9.956942166232427e-05, "loss": 0.0583, "step": 1520 }, { "epoch": 0.08982035928143713, "grad_norm": 1.3325873613357544, "learning_rate": 9.956334469832556e-05, "loss": 0.0327, "step": 1530 }, { "epoch": 0.09040742045321122, "grad_norm": 2.214155435562134, "learning_rate": 9.955722533876773e-05, "loss": 0.031, "step": 1540 }, { "epoch": 0.09099448162498533, "grad_norm": 5.501091480255127, "learning_rate": 9.955106358888517e-05, "loss": 0.0349, "step": 1550 }, { "epoch": 0.09158154279675942, "grad_norm": 3.1143295764923096, "learning_rate": 9.954485945394856e-05, "loss": 0.0314, "step": 1560 }, { "epoch": 0.09216860396853352, "grad_norm": 2.103813886642456, "learning_rate": 9.953861293926474e-05, "loss": 0.0362, "step": 1570 }, { "epoch": 0.09275566514030761, "grad_norm": 4.420012474060059, "learning_rate": 9.95323240501769e-05, "loss": 0.0528, "step": 1580 }, { "epoch": 0.09334272631208172, "grad_norm": 2.2697951793670654, "learning_rate": 9.952599279206444e-05, "loss": 0.0502, "step": 1590 }, { "epoch": 0.09392978748385582, "grad_norm": 1.0340734720230103, "learning_rate": 9.951961917034299e-05, "loss": 0.043, "step": 1600 }, { "epoch": 0.09451684865562991, "grad_norm": 1.4463516473770142, "learning_rate": 9.951320319046442e-05, "loss": 0.0338, "step": 1610 }, { "epoch": 0.09510390982740402, "grad_norm": 2.913320779800415, "learning_rate": 9.950674485791685e-05, "loss": 0.0416, "step": 1620 }, { "epoch": 0.09569097099917812, "grad_norm": 3.4532904624938965, "learning_rate": 9.950024417822462e-05, "loss": 0.0406, "step": 1630 }, { "epoch": 0.09627803217095221, "grad_norm": 2.0158588886260986, "learning_rate": 9.949370115694827e-05, "loss": 0.0257, "step": 1640 }, { "epoch": 0.09686509334272632, "grad_norm": 4.916737079620361, "learning_rate": 9.94871157996846e-05, "loss": 0.0444, "step": 1650 }, { "epoch": 0.09745215451450041, "grad_norm": 1.541121006011963, "learning_rate": 9.948048811206658e-05, "loss": 0.0632, "step": 1660 }, { "epoch": 0.09803921568627451, "grad_norm": 2.2335431575775146, "learning_rate": 9.947381809976344e-05, "loss": 0.0418, "step": 1670 }, { "epoch": 0.0986262768580486, "grad_norm": 2.0486209392547607, "learning_rate": 9.946710576848058e-05, "loss": 0.0551, "step": 1680 }, { "epoch": 0.09921333802982271, "grad_norm": 1.574038028717041, "learning_rate": 9.946035112395958e-05, "loss": 0.042, "step": 1690 }, { "epoch": 0.0998003992015968, "grad_norm": 1.9351248741149902, "learning_rate": 9.945355417197824e-05, "loss": 0.0333, "step": 1700 }, { "epoch": 0.1003874603733709, "grad_norm": 2.6506927013397217, "learning_rate": 9.944671491835056e-05, "loss": 0.0425, "step": 1710 }, { "epoch": 0.10097452154514501, "grad_norm": 1.2277346849441528, "learning_rate": 9.943983336892669e-05, "loss": 0.0346, "step": 1720 }, { "epoch": 0.1015615827169191, "grad_norm": 1.3820018768310547, "learning_rate": 9.9432909529593e-05, "loss": 0.0618, "step": 1730 }, { "epoch": 0.1021486438886932, "grad_norm": 2.7665960788726807, "learning_rate": 9.9425943406272e-05, "loss": 0.0408, "step": 1740 }, { "epoch": 0.1027357050604673, "grad_norm": 3.641716718673706, "learning_rate": 9.941893500492241e-05, "loss": 0.0346, "step": 1750 }, { "epoch": 0.1033227662322414, "grad_norm": 1.6728957891464233, "learning_rate": 9.941188433153904e-05, "loss": 0.0319, "step": 1760 }, { "epoch": 0.1039098274040155, "grad_norm": 1.0305782556533813, "learning_rate": 9.940479139215293e-05, "loss": 0.0613, "step": 1770 }, { "epoch": 0.10449688857578959, "grad_norm": 0.9661641120910645, "learning_rate": 9.939765619283124e-05, "loss": 0.0389, "step": 1780 }, { "epoch": 0.1050839497475637, "grad_norm": 5.86223840713501, "learning_rate": 9.93904787396773e-05, "loss": 0.0552, "step": 1790 }, { "epoch": 0.1056710109193378, "grad_norm": 1.668977975845337, "learning_rate": 9.938325903883055e-05, "loss": 0.0501, "step": 1800 }, { "epoch": 0.10625807209111189, "grad_norm": 1.9258061647415161, "learning_rate": 9.937599709646661e-05, "loss": 0.0328, "step": 1810 }, { "epoch": 0.106845133262886, "grad_norm": 0.44692739844322205, "learning_rate": 9.936869291879718e-05, "loss": 0.038, "step": 1820 }, { "epoch": 0.1074321944346601, "grad_norm": 1.2521891593933105, "learning_rate": 9.936134651207015e-05, "loss": 0.0392, "step": 1830 }, { "epoch": 0.10801925560643419, "grad_norm": 1.4813244342803955, "learning_rate": 9.935395788256947e-05, "loss": 0.0469, "step": 1840 }, { "epoch": 0.10860631677820828, "grad_norm": 1.6620100736618042, "learning_rate": 9.934652703661527e-05, "loss": 0.0468, "step": 1850 }, { "epoch": 0.10919337794998239, "grad_norm": 3.2421047687530518, "learning_rate": 9.933905398056372e-05, "loss": 0.0315, "step": 1860 }, { "epoch": 0.10978043912175649, "grad_norm": 2.316253900527954, "learning_rate": 9.933153872080714e-05, "loss": 0.0435, "step": 1870 }, { "epoch": 0.11036750029353058, "grad_norm": 0.8498309850692749, "learning_rate": 9.932398126377396e-05, "loss": 0.0351, "step": 1880 }, { "epoch": 0.11095456146530469, "grad_norm": 2.0742297172546387, "learning_rate": 9.931638161592867e-05, "loss": 0.0398, "step": 1890 }, { "epoch": 0.11154162263707879, "grad_norm": 1.81075918674469, "learning_rate": 9.930873978377187e-05, "loss": 0.028, "step": 1900 }, { "epoch": 0.11212868380885288, "grad_norm": 1.3079307079315186, "learning_rate": 9.930105577384026e-05, "loss": 0.0291, "step": 1910 }, { "epoch": 0.11271574498062698, "grad_norm": 0.64143306016922, "learning_rate": 9.929332959270659e-05, "loss": 0.0541, "step": 1920 }, { "epoch": 0.11330280615240108, "grad_norm": 0.9896833300590515, "learning_rate": 9.928556124697967e-05, "loss": 0.0363, "step": 1930 }, { "epoch": 0.11388986732417518, "grad_norm": 2.4763922691345215, "learning_rate": 9.927775074330441e-05, "loss": 0.0506, "step": 1940 }, { "epoch": 0.11447692849594927, "grad_norm": 1.346570611000061, "learning_rate": 9.926989808836178e-05, "loss": 0.0332, "step": 1950 }, { "epoch": 0.11506398966772338, "grad_norm": 2.1324174404144287, "learning_rate": 9.926200328886878e-05, "loss": 0.0887, "step": 1960 }, { "epoch": 0.11565105083949748, "grad_norm": 2.1258816719055176, "learning_rate": 9.92540663515785e-05, "loss": 0.0322, "step": 1970 }, { "epoch": 0.11623811201127157, "grad_norm": 2.4790914058685303, "learning_rate": 9.924608728328001e-05, "loss": 0.0322, "step": 1980 }, { "epoch": 0.11682517318304568, "grad_norm": 0.9583444595336914, "learning_rate": 9.923806609079847e-05, "loss": 0.0328, "step": 1990 }, { "epoch": 0.11741223435481977, "grad_norm": 1.7796015739440918, "learning_rate": 9.923000278099508e-05, "loss": 0.0259, "step": 2000 }, { "epoch": 0.11799929552659387, "grad_norm": 2.2700517177581787, "learning_rate": 9.922189736076701e-05, "loss": 0.0307, "step": 2010 }, { "epoch": 0.11858635669836796, "grad_norm": 2.4041314125061035, "learning_rate": 9.921374983704752e-05, "loss": 0.0342, "step": 2020 }, { "epoch": 0.11917341787014207, "grad_norm": 3.097506523132324, "learning_rate": 9.92055602168058e-05, "loss": 0.0376, "step": 2030 }, { "epoch": 0.11976047904191617, "grad_norm": 1.7169874906539917, "learning_rate": 9.919732850704716e-05, "loss": 0.0414, "step": 2040 }, { "epoch": 0.12034754021369026, "grad_norm": 1.2689332962036133, "learning_rate": 9.918905471481281e-05, "loss": 0.0262, "step": 2050 }, { "epoch": 0.12093460138546437, "grad_norm": 1.9969502687454224, "learning_rate": 9.918073884718e-05, "loss": 0.0394, "step": 2060 }, { "epoch": 0.12152166255723847, "grad_norm": 1.0878665447235107, "learning_rate": 9.917238091126198e-05, "loss": 0.0381, "step": 2070 }, { "epoch": 0.12210872372901256, "grad_norm": 2.4405057430267334, "learning_rate": 9.916398091420797e-05, "loss": 0.0796, "step": 2080 }, { "epoch": 0.12269578490078666, "grad_norm": 1.5867035388946533, "learning_rate": 9.915553886320317e-05, "loss": 0.06, "step": 2090 }, { "epoch": 0.12328284607256076, "grad_norm": 2.4386491775512695, "learning_rate": 9.914705476546875e-05, "loss": 0.043, "step": 2100 }, { "epoch": 0.12386990724433486, "grad_norm": 2.065495491027832, "learning_rate": 9.913852862826185e-05, "loss": 0.0352, "step": 2110 }, { "epoch": 0.12445696841610895, "grad_norm": 2.307507038116455, "learning_rate": 9.912996045887556e-05, "loss": 0.0348, "step": 2120 }, { "epoch": 0.12504402958788305, "grad_norm": 1.7963237762451172, "learning_rate": 9.912135026463895e-05, "loss": 0.0634, "step": 2130 }, { "epoch": 0.12563109075965714, "grad_norm": 2.4037539958953857, "learning_rate": 9.911269805291699e-05, "loss": 0.0472, "step": 2140 }, { "epoch": 0.12621815193143127, "grad_norm": 2.245288133621216, "learning_rate": 9.910400383111067e-05, "loss": 0.0429, "step": 2150 }, { "epoch": 0.12680521310320536, "grad_norm": 4.1519622802734375, "learning_rate": 9.909526760665682e-05, "loss": 0.0489, "step": 2160 }, { "epoch": 0.12739227427497946, "grad_norm": 2.5041582584381104, "learning_rate": 9.908648938702825e-05, "loss": 0.0571, "step": 2170 }, { "epoch": 0.12797933544675355, "grad_norm": 2.4531331062316895, "learning_rate": 9.90776691797337e-05, "loss": 0.0373, "step": 2180 }, { "epoch": 0.12856639661852765, "grad_norm": 2.2434446811676025, "learning_rate": 9.90688069923178e-05, "loss": 0.0427, "step": 2190 }, { "epoch": 0.12915345779030174, "grad_norm": 3.575446605682373, "learning_rate": 9.90599028323611e-05, "loss": 0.0774, "step": 2200 }, { "epoch": 0.12974051896207583, "grad_norm": 1.2064034938812256, "learning_rate": 9.905095670748005e-05, "loss": 0.0395, "step": 2210 }, { "epoch": 0.13032758013384996, "grad_norm": 1.6163547039031982, "learning_rate": 9.904196862532702e-05, "loss": 0.0319, "step": 2220 }, { "epoch": 0.13091464130562405, "grad_norm": 3.1924421787261963, "learning_rate": 9.903293859359023e-05, "loss": 0.0688, "step": 2230 }, { "epoch": 0.13150170247739815, "grad_norm": 1.619741439819336, "learning_rate": 9.902386661999379e-05, "loss": 0.0367, "step": 2240 }, { "epoch": 0.13208876364917224, "grad_norm": 8.740411758422852, "learning_rate": 9.901475271229772e-05, "loss": 0.0716, "step": 2250 }, { "epoch": 0.13267582482094634, "grad_norm": 2.1869382858276367, "learning_rate": 9.900559687829786e-05, "loss": 0.0332, "step": 2260 }, { "epoch": 0.13326288599272043, "grad_norm": 1.2823442220687866, "learning_rate": 9.899639912582596e-05, "loss": 0.0351, "step": 2270 }, { "epoch": 0.13384994716449455, "grad_norm": 1.6115927696228027, "learning_rate": 9.89871594627496e-05, "loss": 0.0401, "step": 2280 }, { "epoch": 0.13443700833626865, "grad_norm": 3.726031541824341, "learning_rate": 9.897787789697221e-05, "loss": 0.0517, "step": 2290 }, { "epoch": 0.13502406950804274, "grad_norm": 1.1957895755767822, "learning_rate": 9.896855443643308e-05, "loss": 0.0309, "step": 2300 }, { "epoch": 0.13561113067981684, "grad_norm": 2.5738515853881836, "learning_rate": 9.895918908910731e-05, "loss": 0.0342, "step": 2310 }, { "epoch": 0.13619819185159093, "grad_norm": 0.5897283554077148, "learning_rate": 9.894978186300585e-05, "loss": 0.0306, "step": 2320 }, { "epoch": 0.13678525302336503, "grad_norm": 1.397139549255371, "learning_rate": 9.894033276617547e-05, "loss": 0.0575, "step": 2330 }, { "epoch": 0.13737231419513912, "grad_norm": 3.719864845275879, "learning_rate": 9.893084180669873e-05, "loss": 0.037, "step": 2340 }, { "epoch": 0.13795937536691324, "grad_norm": 3.00734806060791, "learning_rate": 9.892130899269405e-05, "loss": 0.0457, "step": 2350 }, { "epoch": 0.13854643653868734, "grad_norm": 1.681618332862854, "learning_rate": 9.891173433231559e-05, "loss": 0.0437, "step": 2360 }, { "epoch": 0.13913349771046143, "grad_norm": 1.3892427682876587, "learning_rate": 9.890211783375338e-05, "loss": 0.049, "step": 2370 }, { "epoch": 0.13972055888223553, "grad_norm": 1.5249607563018799, "learning_rate": 9.889245950523315e-05, "loss": 0.0313, "step": 2380 }, { "epoch": 0.14030762005400962, "grad_norm": 1.1263105869293213, "learning_rate": 9.888275935501647e-05, "loss": 0.0289, "step": 2390 }, { "epoch": 0.14089468122578372, "grad_norm": 2.9068267345428467, "learning_rate": 9.887301739140066e-05, "loss": 0.0304, "step": 2400 }, { "epoch": 0.1414817423975578, "grad_norm": 3.913470506668091, "learning_rate": 9.886323362271882e-05, "loss": 0.0698, "step": 2410 }, { "epoch": 0.14206880356933194, "grad_norm": 4.11099910736084, "learning_rate": 9.88534080573398e-05, "loss": 0.0468, "step": 2420 }, { "epoch": 0.14265586474110603, "grad_norm": 1.2189699411392212, "learning_rate": 9.884354070366822e-05, "loss": 0.0537, "step": 2430 }, { "epoch": 0.14324292591288013, "grad_norm": 1.7198377847671509, "learning_rate": 9.883363157014442e-05, "loss": 0.0648, "step": 2440 }, { "epoch": 0.14382998708465422, "grad_norm": 2.9167282581329346, "learning_rate": 9.882368066524448e-05, "loss": 0.0611, "step": 2450 }, { "epoch": 0.14441704825642832, "grad_norm": 2.9056789875030518, "learning_rate": 9.881368799748021e-05, "loss": 0.0394, "step": 2460 }, { "epoch": 0.1450041094282024, "grad_norm": 1.222402811050415, "learning_rate": 9.880365357539917e-05, "loss": 0.0432, "step": 2470 }, { "epoch": 0.1455911705999765, "grad_norm": 2.0323326587677, "learning_rate": 9.879357740758462e-05, "loss": 0.0323, "step": 2480 }, { "epoch": 0.14617823177175063, "grad_norm": 0.8139905333518982, "learning_rate": 9.878345950265552e-05, "loss": 0.0462, "step": 2490 }, { "epoch": 0.14676529294352472, "grad_norm": 2.266211986541748, "learning_rate": 9.877329986926653e-05, "loss": 0.0525, "step": 2500 }, { "epoch": 0.14735235411529882, "grad_norm": 1.2149319648742676, "learning_rate": 9.876309851610801e-05, "loss": 0.0648, "step": 2510 }, { "epoch": 0.1479394152870729, "grad_norm": 3.378032922744751, "learning_rate": 9.875285545190603e-05, "loss": 0.0435, "step": 2520 }, { "epoch": 0.148526476458847, "grad_norm": 2.399960994720459, "learning_rate": 9.874257068542227e-05, "loss": 0.0452, "step": 2530 }, { "epoch": 0.1491135376306211, "grad_norm": 0.4165957272052765, "learning_rate": 9.873224422545417e-05, "loss": 0.0454, "step": 2540 }, { "epoch": 0.1497005988023952, "grad_norm": 2.799889087677002, "learning_rate": 9.872187608083478e-05, "loss": 0.0626, "step": 2550 }, { "epoch": 0.15028765997416932, "grad_norm": 4.759644508361816, "learning_rate": 9.871146626043282e-05, "loss": 0.0394, "step": 2560 }, { "epoch": 0.1508747211459434, "grad_norm": 2.0836448669433594, "learning_rate": 9.870101477315263e-05, "loss": 0.0443, "step": 2570 }, { "epoch": 0.1514617823177175, "grad_norm": 3.5232982635498047, "learning_rate": 9.869052162793424e-05, "loss": 0.069, "step": 2580 }, { "epoch": 0.1520488434894916, "grad_norm": 5.089994430541992, "learning_rate": 9.867998683375329e-05, "loss": 0.0566, "step": 2590 }, { "epoch": 0.1526359046612657, "grad_norm": 2.8544843196868896, "learning_rate": 9.866941039962104e-05, "loss": 0.0492, "step": 2600 }, { "epoch": 0.1532229658330398, "grad_norm": 3.8631744384765625, "learning_rate": 9.865879233458438e-05, "loss": 0.0381, "step": 2610 }, { "epoch": 0.15381002700481392, "grad_norm": 0.28329482674598694, "learning_rate": 9.86481326477258e-05, "loss": 0.0337, "step": 2620 }, { "epoch": 0.154397088176588, "grad_norm": 5.953649044036865, "learning_rate": 9.863743134816342e-05, "loss": 0.0877, "step": 2630 }, { "epoch": 0.1549841493483621, "grad_norm": 1.7450274229049683, "learning_rate": 9.862668844505087e-05, "loss": 0.033, "step": 2640 }, { "epoch": 0.1555712105201362, "grad_norm": 1.802662968635559, "learning_rate": 9.86159039475775e-05, "loss": 0.0534, "step": 2650 }, { "epoch": 0.1561582716919103, "grad_norm": 3.2051076889038086, "learning_rate": 9.86050778649681e-05, "loss": 0.0614, "step": 2660 }, { "epoch": 0.1567453328636844, "grad_norm": 1.5251544713974, "learning_rate": 9.859421020648317e-05, "loss": 0.0462, "step": 2670 }, { "epoch": 0.15733239403545848, "grad_norm": 4.515710830688477, "learning_rate": 9.858330098141866e-05, "loss": 0.0397, "step": 2680 }, { "epoch": 0.1579194552072326, "grad_norm": 2.036057233810425, "learning_rate": 9.857235019910611e-05, "loss": 0.0555, "step": 2690 }, { "epoch": 0.1585065163790067, "grad_norm": 3.6127116680145264, "learning_rate": 9.856135786891265e-05, "loss": 0.049, "step": 2700 }, { "epoch": 0.1590935775507808, "grad_norm": 1.9806824922561646, "learning_rate": 9.855032400024089e-05, "loss": 0.0474, "step": 2710 }, { "epoch": 0.1596806387225549, "grad_norm": 1.8766403198242188, "learning_rate": 9.853924860252898e-05, "loss": 0.046, "step": 2720 }, { "epoch": 0.16026769989432899, "grad_norm": 3.2558913230895996, "learning_rate": 9.852813168525064e-05, "loss": 0.031, "step": 2730 }, { "epoch": 0.16085476106610308, "grad_norm": 3.1267333030700684, "learning_rate": 9.851697325791505e-05, "loss": 0.0531, "step": 2740 }, { "epoch": 0.16144182223787718, "grad_norm": 2.883910894393921, "learning_rate": 9.850577333006693e-05, "loss": 0.0829, "step": 2750 }, { "epoch": 0.1620288834096513, "grad_norm": 1.4868563413619995, "learning_rate": 9.84945319112865e-05, "loss": 0.0369, "step": 2760 }, { "epoch": 0.1626159445814254, "grad_norm": 1.7794392108917236, "learning_rate": 9.848324901118943e-05, "loss": 0.0447, "step": 2770 }, { "epoch": 0.1632030057531995, "grad_norm": 4.2652692794799805, "learning_rate": 9.847192463942694e-05, "loss": 0.0315, "step": 2780 }, { "epoch": 0.16379006692497358, "grad_norm": 1.231309413909912, "learning_rate": 9.846055880568566e-05, "loss": 0.0454, "step": 2790 }, { "epoch": 0.16437712809674768, "grad_norm": 1.4655780792236328, "learning_rate": 9.844915151968773e-05, "loss": 0.0717, "step": 2800 }, { "epoch": 0.16496418926852177, "grad_norm": 3.672654390335083, "learning_rate": 9.843770279119069e-05, "loss": 0.0501, "step": 2810 }, { "epoch": 0.16555125044029587, "grad_norm": 3.1150739192962646, "learning_rate": 9.842621262998761e-05, "loss": 0.0567, "step": 2820 }, { "epoch": 0.16613831161207, "grad_norm": 2.981383800506592, "learning_rate": 9.841468104590695e-05, "loss": 0.0298, "step": 2830 }, { "epoch": 0.16672537278384408, "grad_norm": 2.442925214767456, "learning_rate": 9.840310804881261e-05, "loss": 0.045, "step": 2840 }, { "epoch": 0.16731243395561818, "grad_norm": 1.6846442222595215, "learning_rate": 9.839149364860389e-05, "loss": 0.0526, "step": 2850 }, { "epoch": 0.16789949512739227, "grad_norm": 0.9890480637550354, "learning_rate": 9.837983785521559e-05, "loss": 0.0292, "step": 2860 }, { "epoch": 0.16848655629916637, "grad_norm": 1.3169161081314087, "learning_rate": 9.83681406786178e-05, "loss": 0.0278, "step": 2870 }, { "epoch": 0.16907361747094046, "grad_norm": 2.093355894088745, "learning_rate": 9.835640212881608e-05, "loss": 0.0343, "step": 2880 }, { "epoch": 0.16966067864271456, "grad_norm": 1.1763924360275269, "learning_rate": 9.834462221585139e-05, "loss": 0.0638, "step": 2890 }, { "epoch": 0.17024773981448868, "grad_norm": 3.9817452430725098, "learning_rate": 9.833280094980002e-05, "loss": 0.038, "step": 2900 }, { "epoch": 0.17083480098626277, "grad_norm": 4.304747104644775, "learning_rate": 9.832093834077367e-05, "loss": 0.0494, "step": 2910 }, { "epoch": 0.17142186215803687, "grad_norm": 1.3423402309417725, "learning_rate": 9.83090343989194e-05, "loss": 0.0425, "step": 2920 }, { "epoch": 0.17200892332981096, "grad_norm": 0.36195164918899536, "learning_rate": 9.829708913441962e-05, "loss": 0.0492, "step": 2930 }, { "epoch": 0.17259598450158506, "grad_norm": 1.1042572259902954, "learning_rate": 9.828510255749208e-05, "loss": 0.055, "step": 2940 }, { "epoch": 0.17318304567335915, "grad_norm": 2.505880355834961, "learning_rate": 9.827307467838987e-05, "loss": 0.0296, "step": 2950 }, { "epoch": 0.17377010684513328, "grad_norm": 3.9850759506225586, "learning_rate": 9.826100550740143e-05, "loss": 0.0679, "step": 2960 }, { "epoch": 0.17435716801690737, "grad_norm": 0.4390197694301605, "learning_rate": 9.824889505485048e-05, "loss": 0.0392, "step": 2970 }, { "epoch": 0.17494422918868147, "grad_norm": 3.0325474739074707, "learning_rate": 9.823674333109608e-05, "loss": 0.0519, "step": 2980 }, { "epoch": 0.17553129036045556, "grad_norm": 1.549139380455017, "learning_rate": 9.82245503465326e-05, "loss": 0.0487, "step": 2990 }, { "epoch": 0.17611835153222966, "grad_norm": 1.021997094154358, "learning_rate": 9.821231611158969e-05, "loss": 0.0475, "step": 3000 }, { "epoch": 0.17611835153222966, "eval_loss": 0.4372119903564453, "eval_runtime": 269.7559, "eval_samples_per_second": 3.503, "eval_steps_per_second": 3.503, "step": 3000 }, { "epoch": 0.17670541270400375, "grad_norm": 6.882436275482178, "learning_rate": 9.820004063673228e-05, "loss": 0.054, "step": 3010 }, { "epoch": 0.17729247387577785, "grad_norm": 3.3868536949157715, "learning_rate": 9.818772393246058e-05, "loss": 0.0398, "step": 3020 }, { "epoch": 0.17787953504755197, "grad_norm": 0.5208640694618225, "learning_rate": 9.817536600931007e-05, "loss": 0.0301, "step": 3030 }, { "epoch": 0.17846659621932606, "grad_norm": 0.7582460045814514, "learning_rate": 9.81629668778515e-05, "loss": 0.0741, "step": 3040 }, { "epoch": 0.17905365739110016, "grad_norm": 2.827852964401245, "learning_rate": 9.815052654869084e-05, "loss": 0.0484, "step": 3050 }, { "epoch": 0.17964071856287425, "grad_norm": 3.6020514965057373, "learning_rate": 9.813804503246932e-05, "loss": 0.0369, "step": 3060 }, { "epoch": 0.18022777973464835, "grad_norm": 2.883612632751465, "learning_rate": 9.812552233986338e-05, "loss": 0.078, "step": 3070 }, { "epoch": 0.18081484090642244, "grad_norm": 3.0130109786987305, "learning_rate": 9.811295848158472e-05, "loss": 0.0429, "step": 3080 }, { "epoch": 0.18140190207819654, "grad_norm": 2.726775646209717, "learning_rate": 9.810035346838023e-05, "loss": 0.039, "step": 3090 }, { "epoch": 0.18198896324997066, "grad_norm": 1.5122536420822144, "learning_rate": 9.8087707311032e-05, "loss": 0.0394, "step": 3100 }, { "epoch": 0.18257602442174475, "grad_norm": 1.2344636917114258, "learning_rate": 9.807502002035729e-05, "loss": 0.0362, "step": 3110 }, { "epoch": 0.18316308559351885, "grad_norm": 1.4892998933792114, "learning_rate": 9.80622916072086e-05, "loss": 0.0367, "step": 3120 }, { "epoch": 0.18375014676529294, "grad_norm": 0.42101845145225525, "learning_rate": 9.804952208247358e-05, "loss": 0.0818, "step": 3130 }, { "epoch": 0.18433720793706704, "grad_norm": 1.3625999689102173, "learning_rate": 9.803671145707502e-05, "loss": 0.0472, "step": 3140 }, { "epoch": 0.18492426910884113, "grad_norm": 1.7136095762252808, "learning_rate": 9.80238597419709e-05, "loss": 0.0401, "step": 3150 }, { "epoch": 0.18551133028061523, "grad_norm": 4.23172664642334, "learning_rate": 9.801096694815435e-05, "loss": 0.0521, "step": 3160 }, { "epoch": 0.18609839145238935, "grad_norm": 1.758823037147522, "learning_rate": 9.799803308665362e-05, "loss": 0.0273, "step": 3170 }, { "epoch": 0.18668545262416344, "grad_norm": 1.8653748035430908, "learning_rate": 9.798505816853208e-05, "loss": 0.041, "step": 3180 }, { "epoch": 0.18727251379593754, "grad_norm": 3.101517677307129, "learning_rate": 9.797204220488823e-05, "loss": 0.0409, "step": 3190 }, { "epoch": 0.18785957496771163, "grad_norm": 1.3658429384231567, "learning_rate": 9.795898520685569e-05, "loss": 0.0265, "step": 3200 }, { "epoch": 0.18844663613948573, "grad_norm": 2.253669500350952, "learning_rate": 9.794588718560319e-05, "loss": 0.0267, "step": 3210 }, { "epoch": 0.18903369731125982, "grad_norm": 1.501289963722229, "learning_rate": 9.793274815233451e-05, "loss": 0.0544, "step": 3220 }, { "epoch": 0.18962075848303392, "grad_norm": 3.602738618850708, "learning_rate": 9.791956811828855e-05, "loss": 0.0646, "step": 3230 }, { "epoch": 0.19020781965480804, "grad_norm": 0.38867709040641785, "learning_rate": 9.790634709473924e-05, "loss": 0.0511, "step": 3240 }, { "epoch": 0.19079488082658214, "grad_norm": 2.0799927711486816, "learning_rate": 9.789308509299562e-05, "loss": 0.0828, "step": 3250 }, { "epoch": 0.19138194199835623, "grad_norm": 3.6796083450317383, "learning_rate": 9.787978212440176e-05, "loss": 0.0487, "step": 3260 }, { "epoch": 0.19196900317013033, "grad_norm": 4.739702224731445, "learning_rate": 9.786643820033674e-05, "loss": 0.0409, "step": 3270 }, { "epoch": 0.19255606434190442, "grad_norm": 3.3475122451782227, "learning_rate": 9.785305333221474e-05, "loss": 0.0451, "step": 3280 }, { "epoch": 0.19314312551367852, "grad_norm": 1.3186380863189697, "learning_rate": 9.78396275314849e-05, "loss": 0.0375, "step": 3290 }, { "epoch": 0.19373018668545264, "grad_norm": 4.777583122253418, "learning_rate": 9.782616080963143e-05, "loss": 0.0564, "step": 3300 }, { "epoch": 0.19431724785722673, "grad_norm": 2.913907051086426, "learning_rate": 9.781265317817347e-05, "loss": 0.0492, "step": 3310 }, { "epoch": 0.19490430902900083, "grad_norm": 0.868087649345398, "learning_rate": 9.779910464866523e-05, "loss": 0.0491, "step": 3320 }, { "epoch": 0.19549137020077492, "grad_norm": 1.7826279401779175, "learning_rate": 9.778551523269586e-05, "loss": 0.0515, "step": 3330 }, { "epoch": 0.19607843137254902, "grad_norm": 2.674586296081543, "learning_rate": 9.777188494188948e-05, "loss": 0.0635, "step": 3340 }, { "epoch": 0.1966654925443231, "grad_norm": 1.7887827157974243, "learning_rate": 9.775821378790519e-05, "loss": 0.043, "step": 3350 }, { "epoch": 0.1972525537160972, "grad_norm": 0.4741100072860718, "learning_rate": 9.774450178243706e-05, "loss": 0.0551, "step": 3360 }, { "epoch": 0.19783961488787133, "grad_norm": 4.381477355957031, "learning_rate": 9.773074893721407e-05, "loss": 0.0437, "step": 3370 }, { "epoch": 0.19842667605964542, "grad_norm": 3.6629550457000732, "learning_rate": 9.771695526400013e-05, "loss": 0.0527, "step": 3380 }, { "epoch": 0.19901373723141952, "grad_norm": 3.1709678173065186, "learning_rate": 9.770312077459411e-05, "loss": 0.0477, "step": 3390 }, { "epoch": 0.1996007984031936, "grad_norm": 3.2368292808532715, "learning_rate": 9.768924548082979e-05, "loss": 0.0583, "step": 3400 }, { "epoch": 0.2001878595749677, "grad_norm": 4.261107921600342, "learning_rate": 9.76753293945758e-05, "loss": 0.0682, "step": 3410 }, { "epoch": 0.2007749207467418, "grad_norm": 0.5269186496734619, "learning_rate": 9.766137252773572e-05, "loss": 0.0341, "step": 3420 }, { "epoch": 0.2013619819185159, "grad_norm": 0.7132035493850708, "learning_rate": 9.764737489224799e-05, "loss": 0.0636, "step": 3430 }, { "epoch": 0.20194904309029002, "grad_norm": 1.4716511964797974, "learning_rate": 9.763333650008593e-05, "loss": 0.047, "step": 3440 }, { "epoch": 0.20253610426206411, "grad_norm": 0.8930634260177612, "learning_rate": 9.76192573632577e-05, "loss": 0.0346, "step": 3450 }, { "epoch": 0.2031231654338382, "grad_norm": 3.027501344680786, "learning_rate": 9.760513749380635e-05, "loss": 0.0553, "step": 3460 }, { "epoch": 0.2037102266056123, "grad_norm": 1.5398187637329102, "learning_rate": 9.759097690380976e-05, "loss": 0.0426, "step": 3470 }, { "epoch": 0.2042972877773864, "grad_norm": 4.19457483291626, "learning_rate": 9.757677560538061e-05, "loss": 0.0463, "step": 3480 }, { "epoch": 0.2048843489491605, "grad_norm": 1.2234585285186768, "learning_rate": 9.756253361066643e-05, "loss": 0.06, "step": 3490 }, { "epoch": 0.2054714101209346, "grad_norm": 5.896811008453369, "learning_rate": 9.754825093184958e-05, "loss": 0.0657, "step": 3500 }, { "epoch": 0.2060584712927087, "grad_norm": 1.2498093843460083, "learning_rate": 9.753392758114718e-05, "loss": 0.0586, "step": 3510 }, { "epoch": 0.2066455324644828, "grad_norm": 2.946573495864868, "learning_rate": 9.751956357081115e-05, "loss": 0.055, "step": 3520 }, { "epoch": 0.2072325936362569, "grad_norm": 1.8234636783599854, "learning_rate": 9.750515891312819e-05, "loss": 0.0415, "step": 3530 }, { "epoch": 0.207819654808031, "grad_norm": 2.5186047554016113, "learning_rate": 9.749071362041981e-05, "loss": 0.0572, "step": 3540 }, { "epoch": 0.2084067159798051, "grad_norm": 2.565370798110962, "learning_rate": 9.747622770504221e-05, "loss": 0.0623, "step": 3550 }, { "epoch": 0.20899377715157919, "grad_norm": 0.7187139391899109, "learning_rate": 9.746170117938638e-05, "loss": 0.0672, "step": 3560 }, { "epoch": 0.20958083832335328, "grad_norm": 1.5226011276245117, "learning_rate": 9.744713405587804e-05, "loss": 0.0674, "step": 3570 }, { "epoch": 0.2101678994951274, "grad_norm": 2.0840280055999756, "learning_rate": 9.743252634697767e-05, "loss": 0.0365, "step": 3580 }, { "epoch": 0.2107549606669015, "grad_norm": 3.5004069805145264, "learning_rate": 9.741787806518035e-05, "loss": 0.0414, "step": 3590 }, { "epoch": 0.2113420218386756, "grad_norm": 2.170786142349243, "learning_rate": 9.740318922301602e-05, "loss": 0.0421, "step": 3600 }, { "epoch": 0.2119290830104497, "grad_norm": 1.3566417694091797, "learning_rate": 9.738845983304921e-05, "loss": 0.0337, "step": 3610 }, { "epoch": 0.21251614418222378, "grad_norm": 0.991325318813324, "learning_rate": 9.737368990787916e-05, "loss": 0.0429, "step": 3620 }, { "epoch": 0.21310320535399788, "grad_norm": 2.2573013305664062, "learning_rate": 9.735887946013982e-05, "loss": 0.0424, "step": 3630 }, { "epoch": 0.213690266525772, "grad_norm": 3.2587900161743164, "learning_rate": 9.734402850249973e-05, "loss": 0.0472, "step": 3640 }, { "epoch": 0.2142773276975461, "grad_norm": 2.0947930812835693, "learning_rate": 9.732913704766216e-05, "loss": 0.0409, "step": 3650 }, { "epoch": 0.2148643888693202, "grad_norm": 1.4325376749038696, "learning_rate": 9.731420510836494e-05, "loss": 0.0279, "step": 3660 }, { "epoch": 0.21545145004109428, "grad_norm": 2.6911299228668213, "learning_rate": 9.729923269738062e-05, "loss": 0.0498, "step": 3670 }, { "epoch": 0.21603851121286838, "grad_norm": 3.6013898849487305, "learning_rate": 9.728421982751628e-05, "loss": 0.0534, "step": 3680 }, { "epoch": 0.21662557238464247, "grad_norm": 2.846738576889038, "learning_rate": 9.726916651161367e-05, "loss": 0.0426, "step": 3690 }, { "epoch": 0.21721263355641657, "grad_norm": 1.4632606506347656, "learning_rate": 9.725407276254909e-05, "loss": 0.0434, "step": 3700 }, { "epoch": 0.2177996947281907, "grad_norm": 1.5441242456436157, "learning_rate": 9.723893859323348e-05, "loss": 0.046, "step": 3710 }, { "epoch": 0.21838675589996479, "grad_norm": 3.459610939025879, "learning_rate": 9.722376401661233e-05, "loss": 0.0638, "step": 3720 }, { "epoch": 0.21897381707173888, "grad_norm": 1.4089329242706299, "learning_rate": 9.720854904566566e-05, "loss": 0.0545, "step": 3730 }, { "epoch": 0.21956087824351297, "grad_norm": 2.2323861122131348, "learning_rate": 9.71932936934081e-05, "loss": 0.0362, "step": 3740 }, { "epoch": 0.22014793941528707, "grad_norm": 2.6593523025512695, "learning_rate": 9.717799797288877e-05, "loss": 0.0515, "step": 3750 }, { "epoch": 0.22073500058706116, "grad_norm": 3.250030755996704, "learning_rate": 9.716266189719136e-05, "loss": 0.0502, "step": 3760 }, { "epoch": 0.22132206175883526, "grad_norm": 1.6188584566116333, "learning_rate": 9.714728547943405e-05, "loss": 0.0603, "step": 3770 }, { "epoch": 0.22190912293060938, "grad_norm": 2.989541530609131, "learning_rate": 9.713186873276955e-05, "loss": 0.0725, "step": 3780 }, { "epoch": 0.22249618410238348, "grad_norm": 1.4514538049697876, "learning_rate": 9.711641167038506e-05, "loss": 0.0487, "step": 3790 }, { "epoch": 0.22308324527415757, "grad_norm": 1.2434719800949097, "learning_rate": 9.710091430550224e-05, "loss": 0.0434, "step": 3800 }, { "epoch": 0.22367030644593167, "grad_norm": 3.2450573444366455, "learning_rate": 9.708537665137727e-05, "loss": 0.0575, "step": 3810 }, { "epoch": 0.22425736761770576, "grad_norm": 1.1823923587799072, "learning_rate": 9.706979872130077e-05, "loss": 0.0542, "step": 3820 }, { "epoch": 0.22484442878947986, "grad_norm": 2.7143499851226807, "learning_rate": 9.70541805285978e-05, "loss": 0.0447, "step": 3830 }, { "epoch": 0.22543148996125395, "grad_norm": 0.8574681282043457, "learning_rate": 9.703852208662786e-05, "loss": 0.0438, "step": 3840 }, { "epoch": 0.22601855113302807, "grad_norm": 1.8387888669967651, "learning_rate": 9.702282340878493e-05, "loss": 0.0439, "step": 3850 }, { "epoch": 0.22660561230480217, "grad_norm": 4.398098468780518, "learning_rate": 9.700708450849732e-05, "loss": 0.0438, "step": 3860 }, { "epoch": 0.22719267347657626, "grad_norm": 2.6266028881073, "learning_rate": 9.69913053992278e-05, "loss": 0.0772, "step": 3870 }, { "epoch": 0.22777973464835036, "grad_norm": 1.9431968927383423, "learning_rate": 9.697548609447355e-05, "loss": 0.0633, "step": 3880 }, { "epoch": 0.22836679582012445, "grad_norm": 3.3955740928649902, "learning_rate": 9.695962660776607e-05, "loss": 0.0491, "step": 3890 }, { "epoch": 0.22895385699189855, "grad_norm": 1.481558918952942, "learning_rate": 9.694372695267131e-05, "loss": 0.0289, "step": 3900 }, { "epoch": 0.22954091816367264, "grad_norm": 1.9872463941574097, "learning_rate": 9.692778714278952e-05, "loss": 0.0392, "step": 3910 }, { "epoch": 0.23012797933544676, "grad_norm": 2.365849733352661, "learning_rate": 9.69118071917553e-05, "loss": 0.0416, "step": 3920 }, { "epoch": 0.23071504050722086, "grad_norm": 1.6903983354568481, "learning_rate": 9.689578711323761e-05, "loss": 0.0867, "step": 3930 }, { "epoch": 0.23130210167899495, "grad_norm": 2.933515787124634, "learning_rate": 9.687972692093973e-05, "loss": 0.0453, "step": 3940 }, { "epoch": 0.23188916285076905, "grad_norm": 1.2972553968429565, "learning_rate": 9.686362662859927e-05, "loss": 0.0307, "step": 3950 }, { "epoch": 0.23247622402254314, "grad_norm": 1.4731982946395874, "learning_rate": 9.68474862499881e-05, "loss": 0.0461, "step": 3960 }, { "epoch": 0.23306328519431724, "grad_norm": 2.5573713779449463, "learning_rate": 9.683130579891238e-05, "loss": 0.0514, "step": 3970 }, { "epoch": 0.23365034636609136, "grad_norm": 0.724399983882904, "learning_rate": 9.68150852892126e-05, "loss": 0.0447, "step": 3980 }, { "epoch": 0.23423740753786546, "grad_norm": 1.0394184589385986, "learning_rate": 9.679882473476344e-05, "loss": 0.0422, "step": 3990 }, { "epoch": 0.23482446870963955, "grad_norm": 2.5117719173431396, "learning_rate": 9.67825241494739e-05, "loss": 0.0565, "step": 4000 }, { "epoch": 0.23541152988141364, "grad_norm": 1.7553863525390625, "learning_rate": 9.676618354728722e-05, "loss": 0.0574, "step": 4010 }, { "epoch": 0.23599859105318774, "grad_norm": 1.3056498765945435, "learning_rate": 9.67498029421808e-05, "loss": 0.0361, "step": 4020 }, { "epoch": 0.23658565222496183, "grad_norm": 3.3928043842315674, "learning_rate": 9.673338234816632e-05, "loss": 0.0842, "step": 4030 }, { "epoch": 0.23717271339673593, "grad_norm": 0.8964785933494568, "learning_rate": 9.671692177928966e-05, "loss": 0.0488, "step": 4040 }, { "epoch": 0.23775977456851005, "grad_norm": 1.5078647136688232, "learning_rate": 9.670042124963087e-05, "loss": 0.0474, "step": 4050 }, { "epoch": 0.23834683574028415, "grad_norm": 1.2007118463516235, "learning_rate": 9.668388077330421e-05, "loss": 0.0445, "step": 4060 }, { "epoch": 0.23893389691205824, "grad_norm": 3.7927136421203613, "learning_rate": 9.666730036445809e-05, "loss": 0.0506, "step": 4070 }, { "epoch": 0.23952095808383234, "grad_norm": 2.2696220874786377, "learning_rate": 9.665068003727507e-05, "loss": 0.0663, "step": 4080 }, { "epoch": 0.24010801925560643, "grad_norm": 0.5591484308242798, "learning_rate": 9.663401980597188e-05, "loss": 0.0292, "step": 4090 }, { "epoch": 0.24069508042738053, "grad_norm": 4.547415256500244, "learning_rate": 9.661731968479936e-05, "loss": 0.053, "step": 4100 }, { "epoch": 0.24128214159915462, "grad_norm": 2.9865922927856445, "learning_rate": 9.660057968804249e-05, "loss": 0.0562, "step": 4110 }, { "epoch": 0.24186920277092874, "grad_norm": 5.648647308349609, "learning_rate": 9.658379983002035e-05, "loss": 0.0488, "step": 4120 }, { "epoch": 0.24245626394270284, "grad_norm": 0.5779200792312622, "learning_rate": 9.65669801250861e-05, "loss": 0.0637, "step": 4130 }, { "epoch": 0.24304332511447693, "grad_norm": 2.952974796295166, "learning_rate": 9.655012058762703e-05, "loss": 0.054, "step": 4140 }, { "epoch": 0.24363038628625103, "grad_norm": 1.615088939666748, "learning_rate": 9.653322123206445e-05, "loss": 0.0583, "step": 4150 }, { "epoch": 0.24421744745802512, "grad_norm": 2.245837926864624, "learning_rate": 9.651628207285377e-05, "loss": 0.0692, "step": 4160 }, { "epoch": 0.24480450862979922, "grad_norm": 1.3981366157531738, "learning_rate": 9.649930312448441e-05, "loss": 0.0409, "step": 4170 }, { "epoch": 0.2453915698015733, "grad_norm": 1.2225673198699951, "learning_rate": 9.648228440147987e-05, "loss": 0.0634, "step": 4180 }, { "epoch": 0.24597863097334743, "grad_norm": 2.334582805633545, "learning_rate": 9.646522591839764e-05, "loss": 0.0557, "step": 4190 }, { "epoch": 0.24656569214512153, "grad_norm": 1.7163954973220825, "learning_rate": 9.64481276898292e-05, "loss": 0.0529, "step": 4200 }, { "epoch": 0.24715275331689562, "grad_norm": 1.703839898109436, "learning_rate": 9.64309897304001e-05, "loss": 0.0635, "step": 4210 }, { "epoch": 0.24773981448866972, "grad_norm": 2.077329397201538, "learning_rate": 9.641381205476981e-05, "loss": 0.0439, "step": 4220 }, { "epoch": 0.2483268756604438, "grad_norm": 1.3502600193023682, "learning_rate": 9.639659467763178e-05, "loss": 0.0608, "step": 4230 }, { "epoch": 0.2489139368322179, "grad_norm": 1.0238196849822998, "learning_rate": 9.637933761371345e-05, "loss": 0.0552, "step": 4240 }, { "epoch": 0.249500998003992, "grad_norm": 1.4089394807815552, "learning_rate": 9.636204087777618e-05, "loss": 0.0359, "step": 4250 }, { "epoch": 0.2500880591757661, "grad_norm": 1.6741256713867188, "learning_rate": 9.63447044846153e-05, "loss": 0.0412, "step": 4260 }, { "epoch": 0.2506751203475402, "grad_norm": 3.5649518966674805, "learning_rate": 9.632732844906e-05, "loss": 0.0518, "step": 4270 }, { "epoch": 0.2512621815193143, "grad_norm": 1.3733388185501099, "learning_rate": 9.630991278597344e-05, "loss": 0.0381, "step": 4280 }, { "epoch": 0.25184924269108844, "grad_norm": 3.6348133087158203, "learning_rate": 9.629245751025262e-05, "loss": 0.0884, "step": 4290 }, { "epoch": 0.25243630386286253, "grad_norm": 0.48154565691947937, "learning_rate": 9.62749626368285e-05, "loss": 0.0319, "step": 4300 }, { "epoch": 0.2530233650346366, "grad_norm": 1.678483486175537, "learning_rate": 9.625742818066586e-05, "loss": 0.0802, "step": 4310 }, { "epoch": 0.2536104262064107, "grad_norm": 1.2315188646316528, "learning_rate": 9.623985415676332e-05, "loss": 0.05, "step": 4320 }, { "epoch": 0.2541974873781848, "grad_norm": 1.968169093132019, "learning_rate": 9.622224058015339e-05, "loss": 0.0621, "step": 4330 }, { "epoch": 0.2547845485499589, "grad_norm": 3.186601161956787, "learning_rate": 9.62045874659024e-05, "loss": 0.0751, "step": 4340 }, { "epoch": 0.255371609721733, "grad_norm": 2.511409044265747, "learning_rate": 9.618689482911047e-05, "loss": 0.0486, "step": 4350 }, { "epoch": 0.2559586708935071, "grad_norm": 1.326047420501709, "learning_rate": 9.616916268491158e-05, "loss": 0.057, "step": 4360 }, { "epoch": 0.2565457320652812, "grad_norm": 2.203920602798462, "learning_rate": 9.615139104847348e-05, "loss": 0.054, "step": 4370 }, { "epoch": 0.2571327932370553, "grad_norm": 1.5300490856170654, "learning_rate": 9.613357993499766e-05, "loss": 0.0465, "step": 4380 }, { "epoch": 0.2577198544088294, "grad_norm": 4.299283981323242, "learning_rate": 9.611572935971941e-05, "loss": 0.0615, "step": 4390 }, { "epoch": 0.2583069155806035, "grad_norm": 2.623377561569214, "learning_rate": 9.609783933790784e-05, "loss": 0.0398, "step": 4400 }, { "epoch": 0.2588939767523776, "grad_norm": 1.9349173307418823, "learning_rate": 9.607990988486568e-05, "loss": 0.0352, "step": 4410 }, { "epoch": 0.25948103792415167, "grad_norm": 0.9777126908302307, "learning_rate": 9.606194101592947e-05, "loss": 0.0654, "step": 4420 }, { "epoch": 0.2600680990959258, "grad_norm": 2.232889175415039, "learning_rate": 9.604393274646945e-05, "loss": 0.0337, "step": 4430 }, { "epoch": 0.2606551602676999, "grad_norm": 3.5517923831939697, "learning_rate": 9.602588509188954e-05, "loss": 0.0389, "step": 4440 }, { "epoch": 0.261242221439474, "grad_norm": 5.011184215545654, "learning_rate": 9.600779806762738e-05, "loss": 0.0557, "step": 4450 }, { "epoch": 0.2618292826112481, "grad_norm": 3.305150270462036, "learning_rate": 9.59896716891543e-05, "loss": 0.0456, "step": 4460 }, { "epoch": 0.2624163437830222, "grad_norm": 5.539762496948242, "learning_rate": 9.59715059719752e-05, "loss": 0.0561, "step": 4470 }, { "epoch": 0.2630034049547963, "grad_norm": 2.27252197265625, "learning_rate": 9.595330093162876e-05, "loss": 0.0441, "step": 4480 }, { "epoch": 0.2635904661265704, "grad_norm": 4.284955024719238, "learning_rate": 9.593505658368718e-05, "loss": 0.0701, "step": 4490 }, { "epoch": 0.2641775272983445, "grad_norm": 2.516587018966675, "learning_rate": 9.591677294375636e-05, "loss": 0.0651, "step": 4500 }, { "epoch": 0.2647645884701186, "grad_norm": 2.2761285305023193, "learning_rate": 9.58984500274758e-05, "loss": 0.0501, "step": 4510 }, { "epoch": 0.2653516496418927, "grad_norm": 0.8737339377403259, "learning_rate": 9.588008785051854e-05, "loss": 0.0464, "step": 4520 }, { "epoch": 0.26593871081366677, "grad_norm": 2.2878708839416504, "learning_rate": 9.586168642859128e-05, "loss": 0.0317, "step": 4530 }, { "epoch": 0.26652577198544086, "grad_norm": 2.6433846950531006, "learning_rate": 9.584324577743425e-05, "loss": 0.0861, "step": 4540 }, { "epoch": 0.26711283315721496, "grad_norm": 3.8544507026672363, "learning_rate": 9.582476591282119e-05, "loss": 0.0574, "step": 4550 }, { "epoch": 0.2676998943289891, "grad_norm": 1.556267499923706, "learning_rate": 9.58062468505595e-05, "loss": 0.0326, "step": 4560 }, { "epoch": 0.2682869555007632, "grad_norm": 1.2943931818008423, "learning_rate": 9.578768860649e-05, "loss": 0.0524, "step": 4570 }, { "epoch": 0.2688740166725373, "grad_norm": 2.2922072410583496, "learning_rate": 9.576909119648705e-05, "loss": 0.0417, "step": 4580 }, { "epoch": 0.2694610778443114, "grad_norm": 3.72062611579895, "learning_rate": 9.575045463645858e-05, "loss": 0.059, "step": 4590 }, { "epoch": 0.2700481390160855, "grad_norm": 2.0806610584259033, "learning_rate": 9.573177894234591e-05, "loss": 0.0952, "step": 4600 }, { "epoch": 0.2706352001878596, "grad_norm": 3.698206663131714, "learning_rate": 9.571306413012388e-05, "loss": 0.0572, "step": 4610 }, { "epoch": 0.2712222613596337, "grad_norm": 0.6957169771194458, "learning_rate": 9.569431021580082e-05, "loss": 0.0431, "step": 4620 }, { "epoch": 0.27180932253140777, "grad_norm": 3.8748273849487305, "learning_rate": 9.567551721541846e-05, "loss": 0.0646, "step": 4630 }, { "epoch": 0.27239638370318187, "grad_norm": 2.4940974712371826, "learning_rate": 9.565668514505199e-05, "loss": 0.0543, "step": 4640 }, { "epoch": 0.27298344487495596, "grad_norm": 1.1546156406402588, "learning_rate": 9.563781402081e-05, "loss": 0.0613, "step": 4650 }, { "epoch": 0.27357050604673006, "grad_norm": 1.8990427255630493, "learning_rate": 9.56189038588345e-05, "loss": 0.0841, "step": 4660 }, { "epoch": 0.27415756721850415, "grad_norm": 2.5397582054138184, "learning_rate": 9.559995467530091e-05, "loss": 0.0343, "step": 4670 }, { "epoch": 0.27474462839027824, "grad_norm": 1.3965009450912476, "learning_rate": 9.558096648641797e-05, "loss": 0.064, "step": 4680 }, { "epoch": 0.27533168956205234, "grad_norm": 1.6880937814712524, "learning_rate": 9.556193930842785e-05, "loss": 0.0567, "step": 4690 }, { "epoch": 0.2759187507338265, "grad_norm": 2.050307035446167, "learning_rate": 9.554287315760603e-05, "loss": 0.0569, "step": 4700 }, { "epoch": 0.2765058119056006, "grad_norm": 3.7817277908325195, "learning_rate": 9.552376805026136e-05, "loss": 0.0537, "step": 4710 }, { "epoch": 0.2770928730773747, "grad_norm": 1.3971900939941406, "learning_rate": 9.550462400273596e-05, "loss": 0.0938, "step": 4720 }, { "epoch": 0.2776799342491488, "grad_norm": 1.720454216003418, "learning_rate": 9.54854410314053e-05, "loss": 0.0401, "step": 4730 }, { "epoch": 0.27826699542092287, "grad_norm": 3.3499581813812256, "learning_rate": 9.546621915267815e-05, "loss": 0.0529, "step": 4740 }, { "epoch": 0.27885405659269696, "grad_norm": 1.5244182348251343, "learning_rate": 9.544695838299653e-05, "loss": 0.0506, "step": 4750 }, { "epoch": 0.27944111776447106, "grad_norm": 2.2304186820983887, "learning_rate": 9.542765873883577e-05, "loss": 0.0526, "step": 4760 }, { "epoch": 0.28002817893624515, "grad_norm": 7.138095378875732, "learning_rate": 9.540832023670439e-05, "loss": 0.0547, "step": 4770 }, { "epoch": 0.28061524010801925, "grad_norm": 1.4671604633331299, "learning_rate": 9.53889428931442e-05, "loss": 0.0456, "step": 4780 }, { "epoch": 0.28120230127979334, "grad_norm": 0.7101681232452393, "learning_rate": 9.536952672473021e-05, "loss": 0.0264, "step": 4790 }, { "epoch": 0.28178936245156744, "grad_norm": 1.487818956375122, "learning_rate": 9.535007174807066e-05, "loss": 0.0278, "step": 4800 }, { "epoch": 0.28237642362334153, "grad_norm": 3.8689186573028564, "learning_rate": 9.533057797980696e-05, "loss": 0.0663, "step": 4810 }, { "epoch": 0.2829634847951156, "grad_norm": 1.6633166074752808, "learning_rate": 9.531104543661374e-05, "loss": 0.054, "step": 4820 }, { "epoch": 0.2835505459668898, "grad_norm": 1.5947315692901611, "learning_rate": 9.529147413519873e-05, "loss": 0.0534, "step": 4830 }, { "epoch": 0.2841376071386639, "grad_norm": 1.783229112625122, "learning_rate": 9.52718640923029e-05, "loss": 0.059, "step": 4840 }, { "epoch": 0.28472466831043797, "grad_norm": 0.6430601477622986, "learning_rate": 9.525221532470029e-05, "loss": 0.0335, "step": 4850 }, { "epoch": 0.28531172948221206, "grad_norm": 1.6459349393844604, "learning_rate": 9.523252784919809e-05, "loss": 0.0495, "step": 4860 }, { "epoch": 0.28589879065398616, "grad_norm": 1.862572193145752, "learning_rate": 9.52128016826366e-05, "loss": 0.0582, "step": 4870 }, { "epoch": 0.28648585182576025, "grad_norm": 2.487229108810425, "learning_rate": 9.519303684188922e-05, "loss": 0.0346, "step": 4880 }, { "epoch": 0.28707291299753435, "grad_norm": 4.924187660217285, "learning_rate": 9.517323334386244e-05, "loss": 0.0568, "step": 4890 }, { "epoch": 0.28765997416930844, "grad_norm": 0.5068224668502808, "learning_rate": 9.515339120549576e-05, "loss": 0.0278, "step": 4900 }, { "epoch": 0.28824703534108254, "grad_norm": 1.3012948036193848, "learning_rate": 9.513351044376182e-05, "loss": 0.0562, "step": 4910 }, { "epoch": 0.28883409651285663, "grad_norm": 2.8076093196868896, "learning_rate": 9.51135910756662e-05, "loss": 0.0512, "step": 4920 }, { "epoch": 0.2894211576846307, "grad_norm": 0.8986666202545166, "learning_rate": 9.509363311824761e-05, "loss": 0.0552, "step": 4930 }, { "epoch": 0.2900082188564048, "grad_norm": 1.3616158962249756, "learning_rate": 9.507363658857768e-05, "loss": 0.0357, "step": 4940 }, { "epoch": 0.2905952800281789, "grad_norm": 0.8213547468185425, "learning_rate": 9.505360150376109e-05, "loss": 0.0461, "step": 4950 }, { "epoch": 0.291182341199953, "grad_norm": 1.0852934122085571, "learning_rate": 9.503352788093547e-05, "loss": 0.0537, "step": 4960 }, { "epoch": 0.29176940237172716, "grad_norm": 0.9509713053703308, "learning_rate": 9.501341573727141e-05, "loss": 0.0489, "step": 4970 }, { "epoch": 0.29235646354350125, "grad_norm": 2.768786907196045, "learning_rate": 9.499326508997246e-05, "loss": 0.0475, "step": 4980 }, { "epoch": 0.29294352471527535, "grad_norm": 1.5725703239440918, "learning_rate": 9.497307595627511e-05, "loss": 0.0636, "step": 4990 }, { "epoch": 0.29353058588704944, "grad_norm": 1.2331961393356323, "learning_rate": 9.495284835344879e-05, "loss": 0.0282, "step": 5000 }, { "epoch": 0.29411764705882354, "grad_norm": 1.0896481275558472, "learning_rate": 9.49325822987958e-05, "loss": 0.0411, "step": 5010 }, { "epoch": 0.29470470823059763, "grad_norm": 1.3905304670333862, "learning_rate": 9.491227780965136e-05, "loss": 0.066, "step": 5020 }, { "epoch": 0.29529176940237173, "grad_norm": 1.7909550666809082, "learning_rate": 9.48919349033835e-05, "loss": 0.0641, "step": 5030 }, { "epoch": 0.2958788305741458, "grad_norm": 1.991182804107666, "learning_rate": 9.487155359739321e-05, "loss": 0.0551, "step": 5040 }, { "epoch": 0.2964658917459199, "grad_norm": 1.1114460229873657, "learning_rate": 9.485113390911427e-05, "loss": 0.0409, "step": 5050 }, { "epoch": 0.297052952917694, "grad_norm": 1.673074722290039, "learning_rate": 9.483067585601327e-05, "loss": 0.0579, "step": 5060 }, { "epoch": 0.2976400140894681, "grad_norm": 0.9401737451553345, "learning_rate": 9.481017945558969e-05, "loss": 0.0419, "step": 5070 }, { "epoch": 0.2982270752612422, "grad_norm": 4.425198078155518, "learning_rate": 9.478964472537575e-05, "loss": 0.0296, "step": 5080 }, { "epoch": 0.2988141364330163, "grad_norm": 1.2268235683441162, "learning_rate": 9.476907168293646e-05, "loss": 0.0555, "step": 5090 }, { "epoch": 0.2994011976047904, "grad_norm": 3.2112858295440674, "learning_rate": 9.474846034586964e-05, "loss": 0.0476, "step": 5100 }, { "epoch": 0.29998825877656454, "grad_norm": 1.4233640432357788, "learning_rate": 9.472781073180582e-05, "loss": 0.049, "step": 5110 }, { "epoch": 0.30057531994833864, "grad_norm": 0.9212433695793152, "learning_rate": 9.47071228584083e-05, "loss": 0.0451, "step": 5120 }, { "epoch": 0.30116238112011273, "grad_norm": 1.5184338092803955, "learning_rate": 9.468639674337312e-05, "loss": 0.1159, "step": 5130 }, { "epoch": 0.3017494422918868, "grad_norm": 2.7212154865264893, "learning_rate": 9.466563240442901e-05, "loss": 0.0582, "step": 5140 }, { "epoch": 0.3023365034636609, "grad_norm": 1.481858253479004, "learning_rate": 9.464482985933736e-05, "loss": 0.0572, "step": 5150 }, { "epoch": 0.302923564635435, "grad_norm": 1.0168449878692627, "learning_rate": 9.462398912589232e-05, "loss": 0.0505, "step": 5160 }, { "epoch": 0.3035106258072091, "grad_norm": 1.187330961227417, "learning_rate": 9.460311022192064e-05, "loss": 0.0401, "step": 5170 }, { "epoch": 0.3040976869789832, "grad_norm": 3.9765541553497314, "learning_rate": 9.458219316528175e-05, "loss": 0.0524, "step": 5180 }, { "epoch": 0.3046847481507573, "grad_norm": 1.3952094316482544, "learning_rate": 9.456123797386771e-05, "loss": 0.0778, "step": 5190 }, { "epoch": 0.3052718093225314, "grad_norm": 2.859480381011963, "learning_rate": 9.45402446656032e-05, "loss": 0.0503, "step": 5200 }, { "epoch": 0.3058588704943055, "grad_norm": 1.7766053676605225, "learning_rate": 9.451921325844551e-05, "loss": 0.0457, "step": 5210 }, { "epoch": 0.3064459316660796, "grad_norm": 0.7257308959960938, "learning_rate": 9.449814377038452e-05, "loss": 0.0611, "step": 5220 }, { "epoch": 0.3070329928378537, "grad_norm": 2.225612163543701, "learning_rate": 9.447703621944264e-05, "loss": 0.028, "step": 5230 }, { "epoch": 0.30762005400962783, "grad_norm": 2.1581358909606934, "learning_rate": 9.445589062367491e-05, "loss": 0.0624, "step": 5240 }, { "epoch": 0.3082071151814019, "grad_norm": 1.3447891473770142, "learning_rate": 9.443470700116887e-05, "loss": 0.0334, "step": 5250 }, { "epoch": 0.308794176353176, "grad_norm": 6.130178928375244, "learning_rate": 9.441348537004459e-05, "loss": 0.0685, "step": 5260 }, { "epoch": 0.3093812375249501, "grad_norm": 2.950314521789551, "learning_rate": 9.439222574845465e-05, "loss": 0.0669, "step": 5270 }, { "epoch": 0.3099682986967242, "grad_norm": 6.2969231605529785, "learning_rate": 9.437092815458415e-05, "loss": 0.0807, "step": 5280 }, { "epoch": 0.3105553598684983, "grad_norm": 1.214200496673584, "learning_rate": 9.434959260665064e-05, "loss": 0.0347, "step": 5290 }, { "epoch": 0.3111424210402724, "grad_norm": 5.248291969299316, "learning_rate": 9.432821912290414e-05, "loss": 0.0686, "step": 5300 }, { "epoch": 0.3117294822120465, "grad_norm": 3.0085883140563965, "learning_rate": 9.430680772162716e-05, "loss": 0.0681, "step": 5310 }, { "epoch": 0.3123165433838206, "grad_norm": 2.349296808242798, "learning_rate": 9.428535842113459e-05, "loss": 0.0673, "step": 5320 }, { "epoch": 0.3129036045555947, "grad_norm": 2.5957016944885254, "learning_rate": 9.426387123977378e-05, "loss": 0.0484, "step": 5330 }, { "epoch": 0.3134906657273688, "grad_norm": 0.4194146990776062, "learning_rate": 9.424234619592442e-05, "loss": 0.0643, "step": 5340 }, { "epoch": 0.3140777268991429, "grad_norm": 2.747285842895508, "learning_rate": 9.422078330799868e-05, "loss": 0.0396, "step": 5350 }, { "epoch": 0.31466478807091697, "grad_norm": 1.0044994354248047, "learning_rate": 9.419918259444104e-05, "loss": 0.0649, "step": 5360 }, { "epoch": 0.31525184924269106, "grad_norm": 1.485447883605957, "learning_rate": 9.417754407372832e-05, "loss": 0.0423, "step": 5370 }, { "epoch": 0.3158389104144652, "grad_norm": 3.9312610626220703, "learning_rate": 9.415586776436973e-05, "loss": 0.0753, "step": 5380 }, { "epoch": 0.3164259715862393, "grad_norm": 4.346195220947266, "learning_rate": 9.413415368490678e-05, "loss": 0.0782, "step": 5390 }, { "epoch": 0.3170130327580134, "grad_norm": 1.1620670557022095, "learning_rate": 9.411240185391327e-05, "loss": 0.0659, "step": 5400 }, { "epoch": 0.3176000939297875, "grad_norm": 1.6305251121520996, "learning_rate": 9.409061228999533e-05, "loss": 0.0543, "step": 5410 }, { "epoch": 0.3181871551015616, "grad_norm": 0.7924861311912537, "learning_rate": 9.406878501179135e-05, "loss": 0.0414, "step": 5420 }, { "epoch": 0.3187742162733357, "grad_norm": 0.6430279016494751, "learning_rate": 9.404692003797196e-05, "loss": 0.0288, "step": 5430 }, { "epoch": 0.3193612774451098, "grad_norm": 2.6675634384155273, "learning_rate": 9.402501738724004e-05, "loss": 0.0824, "step": 5440 }, { "epoch": 0.3199483386168839, "grad_norm": 2.216702699661255, "learning_rate": 9.400307707833074e-05, "loss": 0.0398, "step": 5450 }, { "epoch": 0.32053539978865797, "grad_norm": 1.5737630128860474, "learning_rate": 9.398109913001136e-05, "loss": 0.0255, "step": 5460 }, { "epoch": 0.32112246096043207, "grad_norm": 2.763911247253418, "learning_rate": 9.395908356108145e-05, "loss": 0.0471, "step": 5470 }, { "epoch": 0.32170952213220616, "grad_norm": 4.0142059326171875, "learning_rate": 9.393703039037269e-05, "loss": 0.0534, "step": 5480 }, { "epoch": 0.32229658330398026, "grad_norm": 1.8663016557693481, "learning_rate": 9.391493963674899e-05, "loss": 0.0679, "step": 5490 }, { "epoch": 0.32288364447575435, "grad_norm": 2.364773988723755, "learning_rate": 9.389281131910633e-05, "loss": 0.0631, "step": 5500 }, { "epoch": 0.32347070564752844, "grad_norm": 2.4541656970977783, "learning_rate": 9.387064545637287e-05, "loss": 0.0668, "step": 5510 }, { "epoch": 0.3240577668193026, "grad_norm": 1.0280964374542236, "learning_rate": 9.384844206750889e-05, "loss": 0.041, "step": 5520 }, { "epoch": 0.3246448279910767, "grad_norm": 1.2305879592895508, "learning_rate": 9.382620117150673e-05, "loss": 0.0471, "step": 5530 }, { "epoch": 0.3252318891628508, "grad_norm": 0.4076092541217804, "learning_rate": 9.380392278739085e-05, "loss": 0.0598, "step": 5540 }, { "epoch": 0.3258189503346249, "grad_norm": 2.261909008026123, "learning_rate": 9.378160693421778e-05, "loss": 0.071, "step": 5550 }, { "epoch": 0.326406011506399, "grad_norm": 1.7203713655471802, "learning_rate": 9.375925363107604e-05, "loss": 0.0722, "step": 5560 }, { "epoch": 0.32699307267817307, "grad_norm": 2.0784318447113037, "learning_rate": 9.373686289708629e-05, "loss": 0.0466, "step": 5570 }, { "epoch": 0.32758013384994716, "grad_norm": 2.560448169708252, "learning_rate": 9.371443475140108e-05, "loss": 0.0782, "step": 5580 }, { "epoch": 0.32816719502172126, "grad_norm": 1.0502249002456665, "learning_rate": 9.369196921320506e-05, "loss": 0.0564, "step": 5590 }, { "epoch": 0.32875425619349535, "grad_norm": 1.2666996717453003, "learning_rate": 9.366946630171485e-05, "loss": 0.0359, "step": 5600 }, { "epoch": 0.32934131736526945, "grad_norm": 2.948201894760132, "learning_rate": 9.364692603617899e-05, "loss": 0.0797, "step": 5610 }, { "epoch": 0.32992837853704354, "grad_norm": 2.796678066253662, "learning_rate": 9.3624348435878e-05, "loss": 0.0489, "step": 5620 }, { "epoch": 0.33051543970881764, "grad_norm": 3.7815842628479004, "learning_rate": 9.360173352012436e-05, "loss": 0.0638, "step": 5630 }, { "epoch": 0.33110250088059173, "grad_norm": 2.1624441146850586, "learning_rate": 9.357908130826243e-05, "loss": 0.0405, "step": 5640 }, { "epoch": 0.3316895620523659, "grad_norm": 4.668314456939697, "learning_rate": 9.355639181966849e-05, "loss": 0.0904, "step": 5650 }, { "epoch": 0.33227662322414, "grad_norm": 2.2707722187042236, "learning_rate": 9.353366507375072e-05, "loss": 0.0454, "step": 5660 }, { "epoch": 0.3328636843959141, "grad_norm": 0.3836930990219116, "learning_rate": 9.351090108994913e-05, "loss": 0.0489, "step": 5670 }, { "epoch": 0.33345074556768817, "grad_norm": 4.931671142578125, "learning_rate": 9.348809988773564e-05, "loss": 0.0748, "step": 5680 }, { "epoch": 0.33403780673946226, "grad_norm": 2.0783421993255615, "learning_rate": 9.346526148661392e-05, "loss": 0.0308, "step": 5690 }, { "epoch": 0.33462486791123636, "grad_norm": 1.1517889499664307, "learning_rate": 9.344238590611955e-05, "loss": 0.042, "step": 5700 }, { "epoch": 0.33521192908301045, "grad_norm": 1.0855244398117065, "learning_rate": 9.341947316581989e-05, "loss": 0.0806, "step": 5710 }, { "epoch": 0.33579899025478455, "grad_norm": 3.335794448852539, "learning_rate": 9.339652328531403e-05, "loss": 0.0611, "step": 5720 }, { "epoch": 0.33638605142655864, "grad_norm": 3.330859661102295, "learning_rate": 9.337353628423288e-05, "loss": 0.0603, "step": 5730 }, { "epoch": 0.33697311259833274, "grad_norm": 1.3413203954696655, "learning_rate": 9.335051218223912e-05, "loss": 0.0415, "step": 5740 }, { "epoch": 0.33756017377010683, "grad_norm": 1.55022394657135, "learning_rate": 9.332745099902709e-05, "loss": 0.0521, "step": 5750 }, { "epoch": 0.3381472349418809, "grad_norm": 3.1509315967559814, "learning_rate": 9.330435275432293e-05, "loss": 0.0558, "step": 5760 }, { "epoch": 0.338734296113655, "grad_norm": 2.5709340572357178, "learning_rate": 9.328121746788444e-05, "loss": 0.0465, "step": 5770 }, { "epoch": 0.3393213572854291, "grad_norm": 2.223271608352661, "learning_rate": 9.325804515950109e-05, "loss": 0.0934, "step": 5780 }, { "epoch": 0.33990841845720327, "grad_norm": 4.6144280433654785, "learning_rate": 9.323483584899409e-05, "loss": 0.0858, "step": 5790 }, { "epoch": 0.34049547962897736, "grad_norm": 1.6129003763198853, "learning_rate": 9.321158955621621e-05, "loss": 0.0547, "step": 5800 }, { "epoch": 0.34108254080075145, "grad_norm": 2.4976346492767334, "learning_rate": 9.318830630105188e-05, "loss": 0.0815, "step": 5810 }, { "epoch": 0.34166960197252555, "grad_norm": 2.1707308292388916, "learning_rate": 9.31649861034172e-05, "loss": 0.0484, "step": 5820 }, { "epoch": 0.34225666314429964, "grad_norm": 1.6740494966506958, "learning_rate": 9.314162898325981e-05, "loss": 0.0501, "step": 5830 }, { "epoch": 0.34284372431607374, "grad_norm": 2.1383731365203857, "learning_rate": 9.311823496055896e-05, "loss": 0.0345, "step": 5840 }, { "epoch": 0.34343078548784783, "grad_norm": 0.9850202798843384, "learning_rate": 9.309480405532547e-05, "loss": 0.0469, "step": 5850 }, { "epoch": 0.34401784665962193, "grad_norm": 1.57024085521698, "learning_rate": 9.307133628760168e-05, "loss": 0.0623, "step": 5860 }, { "epoch": 0.344604907831396, "grad_norm": 3.165447473526001, "learning_rate": 9.30478316774615e-05, "loss": 0.0543, "step": 5870 }, { "epoch": 0.3451919690031701, "grad_norm": 3.8253517150878906, "learning_rate": 9.302429024501031e-05, "loss": 0.0634, "step": 5880 }, { "epoch": 0.3457790301749442, "grad_norm": 0.32235458493232727, "learning_rate": 9.300071201038503e-05, "loss": 0.0285, "step": 5890 }, { "epoch": 0.3463660913467183, "grad_norm": 1.290035367012024, "learning_rate": 9.297709699375403e-05, "loss": 0.0578, "step": 5900 }, { "epoch": 0.3469531525184924, "grad_norm": 2.2928073406219482, "learning_rate": 9.295344521531717e-05, "loss": 0.0493, "step": 5910 }, { "epoch": 0.34754021369026655, "grad_norm": 3.7337634563446045, "learning_rate": 9.292975669530573e-05, "loss": 0.034, "step": 5920 }, { "epoch": 0.34812727486204065, "grad_norm": 2.512702703475952, "learning_rate": 9.290603145398243e-05, "loss": 0.0702, "step": 5930 }, { "epoch": 0.34871433603381474, "grad_norm": 0.6247937083244324, "learning_rate": 9.288226951164138e-05, "loss": 0.0508, "step": 5940 }, { "epoch": 0.34930139720558884, "grad_norm": 2.273139238357544, "learning_rate": 9.285847088860813e-05, "loss": 0.0506, "step": 5950 }, { "epoch": 0.34988845837736293, "grad_norm": 0.7583631277084351, "learning_rate": 9.283463560523956e-05, "loss": 0.0422, "step": 5960 }, { "epoch": 0.350475519549137, "grad_norm": 0.9775404930114746, "learning_rate": 9.281076368192392e-05, "loss": 0.0366, "step": 5970 }, { "epoch": 0.3510625807209111, "grad_norm": 0.754428505897522, "learning_rate": 9.278685513908083e-05, "loss": 0.0394, "step": 5980 }, { "epoch": 0.3516496418926852, "grad_norm": 2.7804548740386963, "learning_rate": 9.276290999716119e-05, "loss": 0.0675, "step": 5990 }, { "epoch": 0.3522367030644593, "grad_norm": 1.3838735818862915, "learning_rate": 9.273892827664725e-05, "loss": 0.0539, "step": 6000 }, { "epoch": 0.3522367030644593, "eval_loss": 0.43607965111732483, "eval_runtime": 269.6215, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 6000 }, { "epoch": 0.3528237642362334, "grad_norm": 2.7034237384796143, "learning_rate": 9.27149099980525e-05, "loss": 0.0492, "step": 6010 }, { "epoch": 0.3534108254080075, "grad_norm": 1.50301194190979, "learning_rate": 9.269085518192175e-05, "loss": 0.0312, "step": 6020 }, { "epoch": 0.3539978865797816, "grad_norm": 2.883643388748169, "learning_rate": 9.266676384883101e-05, "loss": 0.0441, "step": 6030 }, { "epoch": 0.3545849477515557, "grad_norm": 2.8949780464172363, "learning_rate": 9.264263601938759e-05, "loss": 0.0393, "step": 6040 }, { "epoch": 0.3551720089233298, "grad_norm": 1.693753719329834, "learning_rate": 9.261847171422996e-05, "loss": 0.0653, "step": 6050 }, { "epoch": 0.35575907009510394, "grad_norm": 4.559887886047363, "learning_rate": 9.259427095402782e-05, "loss": 0.0422, "step": 6060 }, { "epoch": 0.35634613126687803, "grad_norm": 0.9856237173080444, "learning_rate": 9.257003375948207e-05, "loss": 0.0656, "step": 6070 }, { "epoch": 0.3569331924386521, "grad_norm": 2.586559295654297, "learning_rate": 9.254576015132473e-05, "loss": 0.0504, "step": 6080 }, { "epoch": 0.3575202536104262, "grad_norm": 1.9370591640472412, "learning_rate": 9.252145015031899e-05, "loss": 0.0376, "step": 6090 }, { "epoch": 0.3581073147822003, "grad_norm": 1.0948896408081055, "learning_rate": 9.249710377725917e-05, "loss": 0.0433, "step": 6100 }, { "epoch": 0.3586943759539744, "grad_norm": 4.784580707550049, "learning_rate": 9.247272105297074e-05, "loss": 0.0523, "step": 6110 }, { "epoch": 0.3592814371257485, "grad_norm": 2.7346839904785156, "learning_rate": 9.244830199831016e-05, "loss": 0.0408, "step": 6120 }, { "epoch": 0.3598684982975226, "grad_norm": 2.734044313430786, "learning_rate": 9.24238466341651e-05, "loss": 0.0594, "step": 6130 }, { "epoch": 0.3604555594692967, "grad_norm": 6.74056339263916, "learning_rate": 9.239935498145418e-05, "loss": 0.0644, "step": 6140 }, { "epoch": 0.3610426206410708, "grad_norm": 2.151662826538086, "learning_rate": 9.237482706112712e-05, "loss": 0.0398, "step": 6150 }, { "epoch": 0.3616296818128449, "grad_norm": 3.8674449920654297, "learning_rate": 9.235026289416463e-05, "loss": 0.0335, "step": 6160 }, { "epoch": 0.362216742984619, "grad_norm": 1.9136427640914917, "learning_rate": 9.232566250157845e-05, "loss": 0.0455, "step": 6170 }, { "epoch": 0.3628038041563931, "grad_norm": 1.333677887916565, "learning_rate": 9.23010259044113e-05, "loss": 0.0501, "step": 6180 }, { "epoch": 0.36339086532816717, "grad_norm": 0.8308841586112976, "learning_rate": 9.227635312373686e-05, "loss": 0.0387, "step": 6190 }, { "epoch": 0.3639779264999413, "grad_norm": 1.8643447160720825, "learning_rate": 9.225164418065976e-05, "loss": 0.0327, "step": 6200 }, { "epoch": 0.3645649876717154, "grad_norm": 2.7782013416290283, "learning_rate": 9.222689909631557e-05, "loss": 0.067, "step": 6210 }, { "epoch": 0.3651520488434895, "grad_norm": 1.2801259756088257, "learning_rate": 9.220211789187078e-05, "loss": 0.0677, "step": 6220 }, { "epoch": 0.3657391100152636, "grad_norm": 2.8043463230133057, "learning_rate": 9.217730058852276e-05, "loss": 0.0285, "step": 6230 }, { "epoch": 0.3663261711870377, "grad_norm": 3.0080983638763428, "learning_rate": 9.215244720749979e-05, "loss": 0.0352, "step": 6240 }, { "epoch": 0.3669132323588118, "grad_norm": 1.804460048675537, "learning_rate": 9.212755777006097e-05, "loss": 0.0419, "step": 6250 }, { "epoch": 0.3675002935305859, "grad_norm": 2.595419406890869, "learning_rate": 9.210263229749626e-05, "loss": 0.0571, "step": 6260 }, { "epoch": 0.36808735470236, "grad_norm": 2.6692843437194824, "learning_rate": 9.207767081112642e-05, "loss": 0.057, "step": 6270 }, { "epoch": 0.3686744158741341, "grad_norm": 1.075913906097412, "learning_rate": 9.20526733323031e-05, "loss": 0.0438, "step": 6280 }, { "epoch": 0.36926147704590817, "grad_norm": 1.5013805627822876, "learning_rate": 9.202763988240861e-05, "loss": 0.046, "step": 6290 }, { "epoch": 0.36984853821768227, "grad_norm": 1.6638853549957275, "learning_rate": 9.200257048285615e-05, "loss": 0.1082, "step": 6300 }, { "epoch": 0.37043559938945636, "grad_norm": 1.4856232404708862, "learning_rate": 9.197746515508955e-05, "loss": 0.035, "step": 6310 }, { "epoch": 0.37102266056123046, "grad_norm": 2.476149797439575, "learning_rate": 9.195232392058353e-05, "loss": 0.0561, "step": 6320 }, { "epoch": 0.3716097217330046, "grad_norm": 1.2098942995071411, "learning_rate": 9.192714680084336e-05, "loss": 0.0501, "step": 6330 }, { "epoch": 0.3721967829047787, "grad_norm": 1.8765240907669067, "learning_rate": 9.19019338174051e-05, "loss": 0.0531, "step": 6340 }, { "epoch": 0.3727838440765528, "grad_norm": 4.244453430175781, "learning_rate": 9.187668499183546e-05, "loss": 0.0754, "step": 6350 }, { "epoch": 0.3733709052483269, "grad_norm": 1.681510329246521, "learning_rate": 9.185140034573182e-05, "loss": 0.0267, "step": 6360 }, { "epoch": 0.373957966420101, "grad_norm": 1.2957274913787842, "learning_rate": 9.182607990072221e-05, "loss": 0.0533, "step": 6370 }, { "epoch": 0.3745450275918751, "grad_norm": 2.11533522605896, "learning_rate": 9.180072367846523e-05, "loss": 0.0449, "step": 6380 }, { "epoch": 0.3751320887636492, "grad_norm": 4.636698246002197, "learning_rate": 9.177533170065014e-05, "loss": 0.0535, "step": 6390 }, { "epoch": 0.37571914993542327, "grad_norm": 4.604533672332764, "learning_rate": 9.174990398899677e-05, "loss": 0.0448, "step": 6400 }, { "epoch": 0.37630621110719736, "grad_norm": 0.620793879032135, "learning_rate": 9.172444056525549e-05, "loss": 0.0507, "step": 6410 }, { "epoch": 0.37689327227897146, "grad_norm": 1.1245390176773071, "learning_rate": 9.169894145120725e-05, "loss": 0.0473, "step": 6420 }, { "epoch": 0.37748033345074555, "grad_norm": 1.7963658571243286, "learning_rate": 9.167340666866351e-05, "loss": 0.0586, "step": 6430 }, { "epoch": 0.37806739462251965, "grad_norm": 1.785157322883606, "learning_rate": 9.164783623946626e-05, "loss": 0.0585, "step": 6440 }, { "epoch": 0.37865445579429374, "grad_norm": 2.424060821533203, "learning_rate": 9.162223018548795e-05, "loss": 0.0481, "step": 6450 }, { "epoch": 0.37924151696606784, "grad_norm": 1.3500518798828125, "learning_rate": 9.15965885286315e-05, "loss": 0.0361, "step": 6460 }, { "epoch": 0.379828578137842, "grad_norm": 2.119814395904541, "learning_rate": 9.157091129083037e-05, "loss": 0.0841, "step": 6470 }, { "epoch": 0.3804156393096161, "grad_norm": 0.6505934000015259, "learning_rate": 9.154519849404834e-05, "loss": 0.0675, "step": 6480 }, { "epoch": 0.3810027004813902, "grad_norm": 2.0686392784118652, "learning_rate": 9.151945016027965e-05, "loss": 0.0461, "step": 6490 }, { "epoch": 0.38158976165316427, "grad_norm": 3.7864797115325928, "learning_rate": 9.149366631154899e-05, "loss": 0.0576, "step": 6500 }, { "epoch": 0.38217682282493837, "grad_norm": 2.2559974193573, "learning_rate": 9.146784696991132e-05, "loss": 0.0568, "step": 6510 }, { "epoch": 0.38276388399671246, "grad_norm": 3.6570136547088623, "learning_rate": 9.144199215745206e-05, "loss": 0.0521, "step": 6520 }, { "epoch": 0.38335094516848656, "grad_norm": 1.3199042081832886, "learning_rate": 9.141610189628695e-05, "loss": 0.0372, "step": 6530 }, { "epoch": 0.38393800634026065, "grad_norm": 1.8429245948791504, "learning_rate": 9.1390176208562e-05, "loss": 0.053, "step": 6540 }, { "epoch": 0.38452506751203475, "grad_norm": 1.6494383811950684, "learning_rate": 9.136421511645357e-05, "loss": 0.0564, "step": 6550 }, { "epoch": 0.38511212868380884, "grad_norm": 1.8413498401641846, "learning_rate": 9.133821864216829e-05, "loss": 0.0634, "step": 6560 }, { "epoch": 0.38569918985558294, "grad_norm": 2.9860267639160156, "learning_rate": 9.131218680794308e-05, "loss": 0.0516, "step": 6570 }, { "epoch": 0.38628625102735703, "grad_norm": 2.894827365875244, "learning_rate": 9.128611963604507e-05, "loss": 0.0517, "step": 6580 }, { "epoch": 0.3868733121991311, "grad_norm": 5.117035865783691, "learning_rate": 9.126001714877161e-05, "loss": 0.0434, "step": 6590 }, { "epoch": 0.3874603733709053, "grad_norm": 1.3150407075881958, "learning_rate": 9.123387936845032e-05, "loss": 0.032, "step": 6600 }, { "epoch": 0.38804743454267937, "grad_norm": 2.9667177200317383, "learning_rate": 9.120770631743894e-05, "loss": 0.0428, "step": 6610 }, { "epoch": 0.38863449571445347, "grad_norm": 1.4882994890213013, "learning_rate": 9.118149801812543e-05, "loss": 0.0396, "step": 6620 }, { "epoch": 0.38922155688622756, "grad_norm": 2.298806667327881, "learning_rate": 9.115525449292786e-05, "loss": 0.053, "step": 6630 }, { "epoch": 0.38980861805800165, "grad_norm": 2.150547504425049, "learning_rate": 9.112897576429446e-05, "loss": 0.0549, "step": 6640 }, { "epoch": 0.39039567922977575, "grad_norm": 3.4609265327453613, "learning_rate": 9.110266185470358e-05, "loss": 0.0296, "step": 6650 }, { "epoch": 0.39098274040154984, "grad_norm": 2.4302566051483154, "learning_rate": 9.10763127866636e-05, "loss": 0.0588, "step": 6660 }, { "epoch": 0.39156980157332394, "grad_norm": 4.155962944030762, "learning_rate": 9.104992858271307e-05, "loss": 0.0697, "step": 6670 }, { "epoch": 0.39215686274509803, "grad_norm": 1.6231175661087036, "learning_rate": 9.102350926542051e-05, "loss": 0.0406, "step": 6680 }, { "epoch": 0.39274392391687213, "grad_norm": 2.924736976623535, "learning_rate": 9.099705485738454e-05, "loss": 0.0818, "step": 6690 }, { "epoch": 0.3933309850886462, "grad_norm": 1.0622353553771973, "learning_rate": 9.097056538123376e-05, "loss": 0.0334, "step": 6700 }, { "epoch": 0.3939180462604203, "grad_norm": 2.2383320331573486, "learning_rate": 9.094404085962676e-05, "loss": 0.0627, "step": 6710 }, { "epoch": 0.3945051074321944, "grad_norm": 1.8888459205627441, "learning_rate": 9.091748131525212e-05, "loss": 0.0411, "step": 6720 }, { "epoch": 0.3950921686039685, "grad_norm": 1.1852898597717285, "learning_rate": 9.089088677082838e-05, "loss": 0.0539, "step": 6730 }, { "epoch": 0.39567922977574266, "grad_norm": 3.639298677444458, "learning_rate": 9.086425724910403e-05, "loss": 0.0831, "step": 6740 }, { "epoch": 0.39626629094751675, "grad_norm": 3.249659299850464, "learning_rate": 9.083759277285745e-05, "loss": 0.0407, "step": 6750 }, { "epoch": 0.39685335211929085, "grad_norm": 0.2997399866580963, "learning_rate": 9.081089336489694e-05, "loss": 0.0392, "step": 6760 }, { "epoch": 0.39744041329106494, "grad_norm": 2.3485286235809326, "learning_rate": 9.078415904806068e-05, "loss": 0.0672, "step": 6770 }, { "epoch": 0.39802747446283904, "grad_norm": 3.7543492317199707, "learning_rate": 9.07573898452167e-05, "loss": 0.0557, "step": 6780 }, { "epoch": 0.39861453563461313, "grad_norm": 1.3386050462722778, "learning_rate": 9.073058577926287e-05, "loss": 0.0544, "step": 6790 }, { "epoch": 0.3992015968063872, "grad_norm": 2.3343665599823, "learning_rate": 9.070374687312689e-05, "loss": 0.0527, "step": 6800 }, { "epoch": 0.3997886579781613, "grad_norm": 1.942726731300354, "learning_rate": 9.067687314976627e-05, "loss": 0.0418, "step": 6810 }, { "epoch": 0.4003757191499354, "grad_norm": 2.9893906116485596, "learning_rate": 9.064996463216828e-05, "loss": 0.055, "step": 6820 }, { "epoch": 0.4009627803217095, "grad_norm": 1.878150224685669, "learning_rate": 9.062302134334998e-05, "loss": 0.0496, "step": 6830 }, { "epoch": 0.4015498414934836, "grad_norm": 2.2546424865722656, "learning_rate": 9.059604330635813e-05, "loss": 0.065, "step": 6840 }, { "epoch": 0.4021369026652577, "grad_norm": 2.0570788383483887, "learning_rate": 9.056903054426927e-05, "loss": 0.0702, "step": 6850 }, { "epoch": 0.4027239638370318, "grad_norm": 1.2803014516830444, "learning_rate": 9.054198308018957e-05, "loss": 0.0567, "step": 6860 }, { "epoch": 0.4033110250088059, "grad_norm": 1.9832360744476318, "learning_rate": 9.051490093725494e-05, "loss": 0.0405, "step": 6870 }, { "epoch": 0.40389808618058004, "grad_norm": 1.720815896987915, "learning_rate": 9.048778413863097e-05, "loss": 0.0528, "step": 6880 }, { "epoch": 0.40448514735235414, "grad_norm": 2.059218645095825, "learning_rate": 9.046063270751283e-05, "loss": 0.0571, "step": 6890 }, { "epoch": 0.40507220852412823, "grad_norm": 2.6577646732330322, "learning_rate": 9.043344666712537e-05, "loss": 0.0502, "step": 6900 }, { "epoch": 0.4056592696959023, "grad_norm": 2.7444088459014893, "learning_rate": 9.0406226040723e-05, "loss": 0.049, "step": 6910 }, { "epoch": 0.4062463308676764, "grad_norm": 6.111288547515869, "learning_rate": 9.037897085158976e-05, "loss": 0.0829, "step": 6920 }, { "epoch": 0.4068333920394505, "grad_norm": 1.5149964094161987, "learning_rate": 9.03516811230392e-05, "loss": 0.0386, "step": 6930 }, { "epoch": 0.4074204532112246, "grad_norm": 2.5471954345703125, "learning_rate": 9.032435687841445e-05, "loss": 0.0377, "step": 6940 }, { "epoch": 0.4080075143829987, "grad_norm": 1.5845997333526611, "learning_rate": 9.029699814108818e-05, "loss": 0.0735, "step": 6950 }, { "epoch": 0.4085945755547728, "grad_norm": 1.9967899322509766, "learning_rate": 9.026960493446252e-05, "loss": 0.0522, "step": 6960 }, { "epoch": 0.4091816367265469, "grad_norm": 2.7092247009277344, "learning_rate": 9.024217728196913e-05, "loss": 0.0519, "step": 6970 }, { "epoch": 0.409768697898321, "grad_norm": 1.3626161813735962, "learning_rate": 9.02147152070691e-05, "loss": 0.0622, "step": 6980 }, { "epoch": 0.4103557590700951, "grad_norm": 1.4670498371124268, "learning_rate": 9.018721873325295e-05, "loss": 0.0511, "step": 6990 }, { "epoch": 0.4109428202418692, "grad_norm": 1.8022555112838745, "learning_rate": 9.015968788404069e-05, "loss": 0.0385, "step": 7000 }, { "epoch": 0.41152988141364333, "grad_norm": 2.298191547393799, "learning_rate": 9.013212268298168e-05, "loss": 0.0455, "step": 7010 }, { "epoch": 0.4121169425854174, "grad_norm": 1.4644757509231567, "learning_rate": 9.010452315365466e-05, "loss": 0.0441, "step": 7020 }, { "epoch": 0.4127040037571915, "grad_norm": 2.624242067337036, "learning_rate": 9.007688931966778e-05, "loss": 0.0765, "step": 7030 }, { "epoch": 0.4132910649289656, "grad_norm": 1.358687162399292, "learning_rate": 9.004922120465849e-05, "loss": 0.0459, "step": 7040 }, { "epoch": 0.4138781261007397, "grad_norm": 2.6969289779663086, "learning_rate": 9.00215188322936e-05, "loss": 0.0293, "step": 7050 }, { "epoch": 0.4144651872725138, "grad_norm": 2.3530728816986084, "learning_rate": 8.999378222626915e-05, "loss": 0.0656, "step": 7060 }, { "epoch": 0.4150522484442879, "grad_norm": 1.85506010055542, "learning_rate": 8.996601141031056e-05, "loss": 0.0526, "step": 7070 }, { "epoch": 0.415639309616062, "grad_norm": 1.1391503810882568, "learning_rate": 8.993820640817246e-05, "loss": 0.0422, "step": 7080 }, { "epoch": 0.4162263707878361, "grad_norm": 0.47648337483406067, "learning_rate": 8.991036724363872e-05, "loss": 0.0486, "step": 7090 }, { "epoch": 0.4168134319596102, "grad_norm": 0.9943375587463379, "learning_rate": 8.988249394052247e-05, "loss": 0.0581, "step": 7100 }, { "epoch": 0.4174004931313843, "grad_norm": 1.111797571182251, "learning_rate": 8.985458652266595e-05, "loss": 0.0314, "step": 7110 }, { "epoch": 0.41798755430315837, "grad_norm": 1.7706562280654907, "learning_rate": 8.98266450139407e-05, "loss": 0.0735, "step": 7120 }, { "epoch": 0.41857461547493247, "grad_norm": 0.39106282591819763, "learning_rate": 8.979866943824735e-05, "loss": 0.0425, "step": 7130 }, { "epoch": 0.41916167664670656, "grad_norm": 3.4277307987213135, "learning_rate": 8.977065981951566e-05, "loss": 0.0649, "step": 7140 }, { "epoch": 0.4197487378184807, "grad_norm": 3.3305158615112305, "learning_rate": 8.974261618170459e-05, "loss": 0.0554, "step": 7150 }, { "epoch": 0.4203357989902548, "grad_norm": 1.8959918022155762, "learning_rate": 8.97145385488021e-05, "loss": 0.0659, "step": 7160 }, { "epoch": 0.4209228601620289, "grad_norm": 2.176079511642456, "learning_rate": 8.968642694482527e-05, "loss": 0.0516, "step": 7170 }, { "epoch": 0.421509921333803, "grad_norm": 1.8134032487869263, "learning_rate": 8.965828139382026e-05, "loss": 0.0618, "step": 7180 }, { "epoch": 0.4220969825055771, "grad_norm": 2.2923731803894043, "learning_rate": 8.963010191986225e-05, "loss": 0.0512, "step": 7190 }, { "epoch": 0.4226840436773512, "grad_norm": 2.0501365661621094, "learning_rate": 8.960188854705543e-05, "loss": 0.0299, "step": 7200 }, { "epoch": 0.4232711048491253, "grad_norm": 1.6752849817276, "learning_rate": 8.957364129953297e-05, "loss": 0.0518, "step": 7210 }, { "epoch": 0.4238581660208994, "grad_norm": 2.336385726928711, "learning_rate": 8.954536020145708e-05, "loss": 0.0322, "step": 7220 }, { "epoch": 0.42444522719267347, "grad_norm": 0.6073676943778992, "learning_rate": 8.951704527701883e-05, "loss": 0.0727, "step": 7230 }, { "epoch": 0.42503228836444756, "grad_norm": 2.6533095836639404, "learning_rate": 8.948869655043835e-05, "loss": 0.0498, "step": 7240 }, { "epoch": 0.42561934953622166, "grad_norm": 4.454494953155518, "learning_rate": 8.946031404596453e-05, "loss": 0.0647, "step": 7250 }, { "epoch": 0.42620641070799575, "grad_norm": 2.364464044570923, "learning_rate": 8.943189778787528e-05, "loss": 0.0668, "step": 7260 }, { "epoch": 0.42679347187976985, "grad_norm": 2.5888354778289795, "learning_rate": 8.940344780047736e-05, "loss": 0.0444, "step": 7270 }, { "epoch": 0.427380533051544, "grad_norm": 2.2895569801330566, "learning_rate": 8.937496410810631e-05, "loss": 0.0542, "step": 7280 }, { "epoch": 0.4279675942233181, "grad_norm": 2.7025861740112305, "learning_rate": 8.934644673512656e-05, "loss": 0.0789, "step": 7290 }, { "epoch": 0.4285546553950922, "grad_norm": 0.6199250221252441, "learning_rate": 8.931789570593134e-05, "loss": 0.0368, "step": 7300 }, { "epoch": 0.4291417165668663, "grad_norm": 0.8989523649215698, "learning_rate": 8.928931104494267e-05, "loss": 0.0794, "step": 7310 }, { "epoch": 0.4297287777386404, "grad_norm": 1.678736686706543, "learning_rate": 8.926069277661134e-05, "loss": 0.0442, "step": 7320 }, { "epoch": 0.43031583891041447, "grad_norm": 2.9548044204711914, "learning_rate": 8.923204092541688e-05, "loss": 0.0567, "step": 7330 }, { "epoch": 0.43090290008218857, "grad_norm": 0.22019976377487183, "learning_rate": 8.920335551586755e-05, "loss": 0.0756, "step": 7340 }, { "epoch": 0.43148996125396266, "grad_norm": 1.938661813735962, "learning_rate": 8.91746365725003e-05, "loss": 0.0644, "step": 7350 }, { "epoch": 0.43207702242573676, "grad_norm": 1.3080250024795532, "learning_rate": 8.914588411988078e-05, "loss": 0.0676, "step": 7360 }, { "epoch": 0.43266408359751085, "grad_norm": 2.4229815006256104, "learning_rate": 8.911709818260333e-05, "loss": 0.0432, "step": 7370 }, { "epoch": 0.43325114476928495, "grad_norm": 3.186225414276123, "learning_rate": 8.908827878529087e-05, "loss": 0.0397, "step": 7380 }, { "epoch": 0.43383820594105904, "grad_norm": 4.5847320556640625, "learning_rate": 8.905942595259498e-05, "loss": 0.0592, "step": 7390 }, { "epoch": 0.43442526711283314, "grad_norm": 2.033881425857544, "learning_rate": 8.903053970919585e-05, "loss": 0.0615, "step": 7400 }, { "epoch": 0.43501232828460723, "grad_norm": 5.303295135498047, "learning_rate": 8.900162007980221e-05, "loss": 0.0478, "step": 7410 }, { "epoch": 0.4355993894563814, "grad_norm": 3.09973406791687, "learning_rate": 8.897266708915139e-05, "loss": 0.0501, "step": 7420 }, { "epoch": 0.4361864506281555, "grad_norm": 1.85347318649292, "learning_rate": 8.894368076200923e-05, "loss": 0.0613, "step": 7430 }, { "epoch": 0.43677351179992957, "grad_norm": 2.907917022705078, "learning_rate": 8.891466112317008e-05, "loss": 0.0429, "step": 7440 }, { "epoch": 0.43736057297170366, "grad_norm": 1.6881896257400513, "learning_rate": 8.888560819745682e-05, "loss": 0.0397, "step": 7450 }, { "epoch": 0.43794763414347776, "grad_norm": 3.2725038528442383, "learning_rate": 8.885652200972077e-05, "loss": 0.0548, "step": 7460 }, { "epoch": 0.43853469531525185, "grad_norm": 2.4636173248291016, "learning_rate": 8.88274025848417e-05, "loss": 0.0571, "step": 7470 }, { "epoch": 0.43912175648702595, "grad_norm": 3.800265312194824, "learning_rate": 8.879824994772785e-05, "loss": 0.0885, "step": 7480 }, { "epoch": 0.43970881765880004, "grad_norm": 3.0131139755249023, "learning_rate": 8.876906412331582e-05, "loss": 0.0873, "step": 7490 }, { "epoch": 0.44029587883057414, "grad_norm": 0.7383952140808105, "learning_rate": 8.873984513657061e-05, "loss": 0.0426, "step": 7500 }, { "epoch": 0.44088294000234823, "grad_norm": 5.437507629394531, "learning_rate": 8.871059301248563e-05, "loss": 0.0557, "step": 7510 }, { "epoch": 0.44147000117412233, "grad_norm": 1.5192549228668213, "learning_rate": 8.868130777608256e-05, "loss": 0.0522, "step": 7520 }, { "epoch": 0.4420570623458964, "grad_norm": 4.280874729156494, "learning_rate": 8.865198945241147e-05, "loss": 0.0426, "step": 7530 }, { "epoch": 0.4426441235176705, "grad_norm": 2.2068464756011963, "learning_rate": 8.86226380665507e-05, "loss": 0.0399, "step": 7540 }, { "epoch": 0.4432311846894446, "grad_norm": 1.6443748474121094, "learning_rate": 8.859325364360687e-05, "loss": 0.046, "step": 7550 }, { "epoch": 0.44381824586121876, "grad_norm": 1.9048075675964355, "learning_rate": 8.856383620871489e-05, "loss": 0.034, "step": 7560 }, { "epoch": 0.44440530703299286, "grad_norm": 1.6925053596496582, "learning_rate": 8.853438578703786e-05, "loss": 0.071, "step": 7570 }, { "epoch": 0.44499236820476695, "grad_norm": 2.8813109397888184, "learning_rate": 8.850490240376711e-05, "loss": 0.0673, "step": 7580 }, { "epoch": 0.44557942937654105, "grad_norm": 2.660813570022583, "learning_rate": 8.84753860841222e-05, "loss": 0.0497, "step": 7590 }, { "epoch": 0.44616649054831514, "grad_norm": 1.7857190370559692, "learning_rate": 8.844583685335084e-05, "loss": 0.0498, "step": 7600 }, { "epoch": 0.44675355172008924, "grad_norm": 0.7035413980484009, "learning_rate": 8.841625473672888e-05, "loss": 0.0651, "step": 7610 }, { "epoch": 0.44734061289186333, "grad_norm": 0.8729226589202881, "learning_rate": 8.838663975956031e-05, "loss": 0.0376, "step": 7620 }, { "epoch": 0.4479276740636374, "grad_norm": 2.321028232574463, "learning_rate": 8.835699194717724e-05, "loss": 0.092, "step": 7630 }, { "epoch": 0.4485147352354115, "grad_norm": 3.3576223850250244, "learning_rate": 8.832731132493982e-05, "loss": 0.0539, "step": 7640 }, { "epoch": 0.4491017964071856, "grad_norm": 1.8302654027938843, "learning_rate": 8.829759791823632e-05, "loss": 0.05, "step": 7650 }, { "epoch": 0.4496888575789597, "grad_norm": 0.7914695143699646, "learning_rate": 8.826785175248308e-05, "loss": 0.0536, "step": 7660 }, { "epoch": 0.4502759187507338, "grad_norm": 3.6204957962036133, "learning_rate": 8.823807285312434e-05, "loss": 0.0456, "step": 7670 }, { "epoch": 0.4508629799225079, "grad_norm": 0.7834319472312927, "learning_rate": 8.820826124563245e-05, "loss": 0.0553, "step": 7680 }, { "epoch": 0.45145004109428205, "grad_norm": 3.02571439743042, "learning_rate": 8.81784169555077e-05, "loss": 0.0635, "step": 7690 }, { "epoch": 0.45203710226605615, "grad_norm": 2.698456048965454, "learning_rate": 8.814854000827832e-05, "loss": 0.0738, "step": 7700 }, { "epoch": 0.45262416343783024, "grad_norm": 3.3981058597564697, "learning_rate": 8.811863042950053e-05, "loss": 0.0648, "step": 7710 }, { "epoch": 0.45321122460960434, "grad_norm": 2.0070343017578125, "learning_rate": 8.80886882447584e-05, "loss": 0.0651, "step": 7720 }, { "epoch": 0.45379828578137843, "grad_norm": 3.9630744457244873, "learning_rate": 8.805871347966393e-05, "loss": 0.0724, "step": 7730 }, { "epoch": 0.4543853469531525, "grad_norm": 1.4495306015014648, "learning_rate": 8.802870615985694e-05, "loss": 0.0633, "step": 7740 }, { "epoch": 0.4549724081249266, "grad_norm": 1.5340033769607544, "learning_rate": 8.799866631100516e-05, "loss": 0.0371, "step": 7750 }, { "epoch": 0.4555594692967007, "grad_norm": 1.4411876201629639, "learning_rate": 8.79685939588041e-05, "loss": 0.0849, "step": 7760 }, { "epoch": 0.4561465304684748, "grad_norm": 1.4299516677856445, "learning_rate": 8.79384891289771e-05, "loss": 0.0583, "step": 7770 }, { "epoch": 0.4567335916402489, "grad_norm": 2.5478761196136475, "learning_rate": 8.790835184727529e-05, "loss": 0.0352, "step": 7780 }, { "epoch": 0.457320652812023, "grad_norm": 2.1153485774993896, "learning_rate": 8.787818213947749e-05, "loss": 0.0536, "step": 7790 }, { "epoch": 0.4579077139837971, "grad_norm": 1.6506136655807495, "learning_rate": 8.784798003139034e-05, "loss": 0.0434, "step": 7800 }, { "epoch": 0.4584947751555712, "grad_norm": 2.0503387451171875, "learning_rate": 8.781774554884814e-05, "loss": 0.0442, "step": 7810 }, { "epoch": 0.4590818363273453, "grad_norm": 3.163120746612549, "learning_rate": 8.778747871771292e-05, "loss": 0.0487, "step": 7820 }, { "epoch": 0.45966889749911943, "grad_norm": 2.9386367797851562, "learning_rate": 8.775717956387434e-05, "loss": 0.05, "step": 7830 }, { "epoch": 0.46025595867089353, "grad_norm": 1.808761715888977, "learning_rate": 8.772684811324975e-05, "loss": 0.034, "step": 7840 }, { "epoch": 0.4608430198426676, "grad_norm": 1.5630680322647095, "learning_rate": 8.76964843917841e-05, "loss": 0.041, "step": 7850 }, { "epoch": 0.4614300810144417, "grad_norm": 1.536268711090088, "learning_rate": 8.766608842544993e-05, "loss": 0.0503, "step": 7860 }, { "epoch": 0.4620171421862158, "grad_norm": 2.3230173587799072, "learning_rate": 8.763566024024741e-05, "loss": 0.0592, "step": 7870 }, { "epoch": 0.4626042033579899, "grad_norm": 2.439312696456909, "learning_rate": 8.760519986220423e-05, "loss": 0.0606, "step": 7880 }, { "epoch": 0.463191264529764, "grad_norm": 2.7862606048583984, "learning_rate": 8.757470731737562e-05, "loss": 0.0546, "step": 7890 }, { "epoch": 0.4637783257015381, "grad_norm": 2.9655606746673584, "learning_rate": 8.754418263184437e-05, "loss": 0.055, "step": 7900 }, { "epoch": 0.4643653868733122, "grad_norm": 0.7977060079574585, "learning_rate": 8.751362583172068e-05, "loss": 0.0416, "step": 7910 }, { "epoch": 0.4649524480450863, "grad_norm": 2.885378122329712, "learning_rate": 8.748303694314227e-05, "loss": 0.0526, "step": 7920 }, { "epoch": 0.4655395092168604, "grad_norm": 2.614291191101074, "learning_rate": 8.745241599227433e-05, "loss": 0.0417, "step": 7930 }, { "epoch": 0.4661265703886345, "grad_norm": 2.5738768577575684, "learning_rate": 8.742176300530944e-05, "loss": 0.0546, "step": 7940 }, { "epoch": 0.46671363156040857, "grad_norm": 1.6518769264221191, "learning_rate": 8.739107800846757e-05, "loss": 0.0652, "step": 7950 }, { "epoch": 0.4673006927321827, "grad_norm": 2.51871919631958, "learning_rate": 8.736036102799614e-05, "loss": 0.0566, "step": 7960 }, { "epoch": 0.4678877539039568, "grad_norm": 1.9598478078842163, "learning_rate": 8.732961209016983e-05, "loss": 0.0465, "step": 7970 }, { "epoch": 0.4684748150757309, "grad_norm": 3.3827695846557617, "learning_rate": 8.729883122129075e-05, "loss": 0.0357, "step": 7980 }, { "epoch": 0.469061876247505, "grad_norm": 2.2214787006378174, "learning_rate": 8.726801844768825e-05, "loss": 0.056, "step": 7990 }, { "epoch": 0.4696489374192791, "grad_norm": 1.9573979377746582, "learning_rate": 8.7237173795719e-05, "loss": 0.0673, "step": 8000 }, { "epoch": 0.4702359985910532, "grad_norm": 1.816136121749878, "learning_rate": 8.720629729176697e-05, "loss": 0.0649, "step": 8010 }, { "epoch": 0.4708230597628273, "grad_norm": 3.505342721939087, "learning_rate": 8.717538896224332e-05, "loss": 0.0444, "step": 8020 }, { "epoch": 0.4714101209346014, "grad_norm": 2.056030750274658, "learning_rate": 8.714444883358646e-05, "loss": 0.0411, "step": 8030 }, { "epoch": 0.4719971821063755, "grad_norm": 2.1952550411224365, "learning_rate": 8.711347693226201e-05, "loss": 0.0374, "step": 8040 }, { "epoch": 0.4725842432781496, "grad_norm": 1.5355472564697266, "learning_rate": 8.708247328476273e-05, "loss": 0.0468, "step": 8050 }, { "epoch": 0.47317130444992367, "grad_norm": 2.78885817527771, "learning_rate": 8.705143791760859e-05, "loss": 0.0665, "step": 8060 }, { "epoch": 0.47375836562169776, "grad_norm": 1.5621267557144165, "learning_rate": 8.702037085734664e-05, "loss": 0.0705, "step": 8070 }, { "epoch": 0.47434542679347186, "grad_norm": 1.3970437049865723, "learning_rate": 8.698927213055107e-05, "loss": 0.0501, "step": 8080 }, { "epoch": 0.47493248796524595, "grad_norm": 0.7375102043151855, "learning_rate": 8.695814176382318e-05, "loss": 0.0782, "step": 8090 }, { "epoch": 0.4755195491370201, "grad_norm": 1.3514719009399414, "learning_rate": 8.692697978379125e-05, "loss": 0.0513, "step": 8100 }, { "epoch": 0.4761066103087942, "grad_norm": 1.7808268070220947, "learning_rate": 8.68957862171107e-05, "loss": 0.0652, "step": 8110 }, { "epoch": 0.4766936714805683, "grad_norm": 2.0955052375793457, "learning_rate": 8.68645610904639e-05, "loss": 0.0622, "step": 8120 }, { "epoch": 0.4772807326523424, "grad_norm": 3.559030055999756, "learning_rate": 8.683330443056026e-05, "loss": 0.0856, "step": 8130 }, { "epoch": 0.4778677938241165, "grad_norm": 3.245924234390259, "learning_rate": 8.680201626413612e-05, "loss": 0.0621, "step": 8140 }, { "epoch": 0.4784548549958906, "grad_norm": 5.589448928833008, "learning_rate": 8.677069661795479e-05, "loss": 0.0861, "step": 8150 }, { "epoch": 0.47904191616766467, "grad_norm": 2.8734142780303955, "learning_rate": 8.673934551880654e-05, "loss": 0.0395, "step": 8160 }, { "epoch": 0.47962897733943877, "grad_norm": 1.9405406713485718, "learning_rate": 8.67079629935085e-05, "loss": 0.0436, "step": 8170 }, { "epoch": 0.48021603851121286, "grad_norm": 0.47982171177864075, "learning_rate": 8.667654906890469e-05, "loss": 0.0522, "step": 8180 }, { "epoch": 0.48080309968298696, "grad_norm": 2.464160442352295, "learning_rate": 8.664510377186599e-05, "loss": 0.0327, "step": 8190 }, { "epoch": 0.48139016085476105, "grad_norm": 1.0303552150726318, "learning_rate": 8.661362712929013e-05, "loss": 0.0341, "step": 8200 }, { "epoch": 0.48197722202653515, "grad_norm": 2.3549325466156006, "learning_rate": 8.658211916810165e-05, "loss": 0.0525, "step": 8210 }, { "epoch": 0.48256428319830924, "grad_norm": 2.580984592437744, "learning_rate": 8.655057991525186e-05, "loss": 0.0431, "step": 8220 }, { "epoch": 0.48315134437008334, "grad_norm": 2.2424895763397217, "learning_rate": 8.651900939771884e-05, "loss": 0.0549, "step": 8230 }, { "epoch": 0.4837384055418575, "grad_norm": 1.061221718788147, "learning_rate": 8.648740764250745e-05, "loss": 0.034, "step": 8240 }, { "epoch": 0.4843254667136316, "grad_norm": 2.8359522819519043, "learning_rate": 8.645577467664919e-05, "loss": 0.0509, "step": 8250 }, { "epoch": 0.4849125278854057, "grad_norm": 1.082289695739746, "learning_rate": 8.642411052720235e-05, "loss": 0.0595, "step": 8260 }, { "epoch": 0.48549958905717977, "grad_norm": 3.414536476135254, "learning_rate": 8.639241522125185e-05, "loss": 0.0406, "step": 8270 }, { "epoch": 0.48608665022895386, "grad_norm": 1.7662638425827026, "learning_rate": 8.636068878590924e-05, "loss": 0.06, "step": 8280 }, { "epoch": 0.48667371140072796, "grad_norm": 1.6327471733093262, "learning_rate": 8.632893124831273e-05, "loss": 0.0604, "step": 8290 }, { "epoch": 0.48726077257250205, "grad_norm": 0.9782724976539612, "learning_rate": 8.629714263562716e-05, "loss": 0.072, "step": 8300 }, { "epoch": 0.48784783374427615, "grad_norm": 1.3304145336151123, "learning_rate": 8.626532297504386e-05, "loss": 0.0359, "step": 8310 }, { "epoch": 0.48843489491605024, "grad_norm": 0.6570135354995728, "learning_rate": 8.62334722937808e-05, "loss": 0.0475, "step": 8320 }, { "epoch": 0.48902195608782434, "grad_norm": 5.401157379150391, "learning_rate": 8.620159061908245e-05, "loss": 0.0528, "step": 8330 }, { "epoch": 0.48960901725959843, "grad_norm": 3.090275526046753, "learning_rate": 8.61696779782198e-05, "loss": 0.0795, "step": 8340 }, { "epoch": 0.49019607843137253, "grad_norm": 1.0488765239715576, "learning_rate": 8.613773439849034e-05, "loss": 0.0773, "step": 8350 }, { "epoch": 0.4907831396031466, "grad_norm": 2.3972856998443604, "learning_rate": 8.610575990721799e-05, "loss": 0.0787, "step": 8360 }, { "epoch": 0.4913702007749208, "grad_norm": 1.8785936832427979, "learning_rate": 8.607375453175316e-05, "loss": 0.0532, "step": 8370 }, { "epoch": 0.49195726194669487, "grad_norm": 1.9336755275726318, "learning_rate": 8.604171829947263e-05, "loss": 0.0587, "step": 8380 }, { "epoch": 0.49254432311846896, "grad_norm": 0.8379578590393066, "learning_rate": 8.600965123777957e-05, "loss": 0.043, "step": 8390 }, { "epoch": 0.49313138429024306, "grad_norm": 0.8855001926422119, "learning_rate": 8.59775533741036e-05, "loss": 0.0429, "step": 8400 }, { "epoch": 0.49371844546201715, "grad_norm": 0.7846783995628357, "learning_rate": 8.594542473590062e-05, "loss": 0.0262, "step": 8410 }, { "epoch": 0.49430550663379125, "grad_norm": 3.0324485301971436, "learning_rate": 8.591326535065283e-05, "loss": 0.0601, "step": 8420 }, { "epoch": 0.49489256780556534, "grad_norm": 1.560529112815857, "learning_rate": 8.58810752458688e-05, "loss": 0.0619, "step": 8430 }, { "epoch": 0.49547962897733944, "grad_norm": 2.3207945823669434, "learning_rate": 8.584885444908333e-05, "loss": 0.0866, "step": 8440 }, { "epoch": 0.49606669014911353, "grad_norm": 1.379296898841858, "learning_rate": 8.58166029878575e-05, "loss": 0.0322, "step": 8450 }, { "epoch": 0.4966537513208876, "grad_norm": 0.4500533938407898, "learning_rate": 8.578432088977859e-05, "loss": 0.0546, "step": 8460 }, { "epoch": 0.4972408124926617, "grad_norm": 1.0984269380569458, "learning_rate": 8.575200818246012e-05, "loss": 0.0446, "step": 8470 }, { "epoch": 0.4978278736644358, "grad_norm": 0.6852173805236816, "learning_rate": 8.571966489354178e-05, "loss": 0.0416, "step": 8480 }, { "epoch": 0.4984149348362099, "grad_norm": 1.9068775177001953, "learning_rate": 8.568729105068939e-05, "loss": 0.0442, "step": 8490 }, { "epoch": 0.499001996007984, "grad_norm": 2.947192430496216, "learning_rate": 8.565488668159496e-05, "loss": 0.0427, "step": 8500 }, { "epoch": 0.49958905717975816, "grad_norm": 1.586493730545044, "learning_rate": 8.562245181397655e-05, "loss": 0.0633, "step": 8510 }, { "epoch": 0.5001761183515322, "grad_norm": 1.0785051584243774, "learning_rate": 8.558998647557837e-05, "loss": 0.0365, "step": 8520 }, { "epoch": 0.5007631795233063, "grad_norm": 0.7715865969657898, "learning_rate": 8.555749069417065e-05, "loss": 0.0356, "step": 8530 }, { "epoch": 0.5013502406950804, "grad_norm": 1.5732499361038208, "learning_rate": 8.552496449754967e-05, "loss": 0.0595, "step": 8540 }, { "epoch": 0.5019373018668545, "grad_norm": 1.4762629270553589, "learning_rate": 8.549240791353775e-05, "loss": 0.042, "step": 8550 }, { "epoch": 0.5025243630386286, "grad_norm": 2.114508867263794, "learning_rate": 8.545982096998315e-05, "loss": 0.0307, "step": 8560 }, { "epoch": 0.5031114242104027, "grad_norm": 2.6006269454956055, "learning_rate": 8.542720369476016e-05, "loss": 0.0357, "step": 8570 }, { "epoch": 0.5036984853821769, "grad_norm": 0.9500741958618164, "learning_rate": 8.539455611576898e-05, "loss": 0.0499, "step": 8580 }, { "epoch": 0.504285546553951, "grad_norm": 1.3061884641647339, "learning_rate": 8.536187826093576e-05, "loss": 0.0635, "step": 8590 }, { "epoch": 0.5048726077257251, "grad_norm": 1.8609007596969604, "learning_rate": 8.53291701582125e-05, "loss": 0.0529, "step": 8600 }, { "epoch": 0.5054596688974992, "grad_norm": 3.005753517150879, "learning_rate": 8.529643183557708e-05, "loss": 0.0486, "step": 8610 }, { "epoch": 0.5060467300692733, "grad_norm": 1.3930306434631348, "learning_rate": 8.52636633210333e-05, "loss": 0.0869, "step": 8620 }, { "epoch": 0.5066337912410473, "grad_norm": 1.6231608390808105, "learning_rate": 8.52308646426107e-05, "loss": 0.0428, "step": 8630 }, { "epoch": 0.5072208524128214, "grad_norm": 0.8321643471717834, "learning_rate": 8.519803582836467e-05, "loss": 0.0279, "step": 8640 }, { "epoch": 0.5078079135845955, "grad_norm": 2.415229082107544, "learning_rate": 8.516517690637638e-05, "loss": 0.0632, "step": 8650 }, { "epoch": 0.5083949747563696, "grad_norm": 1.9775207042694092, "learning_rate": 8.513228790475269e-05, "loss": 0.0665, "step": 8660 }, { "epoch": 0.5089820359281437, "grad_norm": 1.3758684396743774, "learning_rate": 8.509936885162629e-05, "loss": 0.049, "step": 8670 }, { "epoch": 0.5095690970999178, "grad_norm": 1.9183942079544067, "learning_rate": 8.50664197751555e-05, "loss": 0.0574, "step": 8680 }, { "epoch": 0.5101561582716919, "grad_norm": 1.8109124898910522, "learning_rate": 8.503344070352434e-05, "loss": 0.0608, "step": 8690 }, { "epoch": 0.510743219443466, "grad_norm": 2.4511830806732178, "learning_rate": 8.50004316649425e-05, "loss": 0.0626, "step": 8700 }, { "epoch": 0.5113302806152401, "grad_norm": 2.496175527572632, "learning_rate": 8.496739268764529e-05, "loss": 0.0576, "step": 8710 }, { "epoch": 0.5119173417870142, "grad_norm": 1.0717878341674805, "learning_rate": 8.493432379989365e-05, "loss": 0.0162, "step": 8720 }, { "epoch": 0.5125044029587883, "grad_norm": 1.7837889194488525, "learning_rate": 8.490122502997406e-05, "loss": 0.0261, "step": 8730 }, { "epoch": 0.5130914641305624, "grad_norm": 2.5370993614196777, "learning_rate": 8.486809640619859e-05, "loss": 0.0455, "step": 8740 }, { "epoch": 0.5136785253023365, "grad_norm": 2.364403009414673, "learning_rate": 8.483493795690489e-05, "loss": 0.0654, "step": 8750 }, { "epoch": 0.5142655864741106, "grad_norm": 1.936968445777893, "learning_rate": 8.480174971045603e-05, "loss": 0.0595, "step": 8760 }, { "epoch": 0.5148526476458847, "grad_norm": 1.8834549188613892, "learning_rate": 8.476853169524065e-05, "loss": 0.0378, "step": 8770 }, { "epoch": 0.5154397088176588, "grad_norm": 2.0367743968963623, "learning_rate": 8.473528393967278e-05, "loss": 0.0658, "step": 8780 }, { "epoch": 0.5160267699894329, "grad_norm": 1.0848331451416016, "learning_rate": 8.470200647219198e-05, "loss": 0.0349, "step": 8790 }, { "epoch": 0.516613831161207, "grad_norm": 1.0683445930480957, "learning_rate": 8.466869932126314e-05, "loss": 0.0529, "step": 8800 }, { "epoch": 0.517200892332981, "grad_norm": 2.2186295986175537, "learning_rate": 8.463536251537656e-05, "loss": 0.0422, "step": 8810 }, { "epoch": 0.5177879535047551, "grad_norm": 1.4563610553741455, "learning_rate": 8.460199608304797e-05, "loss": 0.0625, "step": 8820 }, { "epoch": 0.5183750146765292, "grad_norm": 3.2594645023345947, "learning_rate": 8.456860005281835e-05, "loss": 0.063, "step": 8830 }, { "epoch": 0.5189620758483033, "grad_norm": 3.460399866104126, "learning_rate": 8.453517445325405e-05, "loss": 0.0654, "step": 8840 }, { "epoch": 0.5195491370200775, "grad_norm": 1.2705801725387573, "learning_rate": 8.450171931294673e-05, "loss": 0.0575, "step": 8850 }, { "epoch": 0.5201361981918516, "grad_norm": 1.9787516593933105, "learning_rate": 8.446823466051326e-05, "loss": 0.0533, "step": 8860 }, { "epoch": 0.5207232593636257, "grad_norm": 2.2670459747314453, "learning_rate": 8.44347205245958e-05, "loss": 0.0376, "step": 8870 }, { "epoch": 0.5213103205353998, "grad_norm": 0.8276971578598022, "learning_rate": 8.440117693386171e-05, "loss": 0.0415, "step": 8880 }, { "epoch": 0.5218973817071739, "grad_norm": 2.231466054916382, "learning_rate": 8.436760391700355e-05, "loss": 0.0507, "step": 8890 }, { "epoch": 0.522484442878948, "grad_norm": 0.2314351499080658, "learning_rate": 8.433400150273906e-05, "loss": 0.0405, "step": 8900 }, { "epoch": 0.5230715040507221, "grad_norm": 0.3257890045642853, "learning_rate": 8.430036971981112e-05, "loss": 0.0513, "step": 8910 }, { "epoch": 0.5236585652224962, "grad_norm": 2.153268814086914, "learning_rate": 8.426670859698771e-05, "loss": 0.0586, "step": 8920 }, { "epoch": 0.5242456263942703, "grad_norm": 2.878570079803467, "learning_rate": 8.423301816306193e-05, "loss": 0.07, "step": 8930 }, { "epoch": 0.5248326875660444, "grad_norm": 2.7315173149108887, "learning_rate": 8.419929844685197e-05, "loss": 0.0515, "step": 8940 }, { "epoch": 0.5254197487378185, "grad_norm": 2.85028338432312, "learning_rate": 8.416554947720104e-05, "loss": 0.0562, "step": 8950 }, { "epoch": 0.5260068099095926, "grad_norm": 2.2732903957366943, "learning_rate": 8.413177128297734e-05, "loss": 0.0335, "step": 8960 }, { "epoch": 0.5265938710813667, "grad_norm": 2.7150728702545166, "learning_rate": 8.409796389307417e-05, "loss": 0.0573, "step": 8970 }, { "epoch": 0.5271809322531408, "grad_norm": 2.8290627002716064, "learning_rate": 8.406412733640967e-05, "loss": 0.0605, "step": 8980 }, { "epoch": 0.5277679934249149, "grad_norm": 2.907315731048584, "learning_rate": 8.403026164192704e-05, "loss": 0.076, "step": 8990 }, { "epoch": 0.528355054596689, "grad_norm": 4.200743675231934, "learning_rate": 8.399636683859437e-05, "loss": 0.0516, "step": 9000 }, { "epoch": 0.528355054596689, "eval_loss": 0.40969395637512207, "eval_runtime": 269.596, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 9000 }, { "epoch": 0.5289421157684631, "grad_norm": 2.4810755252838135, "learning_rate": 8.396244295540462e-05, "loss": 0.0674, "step": 9010 }, { "epoch": 0.5295291769402372, "grad_norm": 0.8842018246650696, "learning_rate": 8.392849002137566e-05, "loss": 0.047, "step": 9020 }, { "epoch": 0.5301162381120113, "grad_norm": 3.200507402420044, "learning_rate": 8.389450806555017e-05, "loss": 0.0456, "step": 9030 }, { "epoch": 0.5307032992837853, "grad_norm": 3.428184986114502, "learning_rate": 8.386049711699571e-05, "loss": 0.0708, "step": 9040 }, { "epoch": 0.5312903604555594, "grad_norm": 3.955782890319824, "learning_rate": 8.38264572048046e-05, "loss": 0.0705, "step": 9050 }, { "epoch": 0.5318774216273335, "grad_norm": 1.9864522218704224, "learning_rate": 8.379238835809393e-05, "loss": 0.0414, "step": 9060 }, { "epoch": 0.5324644827991076, "grad_norm": 1.8918730020523071, "learning_rate": 8.37582906060056e-05, "loss": 0.0601, "step": 9070 }, { "epoch": 0.5330515439708817, "grad_norm": 2.5923526287078857, "learning_rate": 8.372416397770613e-05, "loss": 0.0495, "step": 9080 }, { "epoch": 0.5336386051426558, "grad_norm": 1.9799025058746338, "learning_rate": 8.369000850238683e-05, "loss": 0.0473, "step": 9090 }, { "epoch": 0.5342256663144299, "grad_norm": 2.412355422973633, "learning_rate": 8.365582420926366e-05, "loss": 0.0672, "step": 9100 }, { "epoch": 0.534812727486204, "grad_norm": 2.0596566200256348, "learning_rate": 8.362161112757723e-05, "loss": 0.0609, "step": 9110 }, { "epoch": 0.5353997886579782, "grad_norm": 0.7884270548820496, "learning_rate": 8.358736928659274e-05, "loss": 0.0277, "step": 9120 }, { "epoch": 0.5359868498297523, "grad_norm": 3.03768253326416, "learning_rate": 8.355309871560006e-05, "loss": 0.0558, "step": 9130 }, { "epoch": 0.5365739110015264, "grad_norm": 4.944599151611328, "learning_rate": 8.351879944391357e-05, "loss": 0.0507, "step": 9140 }, { "epoch": 0.5371609721733005, "grad_norm": 2.0009171962738037, "learning_rate": 8.348447150087223e-05, "loss": 0.0417, "step": 9150 }, { "epoch": 0.5377480333450746, "grad_norm": 1.3927279710769653, "learning_rate": 8.345011491583954e-05, "loss": 0.0371, "step": 9160 }, { "epoch": 0.5383350945168487, "grad_norm": 3.312725782394409, "learning_rate": 8.341572971820344e-05, "loss": 0.0518, "step": 9170 }, { "epoch": 0.5389221556886228, "grad_norm": 0.4931054413318634, "learning_rate": 8.338131593737643e-05, "loss": 0.0563, "step": 9180 }, { "epoch": 0.5395092168603969, "grad_norm": 2.3913474082946777, "learning_rate": 8.33468736027954e-05, "loss": 0.0528, "step": 9190 }, { "epoch": 0.540096278032171, "grad_norm": 1.2798314094543457, "learning_rate": 8.331240274392167e-05, "loss": 0.0409, "step": 9200 }, { "epoch": 0.5406833392039451, "grad_norm": 2.9809741973876953, "learning_rate": 8.327790339024097e-05, "loss": 0.0562, "step": 9210 }, { "epoch": 0.5412704003757192, "grad_norm": 2.873837471008301, "learning_rate": 8.324337557126342e-05, "loss": 0.0463, "step": 9220 }, { "epoch": 0.5418574615474933, "grad_norm": 2.423383951187134, "learning_rate": 8.320881931652347e-05, "loss": 0.0462, "step": 9230 }, { "epoch": 0.5424445227192674, "grad_norm": 1.4757925271987915, "learning_rate": 8.317423465557987e-05, "loss": 0.0549, "step": 9240 }, { "epoch": 0.5430315838910414, "grad_norm": 0.7047216892242432, "learning_rate": 8.313962161801569e-05, "loss": 0.0339, "step": 9250 }, { "epoch": 0.5436186450628155, "grad_norm": 1.6288440227508545, "learning_rate": 8.310498023343832e-05, "loss": 0.0451, "step": 9260 }, { "epoch": 0.5442057062345896, "grad_norm": 2.0440261363983154, "learning_rate": 8.307031053147932e-05, "loss": 0.0569, "step": 9270 }, { "epoch": 0.5447927674063637, "grad_norm": 2.2839391231536865, "learning_rate": 8.30356125417945e-05, "loss": 0.0495, "step": 9280 }, { "epoch": 0.5453798285781378, "grad_norm": 2.28950572013855, "learning_rate": 8.300088629406391e-05, "loss": 0.0436, "step": 9290 }, { "epoch": 0.5459668897499119, "grad_norm": 3.5766420364379883, "learning_rate": 8.296613181799168e-05, "loss": 0.04, "step": 9300 }, { "epoch": 0.546553950921686, "grad_norm": 1.5386685132980347, "learning_rate": 8.293134914330618e-05, "loss": 0.0621, "step": 9310 }, { "epoch": 0.5471410120934601, "grad_norm": 1.3756065368652344, "learning_rate": 8.289653829975983e-05, "loss": 0.0386, "step": 9320 }, { "epoch": 0.5477280732652342, "grad_norm": 3.2110755443573, "learning_rate": 8.286169931712921e-05, "loss": 0.0498, "step": 9330 }, { "epoch": 0.5483151344370083, "grad_norm": 1.801338791847229, "learning_rate": 8.28268322252149e-05, "loss": 0.0594, "step": 9340 }, { "epoch": 0.5489021956087824, "grad_norm": 1.8586894273757935, "learning_rate": 8.279193705384159e-05, "loss": 0.0639, "step": 9350 }, { "epoch": 0.5494892567805565, "grad_norm": 2.1896278858184814, "learning_rate": 8.275701383285795e-05, "loss": 0.0661, "step": 9360 }, { "epoch": 0.5500763179523306, "grad_norm": 1.4763203859329224, "learning_rate": 8.272206259213662e-05, "loss": 0.0445, "step": 9370 }, { "epoch": 0.5506633791241047, "grad_norm": 2.916865348815918, "learning_rate": 8.268708336157428e-05, "loss": 0.0741, "step": 9380 }, { "epoch": 0.5512504402958789, "grad_norm": 4.696934223175049, "learning_rate": 8.265207617109148e-05, "loss": 0.0553, "step": 9390 }, { "epoch": 0.551837501467653, "grad_norm": 1.0399225950241089, "learning_rate": 8.261704105063275e-05, "loss": 0.0458, "step": 9400 }, { "epoch": 0.5524245626394271, "grad_norm": 1.098526954650879, "learning_rate": 8.258197803016646e-05, "loss": 0.1109, "step": 9410 }, { "epoch": 0.5530116238112012, "grad_norm": 1.3748254776000977, "learning_rate": 8.254688713968484e-05, "loss": 0.032, "step": 9420 }, { "epoch": 0.5535986849829753, "grad_norm": 1.2855557203292847, "learning_rate": 8.2511768409204e-05, "loss": 0.0444, "step": 9430 }, { "epoch": 0.5541857461547494, "grad_norm": 1.5364638566970825, "learning_rate": 8.247662186876386e-05, "loss": 0.0596, "step": 9440 }, { "epoch": 0.5547728073265235, "grad_norm": 1.2066459655761719, "learning_rate": 8.244144754842809e-05, "loss": 0.0746, "step": 9450 }, { "epoch": 0.5553598684982975, "grad_norm": 2.495847463607788, "learning_rate": 8.240624547828417e-05, "loss": 0.0754, "step": 9460 }, { "epoch": 0.5559469296700716, "grad_norm": 0.5612956881523132, "learning_rate": 8.237101568844328e-05, "loss": 0.039, "step": 9470 }, { "epoch": 0.5565339908418457, "grad_norm": 1.5680084228515625, "learning_rate": 8.233575820904032e-05, "loss": 0.0388, "step": 9480 }, { "epoch": 0.5571210520136198, "grad_norm": 0.898784875869751, "learning_rate": 8.23004730702339e-05, "loss": 0.0534, "step": 9490 }, { "epoch": 0.5577081131853939, "grad_norm": 1.7916792631149292, "learning_rate": 8.226516030220623e-05, "loss": 0.0323, "step": 9500 }, { "epoch": 0.558295174357168, "grad_norm": 1.508663296699524, "learning_rate": 8.222981993516324e-05, "loss": 0.0305, "step": 9510 }, { "epoch": 0.5588822355289421, "grad_norm": 0.9690699577331543, "learning_rate": 8.219445199933437e-05, "loss": 0.0884, "step": 9520 }, { "epoch": 0.5594692967007162, "grad_norm": 2.0880961418151855, "learning_rate": 8.215905652497273e-05, "loss": 0.0245, "step": 9530 }, { "epoch": 0.5600563578724903, "grad_norm": 1.931066870689392, "learning_rate": 8.212363354235494e-05, "loss": 0.0699, "step": 9540 }, { "epoch": 0.5606434190442644, "grad_norm": 1.0448249578475952, "learning_rate": 8.208818308178114e-05, "loss": 0.0561, "step": 9550 }, { "epoch": 0.5612304802160385, "grad_norm": 1.4388859272003174, "learning_rate": 8.205270517357502e-05, "loss": 0.0515, "step": 9560 }, { "epoch": 0.5618175413878126, "grad_norm": 1.4475386142730713, "learning_rate": 8.201719984808369e-05, "loss": 0.0425, "step": 9570 }, { "epoch": 0.5624046025595867, "grad_norm": 1.9401365518569946, "learning_rate": 8.198166713567777e-05, "loss": 0.0408, "step": 9580 }, { "epoch": 0.5629916637313608, "grad_norm": 2.8390491008758545, "learning_rate": 8.194610706675125e-05, "loss": 0.0555, "step": 9590 }, { "epoch": 0.5635787249031349, "grad_norm": 2.739123821258545, "learning_rate": 8.191051967172157e-05, "loss": 0.0484, "step": 9600 }, { "epoch": 0.564165786074909, "grad_norm": 1.865818738937378, "learning_rate": 8.18749049810295e-05, "loss": 0.036, "step": 9610 }, { "epoch": 0.5647528472466831, "grad_norm": 3.7580649852752686, "learning_rate": 8.183926302513923e-05, "loss": 0.0546, "step": 9620 }, { "epoch": 0.5653399084184572, "grad_norm": 3.3338823318481445, "learning_rate": 8.180359383453815e-05, "loss": 0.0524, "step": 9630 }, { "epoch": 0.5659269695902313, "grad_norm": 2.9151487350463867, "learning_rate": 8.176789743973707e-05, "loss": 0.0475, "step": 9640 }, { "epoch": 0.5665140307620053, "grad_norm": 2.853806257247925, "learning_rate": 8.173217387127004e-05, "loss": 0.0435, "step": 9650 }, { "epoch": 0.5671010919337796, "grad_norm": 2.850360870361328, "learning_rate": 8.169642315969427e-05, "loss": 0.055, "step": 9660 }, { "epoch": 0.5676881531055537, "grad_norm": 2.4609642028808594, "learning_rate": 8.166064533559028e-05, "loss": 0.0801, "step": 9670 }, { "epoch": 0.5682752142773277, "grad_norm": 2.81255841255188, "learning_rate": 8.162484042956178e-05, "loss": 0.0608, "step": 9680 }, { "epoch": 0.5688622754491018, "grad_norm": 0.8586886525154114, "learning_rate": 8.158900847223556e-05, "loss": 0.0424, "step": 9690 }, { "epoch": 0.5694493366208759, "grad_norm": 1.6476390361785889, "learning_rate": 8.155314949426167e-05, "loss": 0.0454, "step": 9700 }, { "epoch": 0.57003639779265, "grad_norm": 1.444466471672058, "learning_rate": 8.151726352631316e-05, "loss": 0.0494, "step": 9710 }, { "epoch": 0.5706234589644241, "grad_norm": 0.3091655373573303, "learning_rate": 8.148135059908624e-05, "loss": 0.0535, "step": 9720 }, { "epoch": 0.5712105201361982, "grad_norm": 2.8618831634521484, "learning_rate": 8.144541074330015e-05, "loss": 0.0564, "step": 9730 }, { "epoch": 0.5717975813079723, "grad_norm": 3.9463939666748047, "learning_rate": 8.140944398969717e-05, "loss": 0.0512, "step": 9740 }, { "epoch": 0.5723846424797464, "grad_norm": 1.8597780466079712, "learning_rate": 8.13734503690426e-05, "loss": 0.0381, "step": 9750 }, { "epoch": 0.5729717036515205, "grad_norm": 1.4920759201049805, "learning_rate": 8.13374299121247e-05, "loss": 0.0501, "step": 9760 }, { "epoch": 0.5735587648232946, "grad_norm": 1.7344402074813843, "learning_rate": 8.130138264975471e-05, "loss": 0.071, "step": 9770 }, { "epoch": 0.5741458259950687, "grad_norm": 1.097912311553955, "learning_rate": 8.126530861276677e-05, "loss": 0.0316, "step": 9780 }, { "epoch": 0.5747328871668428, "grad_norm": 3.1586427688598633, "learning_rate": 8.122920783201793e-05, "loss": 0.0355, "step": 9790 }, { "epoch": 0.5753199483386169, "grad_norm": 2.340651750564575, "learning_rate": 8.119308033838814e-05, "loss": 0.0554, "step": 9800 }, { "epoch": 0.575907009510391, "grad_norm": 0.8285542130470276, "learning_rate": 8.115692616278018e-05, "loss": 0.0373, "step": 9810 }, { "epoch": 0.5764940706821651, "grad_norm": 1.3214105367660522, "learning_rate": 8.112074533611967e-05, "loss": 0.0472, "step": 9820 }, { "epoch": 0.5770811318539392, "grad_norm": 2.7276206016540527, "learning_rate": 8.108453788935498e-05, "loss": 0.0676, "step": 9830 }, { "epoch": 0.5776681930257133, "grad_norm": 3.1436824798583984, "learning_rate": 8.10483038534573e-05, "loss": 0.0467, "step": 9840 }, { "epoch": 0.5782552541974874, "grad_norm": 1.30657160282135, "learning_rate": 8.101204325942056e-05, "loss": 0.0298, "step": 9850 }, { "epoch": 0.5788423153692615, "grad_norm": 0.6837512850761414, "learning_rate": 8.097575613826136e-05, "loss": 0.0597, "step": 9860 }, { "epoch": 0.5794293765410355, "grad_norm": 2.244476795196533, "learning_rate": 8.093944252101907e-05, "loss": 0.0874, "step": 9870 }, { "epoch": 0.5800164377128096, "grad_norm": 2.2716472148895264, "learning_rate": 8.090310243875565e-05, "loss": 0.0374, "step": 9880 }, { "epoch": 0.5806034988845837, "grad_norm": 1.6220306158065796, "learning_rate": 8.086673592255573e-05, "loss": 0.05, "step": 9890 }, { "epoch": 0.5811905600563578, "grad_norm": 1.0101019144058228, "learning_rate": 8.083034300352657e-05, "loss": 0.0625, "step": 9900 }, { "epoch": 0.5817776212281319, "grad_norm": 2.9717745780944824, "learning_rate": 8.079392371279797e-05, "loss": 0.0823, "step": 9910 }, { "epoch": 0.582364682399906, "grad_norm": 2.167039394378662, "learning_rate": 8.075747808152231e-05, "loss": 0.0239, "step": 9920 }, { "epoch": 0.5829517435716801, "grad_norm": 2.5510525703430176, "learning_rate": 8.072100614087453e-05, "loss": 0.0513, "step": 9930 }, { "epoch": 0.5835388047434543, "grad_norm": 2.544032096862793, "learning_rate": 8.068450792205202e-05, "loss": 0.0417, "step": 9940 }, { "epoch": 0.5841258659152284, "grad_norm": 5.8752264976501465, "learning_rate": 8.064798345627468e-05, "loss": 0.0511, "step": 9950 }, { "epoch": 0.5847129270870025, "grad_norm": 2.863516330718994, "learning_rate": 8.061143277478486e-05, "loss": 0.0302, "step": 9960 }, { "epoch": 0.5852999882587766, "grad_norm": 0.4468505382537842, "learning_rate": 8.057485590884733e-05, "loss": 0.0371, "step": 9970 }, { "epoch": 0.5858870494305507, "grad_norm": 2.691464424133301, "learning_rate": 8.053825288974924e-05, "loss": 0.0488, "step": 9980 }, { "epoch": 0.5864741106023248, "grad_norm": 1.3407373428344727, "learning_rate": 8.050162374880015e-05, "loss": 0.055, "step": 9990 }, { "epoch": 0.5870611717740989, "grad_norm": 1.804349422454834, "learning_rate": 8.046496851733193e-05, "loss": 0.0425, "step": 10000 }, { "epoch": 0.587648232945873, "grad_norm": 3.6661617755889893, "learning_rate": 8.042828722669882e-05, "loss": 0.0558, "step": 10010 }, { "epoch": 0.5882352941176471, "grad_norm": 1.8299005031585693, "learning_rate": 8.039157990827727e-05, "loss": 0.048, "step": 10020 }, { "epoch": 0.5888223552894212, "grad_norm": 3.2334094047546387, "learning_rate": 8.0354846593466e-05, "loss": 0.0504, "step": 10030 }, { "epoch": 0.5894094164611953, "grad_norm": 5.981340408325195, "learning_rate": 8.031808731368608e-05, "loss": 0.0594, "step": 10040 }, { "epoch": 0.5899964776329694, "grad_norm": 2.805173873901367, "learning_rate": 8.028130210038067e-05, "loss": 0.0591, "step": 10050 }, { "epoch": 0.5905835388047435, "grad_norm": 0.9651131629943848, "learning_rate": 8.024449098501514e-05, "loss": 0.0635, "step": 10060 }, { "epoch": 0.5911705999765176, "grad_norm": 2.9740376472473145, "learning_rate": 8.020765399907707e-05, "loss": 0.063, "step": 10070 }, { "epoch": 0.5917576611482916, "grad_norm": 1.4581212997436523, "learning_rate": 8.017079117407611e-05, "loss": 0.0433, "step": 10080 }, { "epoch": 0.5923447223200657, "grad_norm": 1.9025243520736694, "learning_rate": 8.013390254154402e-05, "loss": 0.0415, "step": 10090 }, { "epoch": 0.5929317834918398, "grad_norm": 1.7271277904510498, "learning_rate": 8.009698813303465e-05, "loss": 0.0256, "step": 10100 }, { "epoch": 0.5935188446636139, "grad_norm": 2.1059720516204834, "learning_rate": 8.006004798012393e-05, "loss": 0.0573, "step": 10110 }, { "epoch": 0.594105905835388, "grad_norm": 3.541996479034424, "learning_rate": 8.002308211440974e-05, "loss": 0.0412, "step": 10120 }, { "epoch": 0.5946929670071621, "grad_norm": 4.55885648727417, "learning_rate": 7.998609056751199e-05, "loss": 0.0728, "step": 10130 }, { "epoch": 0.5952800281789362, "grad_norm": 4.764308452606201, "learning_rate": 7.994907337107258e-05, "loss": 0.0498, "step": 10140 }, { "epoch": 0.5958670893507103, "grad_norm": 2.9721059799194336, "learning_rate": 7.991203055675532e-05, "loss": 0.0721, "step": 10150 }, { "epoch": 0.5964541505224844, "grad_norm": 4.381247520446777, "learning_rate": 7.987496215624593e-05, "loss": 0.0356, "step": 10160 }, { "epoch": 0.5970412116942585, "grad_norm": 2.219993829727173, "learning_rate": 7.983786820125204e-05, "loss": 0.047, "step": 10170 }, { "epoch": 0.5976282728660326, "grad_norm": 2.0443854331970215, "learning_rate": 7.980074872350312e-05, "loss": 0.0374, "step": 10180 }, { "epoch": 0.5982153340378067, "grad_norm": 1.5770875215530396, "learning_rate": 7.976360375475047e-05, "loss": 0.0428, "step": 10190 }, { "epoch": 0.5988023952095808, "grad_norm": 4.273807048797607, "learning_rate": 7.972643332676723e-05, "loss": 0.061, "step": 10200 }, { "epoch": 0.599389456381355, "grad_norm": 4.913880825042725, "learning_rate": 7.968923747134825e-05, "loss": 0.0538, "step": 10210 }, { "epoch": 0.5999765175531291, "grad_norm": 4.090224742889404, "learning_rate": 7.965201622031021e-05, "loss": 0.0613, "step": 10220 }, { "epoch": 0.6005635787249032, "grad_norm": 0.2985149621963501, "learning_rate": 7.961476960549145e-05, "loss": 0.0388, "step": 10230 }, { "epoch": 0.6011506398966773, "grad_norm": 1.931270718574524, "learning_rate": 7.957749765875204e-05, "loss": 0.0422, "step": 10240 }, { "epoch": 0.6017377010684514, "grad_norm": 1.5263117551803589, "learning_rate": 7.954020041197369e-05, "loss": 0.0412, "step": 10250 }, { "epoch": 0.6023247622402255, "grad_norm": 0.7876232266426086, "learning_rate": 7.950287789705977e-05, "loss": 0.0398, "step": 10260 }, { "epoch": 0.6029118234119996, "grad_norm": 1.9584176540374756, "learning_rate": 7.94655301459353e-05, "loss": 0.0443, "step": 10270 }, { "epoch": 0.6034988845837737, "grad_norm": 1.9195181131362915, "learning_rate": 7.942815719054679e-05, "loss": 0.0569, "step": 10280 }, { "epoch": 0.6040859457555477, "grad_norm": 4.590113639831543, "learning_rate": 7.939075906286241e-05, "loss": 0.0568, "step": 10290 }, { "epoch": 0.6046730069273218, "grad_norm": 4.76384162902832, "learning_rate": 7.935333579487179e-05, "loss": 0.0418, "step": 10300 }, { "epoch": 0.6052600680990959, "grad_norm": 2.2241790294647217, "learning_rate": 7.931588741858612e-05, "loss": 0.0374, "step": 10310 }, { "epoch": 0.60584712927087, "grad_norm": 4.571372985839844, "learning_rate": 7.927841396603804e-05, "loss": 0.045, "step": 10320 }, { "epoch": 0.6064341904426441, "grad_norm": 1.6196099519729614, "learning_rate": 7.924091546928163e-05, "loss": 0.0288, "step": 10330 }, { "epoch": 0.6070212516144182, "grad_norm": 2.5634894371032715, "learning_rate": 7.920339196039239e-05, "loss": 0.0596, "step": 10340 }, { "epoch": 0.6076083127861923, "grad_norm": 2.9246764183044434, "learning_rate": 7.916584347146728e-05, "loss": 0.0436, "step": 10350 }, { "epoch": 0.6081953739579664, "grad_norm": 0.8077777624130249, "learning_rate": 7.912827003462451e-05, "loss": 0.0332, "step": 10360 }, { "epoch": 0.6087824351297405, "grad_norm": 2.4554831981658936, "learning_rate": 7.909067168200375e-05, "loss": 0.0461, "step": 10370 }, { "epoch": 0.6093694963015146, "grad_norm": 3.078833818435669, "learning_rate": 7.905304844576589e-05, "loss": 0.0674, "step": 10380 }, { "epoch": 0.6099565574732887, "grad_norm": 4.6846723556518555, "learning_rate": 7.901540035809316e-05, "loss": 0.0686, "step": 10390 }, { "epoch": 0.6105436186450628, "grad_norm": 1.6238244771957397, "learning_rate": 7.897772745118903e-05, "loss": 0.0378, "step": 10400 }, { "epoch": 0.6111306798168369, "grad_norm": 1.2178541421890259, "learning_rate": 7.89400297572782e-05, "loss": 0.0438, "step": 10410 }, { "epoch": 0.611717740988611, "grad_norm": 2.189246416091919, "learning_rate": 7.890230730860657e-05, "loss": 0.0444, "step": 10420 }, { "epoch": 0.6123048021603851, "grad_norm": 3.594202995300293, "learning_rate": 7.886456013744124e-05, "loss": 0.0593, "step": 10430 }, { "epoch": 0.6128918633321592, "grad_norm": 1.5608102083206177, "learning_rate": 7.88267882760704e-05, "loss": 0.0617, "step": 10440 }, { "epoch": 0.6134789245039333, "grad_norm": 1.3150290250778198, "learning_rate": 7.878899175680341e-05, "loss": 0.0524, "step": 10450 }, { "epoch": 0.6140659856757074, "grad_norm": 2.178785800933838, "learning_rate": 7.875117061197071e-05, "loss": 0.0609, "step": 10460 }, { "epoch": 0.6146530468474815, "grad_norm": 2.4178595542907715, "learning_rate": 7.87133248739238e-05, "loss": 0.0411, "step": 10470 }, { "epoch": 0.6152401080192557, "grad_norm": 0.16792283952236176, "learning_rate": 7.867545457503521e-05, "loss": 0.0546, "step": 10480 }, { "epoch": 0.6158271691910298, "grad_norm": 2.0843019485473633, "learning_rate": 7.863755974769851e-05, "loss": 0.0383, "step": 10490 }, { "epoch": 0.6164142303628038, "grad_norm": 2.229863405227661, "learning_rate": 7.859964042432819e-05, "loss": 0.0403, "step": 10500 }, { "epoch": 0.617001291534578, "grad_norm": 1.8265129327774048, "learning_rate": 7.856169663735975e-05, "loss": 0.0373, "step": 10510 }, { "epoch": 0.617588352706352, "grad_norm": 1.1807312965393066, "learning_rate": 7.852372841924961e-05, "loss": 0.0614, "step": 10520 }, { "epoch": 0.6181754138781261, "grad_norm": 1.4932291507720947, "learning_rate": 7.848573580247505e-05, "loss": 0.0364, "step": 10530 }, { "epoch": 0.6187624750499002, "grad_norm": 1.2048351764678955, "learning_rate": 7.844771881953425e-05, "loss": 0.0544, "step": 10540 }, { "epoch": 0.6193495362216743, "grad_norm": 2.292145013809204, "learning_rate": 7.840967750294626e-05, "loss": 0.0681, "step": 10550 }, { "epoch": 0.6199365973934484, "grad_norm": 2.7694923877716064, "learning_rate": 7.837161188525087e-05, "loss": 0.0669, "step": 10560 }, { "epoch": 0.6205236585652225, "grad_norm": 2.548207998275757, "learning_rate": 7.83335219990087e-05, "loss": 0.0528, "step": 10570 }, { "epoch": 0.6211107197369966, "grad_norm": 2.0487241744995117, "learning_rate": 7.829540787680114e-05, "loss": 0.0484, "step": 10580 }, { "epoch": 0.6216977809087707, "grad_norm": 1.4640957117080688, "learning_rate": 7.82572695512303e-05, "loss": 0.0537, "step": 10590 }, { "epoch": 0.6222848420805448, "grad_norm": 3.0031116008758545, "learning_rate": 7.8219107054919e-05, "loss": 0.0541, "step": 10600 }, { "epoch": 0.6228719032523189, "grad_norm": 0.8556775450706482, "learning_rate": 7.818092042051071e-05, "loss": 0.0432, "step": 10610 }, { "epoch": 0.623458964424093, "grad_norm": 2.426072835922241, "learning_rate": 7.814270968066956e-05, "loss": 0.0277, "step": 10620 }, { "epoch": 0.6240460255958671, "grad_norm": 0.1409802883863449, "learning_rate": 7.810447486808032e-05, "loss": 0.0429, "step": 10630 }, { "epoch": 0.6246330867676412, "grad_norm": 3.363037586212158, "learning_rate": 7.806621601544832e-05, "loss": 0.0344, "step": 10640 }, { "epoch": 0.6252201479394153, "grad_norm": 4.5818705558776855, "learning_rate": 7.802793315549948e-05, "loss": 0.0552, "step": 10650 }, { "epoch": 0.6258072091111894, "grad_norm": 3.9714221954345703, "learning_rate": 7.798962632098024e-05, "loss": 0.0417, "step": 10660 }, { "epoch": 0.6263942702829635, "grad_norm": 3.6985998153686523, "learning_rate": 7.795129554465754e-05, "loss": 0.0538, "step": 10670 }, { "epoch": 0.6269813314547376, "grad_norm": 1.2744144201278687, "learning_rate": 7.791294085931882e-05, "loss": 0.0454, "step": 10680 }, { "epoch": 0.6275683926265117, "grad_norm": 1.1511504650115967, "learning_rate": 7.787456229777196e-05, "loss": 0.0549, "step": 10690 }, { "epoch": 0.6281554537982857, "grad_norm": 2.6022820472717285, "learning_rate": 7.783615989284527e-05, "loss": 0.0552, "step": 10700 }, { "epoch": 0.6287425149700598, "grad_norm": 2.784878969192505, "learning_rate": 7.779773367738743e-05, "loss": 0.06, "step": 10710 }, { "epoch": 0.6293295761418339, "grad_norm": 0.13731670379638672, "learning_rate": 7.775928368426751e-05, "loss": 0.0258, "step": 10720 }, { "epoch": 0.629916637313608, "grad_norm": 3.121769428253174, "learning_rate": 7.772080994637494e-05, "loss": 0.0436, "step": 10730 }, { "epoch": 0.6305036984853821, "grad_norm": 5.707434177398682, "learning_rate": 7.768231249661942e-05, "loss": 0.0845, "step": 10740 }, { "epoch": 0.6310907596571563, "grad_norm": 2.115462303161621, "learning_rate": 7.764379136793096e-05, "loss": 0.0649, "step": 10750 }, { "epoch": 0.6316778208289304, "grad_norm": 1.9501739740371704, "learning_rate": 7.760524659325981e-05, "loss": 0.0424, "step": 10760 }, { "epoch": 0.6322648820007045, "grad_norm": 2.767653703689575, "learning_rate": 7.756667820557644e-05, "loss": 0.0598, "step": 10770 }, { "epoch": 0.6328519431724786, "grad_norm": 1.8406312465667725, "learning_rate": 7.752808623787152e-05, "loss": 0.0449, "step": 10780 }, { "epoch": 0.6334390043442527, "grad_norm": 1.769200325012207, "learning_rate": 7.748947072315592e-05, "loss": 0.0619, "step": 10790 }, { "epoch": 0.6340260655160268, "grad_norm": 2.3951313495635986, "learning_rate": 7.745083169446064e-05, "loss": 0.0337, "step": 10800 }, { "epoch": 0.6346131266878009, "grad_norm": 3.2387876510620117, "learning_rate": 7.741216918483674e-05, "loss": 0.0526, "step": 10810 }, { "epoch": 0.635200187859575, "grad_norm": 0.21347936987876892, "learning_rate": 7.737348322735545e-05, "loss": 0.0288, "step": 10820 }, { "epoch": 0.6357872490313491, "grad_norm": 2.916245460510254, "learning_rate": 7.7334773855108e-05, "loss": 0.0534, "step": 10830 }, { "epoch": 0.6363743102031232, "grad_norm": 2.786033868789673, "learning_rate": 7.729604110120564e-05, "loss": 0.0542, "step": 10840 }, { "epoch": 0.6369613713748973, "grad_norm": 3.163647413253784, "learning_rate": 7.725728499877967e-05, "loss": 0.0378, "step": 10850 }, { "epoch": 0.6375484325466714, "grad_norm": 0.20459908246994019, "learning_rate": 7.721850558098136e-05, "loss": 0.0609, "step": 10860 }, { "epoch": 0.6381354937184455, "grad_norm": 2.6411170959472656, "learning_rate": 7.717970288098184e-05, "loss": 0.0318, "step": 10870 }, { "epoch": 0.6387225548902196, "grad_norm": 1.81226646900177, "learning_rate": 7.714087693197227e-05, "loss": 0.0664, "step": 10880 }, { "epoch": 0.6393096160619937, "grad_norm": 2.738131284713745, "learning_rate": 7.710202776716362e-05, "loss": 0.0433, "step": 10890 }, { "epoch": 0.6398966772337678, "grad_norm": 2.4232263565063477, "learning_rate": 7.706315541978673e-05, "loss": 0.0451, "step": 10900 }, { "epoch": 0.6404837384055418, "grad_norm": 1.342581033706665, "learning_rate": 7.702425992309229e-05, "loss": 0.0772, "step": 10910 }, { "epoch": 0.6410707995773159, "grad_norm": 2.0487866401672363, "learning_rate": 7.698534131035077e-05, "loss": 0.0331, "step": 10920 }, { "epoch": 0.64165786074909, "grad_norm": 2.9351892471313477, "learning_rate": 7.694639961485246e-05, "loss": 0.0486, "step": 10930 }, { "epoch": 0.6422449219208641, "grad_norm": 2.427748203277588, "learning_rate": 7.69074348699073e-05, "loss": 0.065, "step": 10940 }, { "epoch": 0.6428319830926382, "grad_norm": 1.4076740741729736, "learning_rate": 7.686844710884506e-05, "loss": 0.0543, "step": 10950 }, { "epoch": 0.6434190442644123, "grad_norm": 1.4840725660324097, "learning_rate": 7.682943636501512e-05, "loss": 0.05, "step": 10960 }, { "epoch": 0.6440061054361864, "grad_norm": 2.7182629108428955, "learning_rate": 7.679040267178653e-05, "loss": 0.0533, "step": 10970 }, { "epoch": 0.6445931666079605, "grad_norm": 6.214156150817871, "learning_rate": 7.675134606254799e-05, "loss": 0.0804, "step": 10980 }, { "epoch": 0.6451802277797346, "grad_norm": 2.520094871520996, "learning_rate": 7.67122665707078e-05, "loss": 0.0378, "step": 10990 }, { "epoch": 0.6457672889515087, "grad_norm": 2.512451648712158, "learning_rate": 7.667316422969383e-05, "loss": 0.0903, "step": 11000 }, { "epoch": 0.6463543501232828, "grad_norm": 2.207740545272827, "learning_rate": 7.663403907295348e-05, "loss": 0.0524, "step": 11010 }, { "epoch": 0.6469414112950569, "grad_norm": 2.339881181716919, "learning_rate": 7.65948911339537e-05, "loss": 0.0543, "step": 11020 }, { "epoch": 0.6475284724668311, "grad_norm": 1.7187379598617554, "learning_rate": 7.655572044618086e-05, "loss": 0.0529, "step": 11030 }, { "epoch": 0.6481155336386052, "grad_norm": 1.831847906112671, "learning_rate": 7.65165270431409e-05, "loss": 0.0552, "step": 11040 }, { "epoch": 0.6487025948103793, "grad_norm": 1.8412458896636963, "learning_rate": 7.647731095835906e-05, "loss": 0.0527, "step": 11050 }, { "epoch": 0.6492896559821534, "grad_norm": 0.5242549777030945, "learning_rate": 7.64380722253801e-05, "loss": 0.0296, "step": 11060 }, { "epoch": 0.6498767171539275, "grad_norm": 2.6345746517181396, "learning_rate": 7.639881087776807e-05, "loss": 0.0571, "step": 11070 }, { "epoch": 0.6504637783257016, "grad_norm": 2.9504003524780273, "learning_rate": 7.635952694910637e-05, "loss": 0.0356, "step": 11080 }, { "epoch": 0.6510508394974757, "grad_norm": 0.4250659942626953, "learning_rate": 7.632022047299781e-05, "loss": 0.0368, "step": 11090 }, { "epoch": 0.6516379006692498, "grad_norm": 2.3709330558776855, "learning_rate": 7.628089148306434e-05, "loss": 0.0538, "step": 11100 }, { "epoch": 0.6522249618410239, "grad_norm": 2.345409870147705, "learning_rate": 7.624154001294729e-05, "loss": 0.0519, "step": 11110 }, { "epoch": 0.652812023012798, "grad_norm": 2.8023202419281006, "learning_rate": 7.620216609630715e-05, "loss": 0.085, "step": 11120 }, { "epoch": 0.653399084184572, "grad_norm": 3.364889144897461, "learning_rate": 7.616276976682365e-05, "loss": 0.0424, "step": 11130 }, { "epoch": 0.6539861453563461, "grad_norm": 2.60764741897583, "learning_rate": 7.612335105819565e-05, "loss": 0.0462, "step": 11140 }, { "epoch": 0.6545732065281202, "grad_norm": 2.6991515159606934, "learning_rate": 7.608391000414118e-05, "loss": 0.0406, "step": 11150 }, { "epoch": 0.6551602676998943, "grad_norm": 2.6384434700012207, "learning_rate": 7.604444663839743e-05, "loss": 0.0572, "step": 11160 }, { "epoch": 0.6557473288716684, "grad_norm": 5.621922969818115, "learning_rate": 7.600496099472057e-05, "loss": 0.0549, "step": 11170 }, { "epoch": 0.6563343900434425, "grad_norm": 2.415738821029663, "learning_rate": 7.59654531068859e-05, "loss": 0.0636, "step": 11180 }, { "epoch": 0.6569214512152166, "grad_norm": 2.35561203956604, "learning_rate": 7.592592300868774e-05, "loss": 0.0292, "step": 11190 }, { "epoch": 0.6575085123869907, "grad_norm": 1.058709740638733, "learning_rate": 7.588637073393935e-05, "loss": 0.0541, "step": 11200 }, { "epoch": 0.6580955735587648, "grad_norm": 1.6674184799194336, "learning_rate": 7.58467963164731e-05, "loss": 0.0602, "step": 11210 }, { "epoch": 0.6586826347305389, "grad_norm": 2.6472854614257812, "learning_rate": 7.580719979014012e-05, "loss": 0.0486, "step": 11220 }, { "epoch": 0.659269695902313, "grad_norm": 3.3469183444976807, "learning_rate": 7.576758118881056e-05, "loss": 0.0595, "step": 11230 }, { "epoch": 0.6598567570740871, "grad_norm": 1.0797492265701294, "learning_rate": 7.572794054637347e-05, "loss": 0.0389, "step": 11240 }, { "epoch": 0.6604438182458612, "grad_norm": 3.399881601333618, "learning_rate": 7.568827789673665e-05, "loss": 0.0698, "step": 11250 }, { "epoch": 0.6610308794176353, "grad_norm": 2.2885448932647705, "learning_rate": 7.564859327382685e-05, "loss": 0.0473, "step": 11260 }, { "epoch": 0.6616179405894094, "grad_norm": 1.8934096097946167, "learning_rate": 7.560888671158953e-05, "loss": 0.0738, "step": 11270 }, { "epoch": 0.6622050017611835, "grad_norm": 1.3255971670150757, "learning_rate": 7.556915824398894e-05, "loss": 0.0431, "step": 11280 }, { "epoch": 0.6627920629329576, "grad_norm": 2.498289108276367, "learning_rate": 7.552940790500806e-05, "loss": 0.0449, "step": 11290 }, { "epoch": 0.6633791241047318, "grad_norm": 1.8834761381149292, "learning_rate": 7.54896357286486e-05, "loss": 0.0332, "step": 11300 }, { "epoch": 0.6639661852765059, "grad_norm": 4.0630621910095215, "learning_rate": 7.544984174893095e-05, "loss": 0.0515, "step": 11310 }, { "epoch": 0.66455324644828, "grad_norm": 2.2917168140411377, "learning_rate": 7.54100259998941e-05, "loss": 0.0303, "step": 11320 }, { "epoch": 0.665140307620054, "grad_norm": 1.560427188873291, "learning_rate": 7.537018851559576e-05, "loss": 0.0245, "step": 11330 }, { "epoch": 0.6657273687918281, "grad_norm": 2.269132137298584, "learning_rate": 7.533032933011209e-05, "loss": 0.0624, "step": 11340 }, { "epoch": 0.6663144299636022, "grad_norm": 2.116055488586426, "learning_rate": 7.529044847753795e-05, "loss": 0.0668, "step": 11350 }, { "epoch": 0.6669014911353763, "grad_norm": 2.3280317783355713, "learning_rate": 7.525054599198666e-05, "loss": 0.0369, "step": 11360 }, { "epoch": 0.6674885523071504, "grad_norm": 1.0251444578170776, "learning_rate": 7.521062190759005e-05, "loss": 0.0394, "step": 11370 }, { "epoch": 0.6680756134789245, "grad_norm": 1.3464771509170532, "learning_rate": 7.517067625849846e-05, "loss": 0.0494, "step": 11380 }, { "epoch": 0.6686626746506986, "grad_norm": 1.4254645109176636, "learning_rate": 7.513070907888065e-05, "loss": 0.072, "step": 11390 }, { "epoch": 0.6692497358224727, "grad_norm": 2.681849718093872, "learning_rate": 7.509072040292376e-05, "loss": 0.049, "step": 11400 }, { "epoch": 0.6698367969942468, "grad_norm": 1.6129744052886963, "learning_rate": 7.505071026483337e-05, "loss": 0.0429, "step": 11410 }, { "epoch": 0.6704238581660209, "grad_norm": 2.020989179611206, "learning_rate": 7.501067869883344e-05, "loss": 0.0524, "step": 11420 }, { "epoch": 0.671010919337795, "grad_norm": 1.7132612466812134, "learning_rate": 7.49706257391662e-05, "loss": 0.0539, "step": 11430 }, { "epoch": 0.6715979805095691, "grad_norm": 2.2620656490325928, "learning_rate": 7.49305514200922e-05, "loss": 0.0468, "step": 11440 }, { "epoch": 0.6721850416813432, "grad_norm": 0.48451223969459534, "learning_rate": 7.489045577589026e-05, "loss": 0.0487, "step": 11450 }, { "epoch": 0.6727721028531173, "grad_norm": 1.3714998960494995, "learning_rate": 7.485033884085746e-05, "loss": 0.0375, "step": 11460 }, { "epoch": 0.6733591640248914, "grad_norm": 3.537254810333252, "learning_rate": 7.481020064930908e-05, "loss": 0.0404, "step": 11470 }, { "epoch": 0.6739462251966655, "grad_norm": 1.1268501281738281, "learning_rate": 7.477004123557855e-05, "loss": 0.0611, "step": 11480 }, { "epoch": 0.6745332863684396, "grad_norm": 1.7831934690475464, "learning_rate": 7.472986063401751e-05, "loss": 0.0311, "step": 11490 }, { "epoch": 0.6751203475402137, "grad_norm": 3.192962646484375, "learning_rate": 7.46896588789957e-05, "loss": 0.0734, "step": 11500 }, { "epoch": 0.6757074087119878, "grad_norm": 5.8145976066589355, "learning_rate": 7.464943600490094e-05, "loss": 0.0388, "step": 11510 }, { "epoch": 0.6762944698837619, "grad_norm": 2.2048866748809814, "learning_rate": 7.46091920461391e-05, "loss": 0.0405, "step": 11520 }, { "epoch": 0.676881531055536, "grad_norm": 4.10899543762207, "learning_rate": 7.456892703713415e-05, "loss": 0.0497, "step": 11530 }, { "epoch": 0.67746859222731, "grad_norm": 3.3961877822875977, "learning_rate": 7.452864101232798e-05, "loss": 0.0415, "step": 11540 }, { "epoch": 0.6780556533990841, "grad_norm": 0.6255916357040405, "learning_rate": 7.448833400618055e-05, "loss": 0.0557, "step": 11550 }, { "epoch": 0.6786427145708582, "grad_norm": 2.311728000640869, "learning_rate": 7.44480060531697e-05, "loss": 0.0401, "step": 11560 }, { "epoch": 0.6792297757426324, "grad_norm": 1.7161227464675903, "learning_rate": 7.440765718779124e-05, "loss": 0.0545, "step": 11570 }, { "epoch": 0.6798168369144065, "grad_norm": 1.3901468515396118, "learning_rate": 7.436728744455877e-05, "loss": 0.0333, "step": 11580 }, { "epoch": 0.6804038980861806, "grad_norm": 2.844081163406372, "learning_rate": 7.432689685800386e-05, "loss": 0.0448, "step": 11590 }, { "epoch": 0.6809909592579547, "grad_norm": 2.115511417388916, "learning_rate": 7.428648546267586e-05, "loss": 0.0493, "step": 11600 }, { "epoch": 0.6815780204297288, "grad_norm": 1.6813451051712036, "learning_rate": 7.42460532931419e-05, "loss": 0.0536, "step": 11610 }, { "epoch": 0.6821650816015029, "grad_norm": 3.8245468139648438, "learning_rate": 7.420560038398694e-05, "loss": 0.0514, "step": 11620 }, { "epoch": 0.682752142773277, "grad_norm": 1.9684380292892456, "learning_rate": 7.416512676981359e-05, "loss": 0.0508, "step": 11630 }, { "epoch": 0.6833392039450511, "grad_norm": 1.6315116882324219, "learning_rate": 7.412463248524229e-05, "loss": 0.0356, "step": 11640 }, { "epoch": 0.6839262651168252, "grad_norm": 2.6481685638427734, "learning_rate": 7.408411756491104e-05, "loss": 0.0279, "step": 11650 }, { "epoch": 0.6845133262885993, "grad_norm": 3.495072841644287, "learning_rate": 7.404358204347557e-05, "loss": 0.0368, "step": 11660 }, { "epoch": 0.6851003874603734, "grad_norm": 2.3664352893829346, "learning_rate": 7.400302595560919e-05, "loss": 0.0501, "step": 11670 }, { "epoch": 0.6856874486321475, "grad_norm": 2.232999324798584, "learning_rate": 7.396244933600285e-05, "loss": 0.0556, "step": 11680 }, { "epoch": 0.6862745098039216, "grad_norm": 2.578655958175659, "learning_rate": 7.3921852219365e-05, "loss": 0.0325, "step": 11690 }, { "epoch": 0.6868615709756957, "grad_norm": 1.3885486125946045, "learning_rate": 7.388123464042167e-05, "loss": 0.0493, "step": 11700 }, { "epoch": 0.6874486321474698, "grad_norm": 0.7150513529777527, "learning_rate": 7.38405966339164e-05, "loss": 0.0607, "step": 11710 }, { "epoch": 0.6880356933192439, "grad_norm": 2.9040191173553467, "learning_rate": 7.379993823461014e-05, "loss": 0.0306, "step": 11720 }, { "epoch": 0.688622754491018, "grad_norm": 2.2759017944335938, "learning_rate": 7.375925947728135e-05, "loss": 0.0472, "step": 11730 }, { "epoch": 0.689209815662792, "grad_norm": 1.5399824380874634, "learning_rate": 7.371856039672586e-05, "loss": 0.0414, "step": 11740 }, { "epoch": 0.6897968768345661, "grad_norm": 0.29096680879592896, "learning_rate": 7.367784102775694e-05, "loss": 0.0368, "step": 11750 }, { "epoch": 0.6903839380063402, "grad_norm": 1.2176077365875244, "learning_rate": 7.363710140520514e-05, "loss": 0.043, "step": 11760 }, { "epoch": 0.6909709991781143, "grad_norm": 4.282761573791504, "learning_rate": 7.35963415639184e-05, "loss": 0.0378, "step": 11770 }, { "epoch": 0.6915580603498884, "grad_norm": 2.395709991455078, "learning_rate": 7.35555615387619e-05, "loss": 0.0579, "step": 11780 }, { "epoch": 0.6921451215216625, "grad_norm": 2.70701265335083, "learning_rate": 7.351476136461814e-05, "loss": 0.0503, "step": 11790 }, { "epoch": 0.6927321826934366, "grad_norm": 1.529801607131958, "learning_rate": 7.34739410763868e-05, "loss": 0.071, "step": 11800 }, { "epoch": 0.6933192438652107, "grad_norm": 3.3718185424804688, "learning_rate": 7.34331007089848e-05, "loss": 0.0372, "step": 11810 }, { "epoch": 0.6939063050369848, "grad_norm": 2.4751768112182617, "learning_rate": 7.339224029734623e-05, "loss": 0.0528, "step": 11820 }, { "epoch": 0.6944933662087589, "grad_norm": 0.7187146544456482, "learning_rate": 7.335135987642233e-05, "loss": 0.0317, "step": 11830 }, { "epoch": 0.6950804273805331, "grad_norm": 1.9040707349777222, "learning_rate": 7.331045948118144e-05, "loss": 0.0619, "step": 11840 }, { "epoch": 0.6956674885523072, "grad_norm": 1.1234776973724365, "learning_rate": 7.3269539146609e-05, "loss": 0.0431, "step": 11850 }, { "epoch": 0.6962545497240813, "grad_norm": 3.2750253677368164, "learning_rate": 7.32285989077075e-05, "loss": 0.0497, "step": 11860 }, { "epoch": 0.6968416108958554, "grad_norm": 2.447385311126709, "learning_rate": 7.318763879949644e-05, "loss": 0.0568, "step": 11870 }, { "epoch": 0.6974286720676295, "grad_norm": 2.359724283218384, "learning_rate": 7.314665885701234e-05, "loss": 0.0477, "step": 11880 }, { "epoch": 0.6980157332394036, "grad_norm": 2.8300561904907227, "learning_rate": 7.310565911530869e-05, "loss": 0.0623, "step": 11890 }, { "epoch": 0.6986027944111777, "grad_norm": 2.227444887161255, "learning_rate": 7.30646396094559e-05, "loss": 0.0366, "step": 11900 }, { "epoch": 0.6991898555829518, "grad_norm": 0.7826442718505859, "learning_rate": 7.302360037454128e-05, "loss": 0.0334, "step": 11910 }, { "epoch": 0.6997769167547259, "grad_norm": 0.606780469417572, "learning_rate": 7.298254144566901e-05, "loss": 0.0493, "step": 11920 }, { "epoch": 0.7003639779265, "grad_norm": 3.1362287998199463, "learning_rate": 7.294146285796015e-05, "loss": 0.0564, "step": 11930 }, { "epoch": 0.700951039098274, "grad_norm": 2.817150354385376, "learning_rate": 7.290036464655257e-05, "loss": 0.0601, "step": 11940 }, { "epoch": 0.7015381002700481, "grad_norm": 2.4754340648651123, "learning_rate": 7.285924684660089e-05, "loss": 0.0756, "step": 11950 }, { "epoch": 0.7021251614418222, "grad_norm": 2.2875254154205322, "learning_rate": 7.281810949327651e-05, "loss": 0.0392, "step": 11960 }, { "epoch": 0.7027122226135963, "grad_norm": 1.9385100603103638, "learning_rate": 7.277695262176756e-05, "loss": 0.0506, "step": 11970 }, { "epoch": 0.7032992837853704, "grad_norm": 2.7072794437408447, "learning_rate": 7.273577626727884e-05, "loss": 0.0547, "step": 11980 }, { "epoch": 0.7038863449571445, "grad_norm": 1.3798446655273438, "learning_rate": 7.269458046503187e-05, "loss": 0.0281, "step": 11990 }, { "epoch": 0.7044734061289186, "grad_norm": 1.3505398035049438, "learning_rate": 7.265336525026476e-05, "loss": 0.0324, "step": 12000 }, { "epoch": 0.7044734061289186, "eval_loss": 0.42484381794929504, "eval_runtime": 269.3958, "eval_samples_per_second": 3.508, "eval_steps_per_second": 3.508, "step": 12000 }, { "epoch": 0.7050604673006927, "grad_norm": 1.4764763116836548, "learning_rate": 7.26121306582322e-05, "loss": 0.0477, "step": 12010 }, { "epoch": 0.7056475284724668, "grad_norm": 3.2928757667541504, "learning_rate": 7.257087672420553e-05, "loss": 0.0534, "step": 12020 }, { "epoch": 0.7062345896442409, "grad_norm": 0.3668311536312103, "learning_rate": 7.252960348347258e-05, "loss": 0.0517, "step": 12030 }, { "epoch": 0.706821650816015, "grad_norm": 1.4070870876312256, "learning_rate": 7.24883109713377e-05, "loss": 0.0328, "step": 12040 }, { "epoch": 0.7074087119877891, "grad_norm": 2.4994914531707764, "learning_rate": 7.244699922312176e-05, "loss": 0.0346, "step": 12050 }, { "epoch": 0.7079957731595632, "grad_norm": 2.1366090774536133, "learning_rate": 7.240566827416204e-05, "loss": 0.0533, "step": 12060 }, { "epoch": 0.7085828343313373, "grad_norm": 2.076207399368286, "learning_rate": 7.236431815981223e-05, "loss": 0.0561, "step": 12070 }, { "epoch": 0.7091698955031114, "grad_norm": 0.8552598357200623, "learning_rate": 7.23229489154425e-05, "loss": 0.0416, "step": 12080 }, { "epoch": 0.7097569566748855, "grad_norm": 2.1739418506622314, "learning_rate": 7.22815605764393e-05, "loss": 0.0461, "step": 12090 }, { "epoch": 0.7103440178466596, "grad_norm": 1.3149423599243164, "learning_rate": 7.224015317820544e-05, "loss": 0.0651, "step": 12100 }, { "epoch": 0.7109310790184338, "grad_norm": 3.0229716300964355, "learning_rate": 7.219872675616006e-05, "loss": 0.04, "step": 12110 }, { "epoch": 0.7115181401902079, "grad_norm": 1.1393182277679443, "learning_rate": 7.215728134573852e-05, "loss": 0.0396, "step": 12120 }, { "epoch": 0.712105201361982, "grad_norm": 3.466083526611328, "learning_rate": 7.211581698239245e-05, "loss": 0.0583, "step": 12130 }, { "epoch": 0.7126922625337561, "grad_norm": 0.6670250296592712, "learning_rate": 7.207433370158972e-05, "loss": 0.0314, "step": 12140 }, { "epoch": 0.7132793237055302, "grad_norm": 0.23745274543762207, "learning_rate": 7.203283153881432e-05, "loss": 0.0368, "step": 12150 }, { "epoch": 0.7138663848773042, "grad_norm": 1.116580605506897, "learning_rate": 7.199131052956644e-05, "loss": 0.0376, "step": 12160 }, { "epoch": 0.7144534460490783, "grad_norm": 5.257540225982666, "learning_rate": 7.194977070936239e-05, "loss": 0.0498, "step": 12170 }, { "epoch": 0.7150405072208524, "grad_norm": 1.307039499282837, "learning_rate": 7.190821211373453e-05, "loss": 0.0608, "step": 12180 }, { "epoch": 0.7156275683926265, "grad_norm": 0.8632544875144958, "learning_rate": 7.18666347782313e-05, "loss": 0.0446, "step": 12190 }, { "epoch": 0.7162146295644006, "grad_norm": 1.5932352542877197, "learning_rate": 7.182503873841722e-05, "loss": 0.0391, "step": 12200 }, { "epoch": 0.7168016907361747, "grad_norm": 3.4451279640197754, "learning_rate": 7.178342402987272e-05, "loss": 0.0465, "step": 12210 }, { "epoch": 0.7173887519079488, "grad_norm": 1.3696842193603516, "learning_rate": 7.174179068819428e-05, "loss": 0.0402, "step": 12220 }, { "epoch": 0.7179758130797229, "grad_norm": 2.310307025909424, "learning_rate": 7.170013874899426e-05, "loss": 0.0449, "step": 12230 }, { "epoch": 0.718562874251497, "grad_norm": 2.0366644859313965, "learning_rate": 7.165846824790095e-05, "loss": 0.0278, "step": 12240 }, { "epoch": 0.7191499354232711, "grad_norm": 3.133538246154785, "learning_rate": 7.161677922055853e-05, "loss": 0.033, "step": 12250 }, { "epoch": 0.7197369965950452, "grad_norm": 2.494349479675293, "learning_rate": 7.157507170262701e-05, "loss": 0.0445, "step": 12260 }, { "epoch": 0.7203240577668193, "grad_norm": 0.7048376798629761, "learning_rate": 7.153334572978221e-05, "loss": 0.0628, "step": 12270 }, { "epoch": 0.7209111189385934, "grad_norm": 0.8292019963264465, "learning_rate": 7.149160133771577e-05, "loss": 0.0556, "step": 12280 }, { "epoch": 0.7214981801103675, "grad_norm": 0.5695822834968567, "learning_rate": 7.144983856213507e-05, "loss": 0.038, "step": 12290 }, { "epoch": 0.7220852412821416, "grad_norm": 0.4260999858379364, "learning_rate": 7.140805743876317e-05, "loss": 0.0297, "step": 12300 }, { "epoch": 0.7226723024539157, "grad_norm": 2.661212921142578, "learning_rate": 7.136625800333887e-05, "loss": 0.0763, "step": 12310 }, { "epoch": 0.7232593636256898, "grad_norm": 2.367537260055542, "learning_rate": 7.132444029161667e-05, "loss": 0.0333, "step": 12320 }, { "epoch": 0.7238464247974639, "grad_norm": 1.2273950576782227, "learning_rate": 7.12826043393666e-05, "loss": 0.0364, "step": 12330 }, { "epoch": 0.724433485969238, "grad_norm": 1.5182220935821533, "learning_rate": 7.12407501823744e-05, "loss": 0.0578, "step": 12340 }, { "epoch": 0.725020547141012, "grad_norm": 3.5969302654266357, "learning_rate": 7.11988778564413e-05, "loss": 0.0633, "step": 12350 }, { "epoch": 0.7256076083127861, "grad_norm": 1.830116868019104, "learning_rate": 7.115698739738412e-05, "loss": 0.0545, "step": 12360 }, { "epoch": 0.7261946694845602, "grad_norm": 4.492576599121094, "learning_rate": 7.111507884103518e-05, "loss": 0.058, "step": 12370 }, { "epoch": 0.7267817306563343, "grad_norm": 0.652728259563446, "learning_rate": 7.107315222324227e-05, "loss": 0.0552, "step": 12380 }, { "epoch": 0.7273687918281085, "grad_norm": 1.4836835861206055, "learning_rate": 7.103120757986864e-05, "loss": 0.0467, "step": 12390 }, { "epoch": 0.7279558529998826, "grad_norm": 1.9405089616775513, "learning_rate": 7.098924494679295e-05, "loss": 0.0202, "step": 12400 }, { "epoch": 0.7285429141716567, "grad_norm": 1.2933133840560913, "learning_rate": 7.094726435990926e-05, "loss": 0.0196, "step": 12410 }, { "epoch": 0.7291299753434308, "grad_norm": 4.597402095794678, "learning_rate": 7.090526585512696e-05, "loss": 0.0403, "step": 12420 }, { "epoch": 0.7297170365152049, "grad_norm": 1.3302552700042725, "learning_rate": 7.086324946837081e-05, "loss": 0.0432, "step": 12430 }, { "epoch": 0.730304097686979, "grad_norm": 2.8235247135162354, "learning_rate": 7.082121523558083e-05, "loss": 0.1037, "step": 12440 }, { "epoch": 0.7308911588587531, "grad_norm": 4.161515235900879, "learning_rate": 7.077916319271232e-05, "loss": 0.0592, "step": 12450 }, { "epoch": 0.7314782200305272, "grad_norm": 2.3160433769226074, "learning_rate": 7.073709337573581e-05, "loss": 0.0754, "step": 12460 }, { "epoch": 0.7320652812023013, "grad_norm": 1.7251265048980713, "learning_rate": 7.069500582063702e-05, "loss": 0.0432, "step": 12470 }, { "epoch": 0.7326523423740754, "grad_norm": 1.1279487609863281, "learning_rate": 7.06529005634169e-05, "loss": 0.0325, "step": 12480 }, { "epoch": 0.7332394035458495, "grad_norm": 1.057920217514038, "learning_rate": 7.061077764009147e-05, "loss": 0.047, "step": 12490 }, { "epoch": 0.7338264647176236, "grad_norm": 2.576719284057617, "learning_rate": 7.05686370866919e-05, "loss": 0.0279, "step": 12500 }, { "epoch": 0.7344135258893977, "grad_norm": 4.218789100646973, "learning_rate": 7.052647893926442e-05, "loss": 0.0595, "step": 12510 }, { "epoch": 0.7350005870611718, "grad_norm": 2.4572722911834717, "learning_rate": 7.048430323387034e-05, "loss": 0.0529, "step": 12520 }, { "epoch": 0.7355876482329459, "grad_norm": 1.0583401918411255, "learning_rate": 7.044211000658595e-05, "loss": 0.0535, "step": 12530 }, { "epoch": 0.73617470940472, "grad_norm": 0.5790993571281433, "learning_rate": 7.039989929350257e-05, "loss": 0.0328, "step": 12540 }, { "epoch": 0.7367617705764941, "grad_norm": 1.6805678606033325, "learning_rate": 7.035767113072645e-05, "loss": 0.0919, "step": 12550 }, { "epoch": 0.7373488317482682, "grad_norm": 2.538081645965576, "learning_rate": 7.031542555437876e-05, "loss": 0.079, "step": 12560 }, { "epoch": 0.7379358929200422, "grad_norm": 2.5677759647369385, "learning_rate": 7.027316260059558e-05, "loss": 0.0625, "step": 12570 }, { "epoch": 0.7385229540918163, "grad_norm": 0.23643195629119873, "learning_rate": 7.023088230552787e-05, "loss": 0.0626, "step": 12580 }, { "epoch": 0.7391100152635904, "grad_norm": 2.1477859020233154, "learning_rate": 7.018858470534138e-05, "loss": 0.0504, "step": 12590 }, { "epoch": 0.7396970764353645, "grad_norm": 1.8838815689086914, "learning_rate": 7.014626983621669e-05, "loss": 0.0457, "step": 12600 }, { "epoch": 0.7402841376071386, "grad_norm": 0.7412036061286926, "learning_rate": 7.010393773434917e-05, "loss": 0.0778, "step": 12610 }, { "epoch": 0.7408711987789127, "grad_norm": 2.408327579498291, "learning_rate": 7.006158843594887e-05, "loss": 0.0631, "step": 12620 }, { "epoch": 0.7414582599506868, "grad_norm": 4.956777572631836, "learning_rate": 7.001922197724063e-05, "loss": 0.0486, "step": 12630 }, { "epoch": 0.7420453211224609, "grad_norm": 1.2330464124679565, "learning_rate": 6.997683839446392e-05, "loss": 0.0476, "step": 12640 }, { "epoch": 0.742632382294235, "grad_norm": 4.299368381500244, "learning_rate": 6.993443772387284e-05, "loss": 0.0646, "step": 12650 }, { "epoch": 0.7432194434660092, "grad_norm": 2.379664182662964, "learning_rate": 6.989202000173614e-05, "loss": 0.0669, "step": 12660 }, { "epoch": 0.7438065046377833, "grad_norm": 2.5982563495635986, "learning_rate": 6.984958526433716e-05, "loss": 0.0327, "step": 12670 }, { "epoch": 0.7443935658095574, "grad_norm": 1.7356817722320557, "learning_rate": 6.980713354797376e-05, "loss": 0.0384, "step": 12680 }, { "epoch": 0.7449806269813315, "grad_norm": 2.0834689140319824, "learning_rate": 6.97646648889584e-05, "loss": 0.0469, "step": 12690 }, { "epoch": 0.7455676881531056, "grad_norm": 1.8051512241363525, "learning_rate": 6.972217932361792e-05, "loss": 0.0677, "step": 12700 }, { "epoch": 0.7461547493248797, "grad_norm": 3.258486270904541, "learning_rate": 6.967967688829369e-05, "loss": 0.047, "step": 12710 }, { "epoch": 0.7467418104966538, "grad_norm": 3.2901456356048584, "learning_rate": 6.963715761934151e-05, "loss": 0.0778, "step": 12720 }, { "epoch": 0.7473288716684279, "grad_norm": 7.174700736999512, "learning_rate": 6.959462155313155e-05, "loss": 0.056, "step": 12730 }, { "epoch": 0.747915932840202, "grad_norm": 0.7426133155822754, "learning_rate": 6.955206872604839e-05, "loss": 0.0366, "step": 12740 }, { "epoch": 0.7485029940119761, "grad_norm": 1.182964563369751, "learning_rate": 6.950949917449093e-05, "loss": 0.0584, "step": 12750 }, { "epoch": 0.7490900551837502, "grad_norm": 1.5630015134811401, "learning_rate": 6.946691293487233e-05, "loss": 0.0328, "step": 12760 }, { "epoch": 0.7496771163555243, "grad_norm": 2.925347328186035, "learning_rate": 6.94243100436201e-05, "loss": 0.0491, "step": 12770 }, { "epoch": 0.7502641775272983, "grad_norm": 2.0041565895080566, "learning_rate": 6.938169053717593e-05, "loss": 0.0448, "step": 12780 }, { "epoch": 0.7508512386990724, "grad_norm": 1.82222580909729, "learning_rate": 6.933905445199578e-05, "loss": 0.0428, "step": 12790 }, { "epoch": 0.7514382998708465, "grad_norm": 3.978710651397705, "learning_rate": 6.929640182454973e-05, "loss": 0.048, "step": 12800 }, { "epoch": 0.7520253610426206, "grad_norm": 0.5901386141777039, "learning_rate": 6.925373269132207e-05, "loss": 0.0366, "step": 12810 }, { "epoch": 0.7526124222143947, "grad_norm": 3.8027117252349854, "learning_rate": 6.921104708881115e-05, "loss": 0.0721, "step": 12820 }, { "epoch": 0.7531994833861688, "grad_norm": 1.8794770240783691, "learning_rate": 6.916834505352945e-05, "loss": 0.066, "step": 12830 }, { "epoch": 0.7537865445579429, "grad_norm": 2.6269638538360596, "learning_rate": 6.91256266220035e-05, "loss": 0.0395, "step": 12840 }, { "epoch": 0.754373605729717, "grad_norm": 1.8135740756988525, "learning_rate": 6.908289183077385e-05, "loss": 0.0599, "step": 12850 }, { "epoch": 0.7549606669014911, "grad_norm": 2.1418561935424805, "learning_rate": 6.904014071639503e-05, "loss": 0.0594, "step": 12860 }, { "epoch": 0.7555477280732652, "grad_norm": 1.0812987089157104, "learning_rate": 6.899737331543555e-05, "loss": 0.041, "step": 12870 }, { "epoch": 0.7561347892450393, "grad_norm": 1.3494699001312256, "learning_rate": 6.895458966447784e-05, "loss": 0.0352, "step": 12880 }, { "epoch": 0.7567218504168134, "grad_norm": 1.3893370628356934, "learning_rate": 6.891178980011826e-05, "loss": 0.0208, "step": 12890 }, { "epoch": 0.7573089115885875, "grad_norm": 3.462141513824463, "learning_rate": 6.886897375896697e-05, "loss": 0.0774, "step": 12900 }, { "epoch": 0.7578959727603616, "grad_norm": 2.109053373336792, "learning_rate": 6.882614157764804e-05, "loss": 0.05, "step": 12910 }, { "epoch": 0.7584830339321357, "grad_norm": 1.5835174322128296, "learning_rate": 6.878329329279933e-05, "loss": 0.0285, "step": 12920 }, { "epoch": 0.7590700951039099, "grad_norm": 1.2065250873565674, "learning_rate": 6.874042894107245e-05, "loss": 0.0398, "step": 12930 }, { "epoch": 0.759657156275684, "grad_norm": 0.4219166040420532, "learning_rate": 6.869754855913273e-05, "loss": 0.0517, "step": 12940 }, { "epoch": 0.7602442174474581, "grad_norm": 0.454787939786911, "learning_rate": 6.86546521836593e-05, "loss": 0.0241, "step": 12950 }, { "epoch": 0.7608312786192322, "grad_norm": 3.6437501907348633, "learning_rate": 6.86117398513449e-05, "loss": 0.0417, "step": 12960 }, { "epoch": 0.7614183397910063, "grad_norm": 1.5840270519256592, "learning_rate": 6.856881159889593e-05, "loss": 0.0448, "step": 12970 }, { "epoch": 0.7620054009627804, "grad_norm": 2.22502064704895, "learning_rate": 6.852586746303243e-05, "loss": 0.0682, "step": 12980 }, { "epoch": 0.7625924621345544, "grad_norm": 4.858117580413818, "learning_rate": 6.848290748048801e-05, "loss": 0.0266, "step": 12990 }, { "epoch": 0.7631795233063285, "grad_norm": 0.256031334400177, "learning_rate": 6.843993168800982e-05, "loss": 0.0304, "step": 13000 }, { "epoch": 0.7637665844781026, "grad_norm": 1.634435772895813, "learning_rate": 6.839694012235856e-05, "loss": 0.0547, "step": 13010 }, { "epoch": 0.7643536456498767, "grad_norm": 1.0915956497192383, "learning_rate": 6.835393282030841e-05, "loss": 0.0386, "step": 13020 }, { "epoch": 0.7649407068216508, "grad_norm": 2.0468428134918213, "learning_rate": 6.8310909818647e-05, "loss": 0.0489, "step": 13030 }, { "epoch": 0.7655277679934249, "grad_norm": 1.7956228256225586, "learning_rate": 6.826787115417544e-05, "loss": 0.0305, "step": 13040 }, { "epoch": 0.766114829165199, "grad_norm": 1.8376433849334717, "learning_rate": 6.822481686370815e-05, "loss": 0.0711, "step": 13050 }, { "epoch": 0.7667018903369731, "grad_norm": 3.314697742462158, "learning_rate": 6.818174698407302e-05, "loss": 0.0629, "step": 13060 }, { "epoch": 0.7672889515087472, "grad_norm": 4.7288336753845215, "learning_rate": 6.813866155211118e-05, "loss": 0.041, "step": 13070 }, { "epoch": 0.7678760126805213, "grad_norm": 0.9822351932525635, "learning_rate": 6.80955606046771e-05, "loss": 0.0423, "step": 13080 }, { "epoch": 0.7684630738522954, "grad_norm": 2.40872859954834, "learning_rate": 6.805244417863854e-05, "loss": 0.032, "step": 13090 }, { "epoch": 0.7690501350240695, "grad_norm": 2.8423330783843994, "learning_rate": 6.80093123108765e-05, "loss": 0.0409, "step": 13100 }, { "epoch": 0.7696371961958436, "grad_norm": 2.7918121814727783, "learning_rate": 6.796616503828515e-05, "loss": 0.0704, "step": 13110 }, { "epoch": 0.7702242573676177, "grad_norm": 1.3258646726608276, "learning_rate": 6.79230023977719e-05, "loss": 0.0399, "step": 13120 }, { "epoch": 0.7708113185393918, "grad_norm": 3.8489837646484375, "learning_rate": 6.787982442625721e-05, "loss": 0.0455, "step": 13130 }, { "epoch": 0.7713983797111659, "grad_norm": 1.065025806427002, "learning_rate": 6.783663116067473e-05, "loss": 0.0424, "step": 13140 }, { "epoch": 0.77198544088294, "grad_norm": 4.975100040435791, "learning_rate": 6.779342263797119e-05, "loss": 0.081, "step": 13150 }, { "epoch": 0.7725725020547141, "grad_norm": 0.7484115958213806, "learning_rate": 6.775019889510635e-05, "loss": 0.0469, "step": 13160 }, { "epoch": 0.7731595632264882, "grad_norm": 2.914372682571411, "learning_rate": 6.770695996905297e-05, "loss": 0.0295, "step": 13170 }, { "epoch": 0.7737466243982623, "grad_norm": 0.0399160273373127, "learning_rate": 6.766370589679685e-05, "loss": 0.0417, "step": 13180 }, { "epoch": 0.7743336855700363, "grad_norm": 1.3361228704452515, "learning_rate": 6.762043671533668e-05, "loss": 0.0442, "step": 13190 }, { "epoch": 0.7749207467418106, "grad_norm": 1.0522292852401733, "learning_rate": 6.757715246168414e-05, "loss": 0.0368, "step": 13200 }, { "epoch": 0.7755078079135846, "grad_norm": 3.435419797897339, "learning_rate": 6.753385317286377e-05, "loss": 0.0349, "step": 13210 }, { "epoch": 0.7760948690853587, "grad_norm": 1.6181929111480713, "learning_rate": 6.749053888591295e-05, "loss": 0.0458, "step": 13220 }, { "epoch": 0.7766819302571328, "grad_norm": 3.1003963947296143, "learning_rate": 6.744720963788193e-05, "loss": 0.0443, "step": 13230 }, { "epoch": 0.7772689914289069, "grad_norm": 2.7116281986236572, "learning_rate": 6.740386546583373e-05, "loss": 0.0791, "step": 13240 }, { "epoch": 0.777856052600681, "grad_norm": 2.30363392829895, "learning_rate": 6.736050640684416e-05, "loss": 0.0414, "step": 13250 }, { "epoch": 0.7784431137724551, "grad_norm": 3.3929340839385986, "learning_rate": 6.731713249800173e-05, "loss": 0.0737, "step": 13260 }, { "epoch": 0.7790301749442292, "grad_norm": 0.6389382481575012, "learning_rate": 6.727374377640768e-05, "loss": 0.0406, "step": 13270 }, { "epoch": 0.7796172361160033, "grad_norm": 1.625324010848999, "learning_rate": 6.723034027917592e-05, "loss": 0.043, "step": 13280 }, { "epoch": 0.7802042972877774, "grad_norm": 2.860623836517334, "learning_rate": 6.718692204343298e-05, "loss": 0.049, "step": 13290 }, { "epoch": 0.7807913584595515, "grad_norm": 3.2424652576446533, "learning_rate": 6.7143489106318e-05, "loss": 0.0368, "step": 13300 }, { "epoch": 0.7813784196313256, "grad_norm": 3.919355630874634, "learning_rate": 6.710004150498271e-05, "loss": 0.0252, "step": 13310 }, { "epoch": 0.7819654808030997, "grad_norm": 2.217737913131714, "learning_rate": 6.70565792765914e-05, "loss": 0.0431, "step": 13320 }, { "epoch": 0.7825525419748738, "grad_norm": 6.595785140991211, "learning_rate": 6.701310245832082e-05, "loss": 0.0399, "step": 13330 }, { "epoch": 0.7831396031466479, "grad_norm": 1.0565236806869507, "learning_rate": 6.696961108736024e-05, "loss": 0.0327, "step": 13340 }, { "epoch": 0.783726664318422, "grad_norm": 3.037895917892456, "learning_rate": 6.692610520091137e-05, "loss": 0.0239, "step": 13350 }, { "epoch": 0.7843137254901961, "grad_norm": 1.9817838668823242, "learning_rate": 6.68825848361883e-05, "loss": 0.049, "step": 13360 }, { "epoch": 0.7849007866619702, "grad_norm": 3.1776561737060547, "learning_rate": 6.683905003041757e-05, "loss": 0.0526, "step": 13370 }, { "epoch": 0.7854878478337443, "grad_norm": 1.737399935722351, "learning_rate": 6.679550082083803e-05, "loss": 0.0441, "step": 13380 }, { "epoch": 0.7860749090055184, "grad_norm": 0.6524925827980042, "learning_rate": 6.675193724470087e-05, "loss": 0.0284, "step": 13390 }, { "epoch": 0.7866619701772924, "grad_norm": 1.5588257312774658, "learning_rate": 6.670835933926955e-05, "loss": 0.0635, "step": 13400 }, { "epoch": 0.7872490313490665, "grad_norm": 0.8149248957633972, "learning_rate": 6.666476714181979e-05, "loss": 0.0248, "step": 13410 }, { "epoch": 0.7878360925208406, "grad_norm": 0.6885552406311035, "learning_rate": 6.662116068963954e-05, "loss": 0.0793, "step": 13420 }, { "epoch": 0.7884231536926147, "grad_norm": 3.004079818725586, "learning_rate": 6.657754002002898e-05, "loss": 0.043, "step": 13430 }, { "epoch": 0.7890102148643888, "grad_norm": 2.774564266204834, "learning_rate": 6.653390517030038e-05, "loss": 0.0552, "step": 13440 }, { "epoch": 0.7895972760361629, "grad_norm": 3.0677783489227295, "learning_rate": 6.649025617777818e-05, "loss": 0.0485, "step": 13450 }, { "epoch": 0.790184337207937, "grad_norm": 3.7828519344329834, "learning_rate": 6.64465930797989e-05, "loss": 0.0418, "step": 13460 }, { "epoch": 0.7907713983797112, "grad_norm": 1.3428243398666382, "learning_rate": 6.640291591371117e-05, "loss": 0.0468, "step": 13470 }, { "epoch": 0.7913584595514853, "grad_norm": 2.8900206089019775, "learning_rate": 6.635922471687561e-05, "loss": 0.0437, "step": 13480 }, { "epoch": 0.7919455207232594, "grad_norm": 4.2262372970581055, "learning_rate": 6.631551952666484e-05, "loss": 0.0893, "step": 13490 }, { "epoch": 0.7925325818950335, "grad_norm": 2.866955041885376, "learning_rate": 6.627180038046347e-05, "loss": 0.0211, "step": 13500 }, { "epoch": 0.7931196430668076, "grad_norm": 0.8401691913604736, "learning_rate": 6.622806731566807e-05, "loss": 0.048, "step": 13510 }, { "epoch": 0.7937067042385817, "grad_norm": 1.6271196603775024, "learning_rate": 6.618432036968705e-05, "loss": 0.0453, "step": 13520 }, { "epoch": 0.7942937654103558, "grad_norm": 1.3949114084243774, "learning_rate": 6.614055957994075e-05, "loss": 0.0337, "step": 13530 }, { "epoch": 0.7948808265821299, "grad_norm": 1.1710011959075928, "learning_rate": 6.60967849838613e-05, "loss": 0.0443, "step": 13540 }, { "epoch": 0.795467887753904, "grad_norm": 1.4405739307403564, "learning_rate": 6.60529966188927e-05, "loss": 0.0479, "step": 13550 }, { "epoch": 0.7960549489256781, "grad_norm": 3.004775047302246, "learning_rate": 6.60091945224907e-05, "loss": 0.0574, "step": 13560 }, { "epoch": 0.7966420100974522, "grad_norm": 2.5304245948791504, "learning_rate": 6.596537873212281e-05, "loss": 0.0534, "step": 13570 }, { "epoch": 0.7972290712692263, "grad_norm": 0.9503095746040344, "learning_rate": 6.592154928526818e-05, "loss": 0.0447, "step": 13580 }, { "epoch": 0.7978161324410004, "grad_norm": 2.029606580734253, "learning_rate": 6.587770621941776e-05, "loss": 0.0525, "step": 13590 }, { "epoch": 0.7984031936127745, "grad_norm": 1.882787823677063, "learning_rate": 6.583384957207406e-05, "loss": 0.0258, "step": 13600 }, { "epoch": 0.7989902547845485, "grad_norm": 5.355138301849365, "learning_rate": 6.578997938075125e-05, "loss": 0.0225, "step": 13610 }, { "epoch": 0.7995773159563226, "grad_norm": 0.6417304277420044, "learning_rate": 6.574609568297507e-05, "loss": 0.0413, "step": 13620 }, { "epoch": 0.8001643771280967, "grad_norm": 0.25775960087776184, "learning_rate": 6.57021985162828e-05, "loss": 0.0562, "step": 13630 }, { "epoch": 0.8007514382998708, "grad_norm": 1.2497048377990723, "learning_rate": 6.565828791822327e-05, "loss": 0.063, "step": 13640 }, { "epoch": 0.8013384994716449, "grad_norm": 2.7102062702178955, "learning_rate": 6.56143639263568e-05, "loss": 0.0403, "step": 13650 }, { "epoch": 0.801925560643419, "grad_norm": 2.428157329559326, "learning_rate": 6.557042657825511e-05, "loss": 0.0729, "step": 13660 }, { "epoch": 0.8025126218151931, "grad_norm": 1.3086574077606201, "learning_rate": 6.552647591150143e-05, "loss": 0.0348, "step": 13670 }, { "epoch": 0.8030996829869672, "grad_norm": 1.1941686868667603, "learning_rate": 6.548251196369031e-05, "loss": 0.0383, "step": 13680 }, { "epoch": 0.8036867441587413, "grad_norm": 0.7203810811042786, "learning_rate": 6.54385347724277e-05, "loss": 0.0496, "step": 13690 }, { "epoch": 0.8042738053305154, "grad_norm": 1.3461620807647705, "learning_rate": 6.539454437533088e-05, "loss": 0.0371, "step": 13700 }, { "epoch": 0.8048608665022895, "grad_norm": 4.824581146240234, "learning_rate": 6.535054081002841e-05, "loss": 0.0611, "step": 13710 }, { "epoch": 0.8054479276740636, "grad_norm": 0.8641654849052429, "learning_rate": 6.530652411416007e-05, "loss": 0.0274, "step": 13720 }, { "epoch": 0.8060349888458377, "grad_norm": 1.9644825458526611, "learning_rate": 6.5262494325377e-05, "loss": 0.055, "step": 13730 }, { "epoch": 0.8066220500176118, "grad_norm": 1.6513824462890625, "learning_rate": 6.52184514813414e-05, "loss": 0.0341, "step": 13740 }, { "epoch": 0.807209111189386, "grad_norm": 0.9873337149620056, "learning_rate": 6.517439561972671e-05, "loss": 0.0379, "step": 13750 }, { "epoch": 0.8077961723611601, "grad_norm": 2.502126455307007, "learning_rate": 6.513032677821752e-05, "loss": 0.0779, "step": 13760 }, { "epoch": 0.8083832335329342, "grad_norm": 1.3818060159683228, "learning_rate": 6.508624499450944e-05, "loss": 0.0412, "step": 13770 }, { "epoch": 0.8089702947047083, "grad_norm": 1.796377182006836, "learning_rate": 6.504215030630925e-05, "loss": 0.0361, "step": 13780 }, { "epoch": 0.8095573558764824, "grad_norm": 3.33707332611084, "learning_rate": 6.49980427513347e-05, "loss": 0.0225, "step": 13790 }, { "epoch": 0.8101444170482565, "grad_norm": 4.678584098815918, "learning_rate": 6.495392236731458e-05, "loss": 0.0879, "step": 13800 }, { "epoch": 0.8107314782200306, "grad_norm": 2.5326459407806396, "learning_rate": 6.490978919198863e-05, "loss": 0.0632, "step": 13810 }, { "epoch": 0.8113185393918046, "grad_norm": 0.7089422345161438, "learning_rate": 6.486564326310754e-05, "loss": 0.031, "step": 13820 }, { "epoch": 0.8119056005635787, "grad_norm": 2.3819382190704346, "learning_rate": 6.482148461843294e-05, "loss": 0.048, "step": 13830 }, { "epoch": 0.8124926617353528, "grad_norm": 4.321537017822266, "learning_rate": 6.477731329573729e-05, "loss": 0.0768, "step": 13840 }, { "epoch": 0.8130797229071269, "grad_norm": 1.1775785684585571, "learning_rate": 6.473312933280391e-05, "loss": 0.0291, "step": 13850 }, { "epoch": 0.813666784078901, "grad_norm": 0.6265810132026672, "learning_rate": 6.468893276742695e-05, "loss": 0.0347, "step": 13860 }, { "epoch": 0.8142538452506751, "grad_norm": 1.668916940689087, "learning_rate": 6.464472363741132e-05, "loss": 0.0511, "step": 13870 }, { "epoch": 0.8148409064224492, "grad_norm": 2.267259359359741, "learning_rate": 6.460050198057268e-05, "loss": 0.0497, "step": 13880 }, { "epoch": 0.8154279675942233, "grad_norm": 2.395707607269287, "learning_rate": 6.45562678347374e-05, "loss": 0.036, "step": 13890 }, { "epoch": 0.8160150287659974, "grad_norm": 1.61038076877594, "learning_rate": 6.451202123774258e-05, "loss": 0.0418, "step": 13900 }, { "epoch": 0.8166020899377715, "grad_norm": 0.8160961270332336, "learning_rate": 6.446776222743589e-05, "loss": 0.0668, "step": 13910 }, { "epoch": 0.8171891511095456, "grad_norm": 1.6493704319000244, "learning_rate": 6.442349084167568e-05, "loss": 0.0401, "step": 13920 }, { "epoch": 0.8177762122813197, "grad_norm": 3.1336772441864014, "learning_rate": 6.437920711833086e-05, "loss": 0.0514, "step": 13930 }, { "epoch": 0.8183632734530938, "grad_norm": 1.4115411043167114, "learning_rate": 6.433491109528091e-05, "loss": 0.038, "step": 13940 }, { "epoch": 0.8189503346248679, "grad_norm": 1.697857141494751, "learning_rate": 6.429060281041581e-05, "loss": 0.0509, "step": 13950 }, { "epoch": 0.819537395796642, "grad_norm": 3.210850477218628, "learning_rate": 6.424628230163606e-05, "loss": 0.0271, "step": 13960 }, { "epoch": 0.8201244569684161, "grad_norm": 0.7086080312728882, "learning_rate": 6.420194960685255e-05, "loss": 0.0367, "step": 13970 }, { "epoch": 0.8207115181401902, "grad_norm": 2.7358102798461914, "learning_rate": 6.41576047639867e-05, "loss": 0.0795, "step": 13980 }, { "epoch": 0.8212985793119643, "grad_norm": 2.733741044998169, "learning_rate": 6.41132478109702e-05, "loss": 0.0378, "step": 13990 }, { "epoch": 0.8218856404837384, "grad_norm": 0.9385964870452881, "learning_rate": 6.406887878574519e-05, "loss": 0.0353, "step": 14000 }, { "epoch": 0.8224727016555125, "grad_norm": 2.7095303535461426, "learning_rate": 6.402449772626412e-05, "loss": 0.0736, "step": 14010 }, { "epoch": 0.8230597628272867, "grad_norm": 2.0674962997436523, "learning_rate": 6.398010467048968e-05, "loss": 0.06, "step": 14020 }, { "epoch": 0.8236468239990608, "grad_norm": 1.4313205480575562, "learning_rate": 6.39356996563949e-05, "loss": 0.0227, "step": 14030 }, { "epoch": 0.8242338851708348, "grad_norm": 2.533048629760742, "learning_rate": 6.389128272196296e-05, "loss": 0.0497, "step": 14040 }, { "epoch": 0.8248209463426089, "grad_norm": 2.874424695968628, "learning_rate": 6.38468539051873e-05, "loss": 0.0377, "step": 14050 }, { "epoch": 0.825408007514383, "grad_norm": 2.4855847358703613, "learning_rate": 6.38024132440715e-05, "loss": 0.0458, "step": 14060 }, { "epoch": 0.8259950686861571, "grad_norm": 2.164066791534424, "learning_rate": 6.375796077662928e-05, "loss": 0.0278, "step": 14070 }, { "epoch": 0.8265821298579312, "grad_norm": 1.594590187072754, "learning_rate": 6.371349654088442e-05, "loss": 0.0563, "step": 14080 }, { "epoch": 0.8271691910297053, "grad_norm": 0.6808485388755798, "learning_rate": 6.366902057487083e-05, "loss": 0.0326, "step": 14090 }, { "epoch": 0.8277562522014794, "grad_norm": 4.297865867614746, "learning_rate": 6.36245329166324e-05, "loss": 0.0537, "step": 14100 }, { "epoch": 0.8283433133732535, "grad_norm": 3.141972303390503, "learning_rate": 6.358003360422304e-05, "loss": 0.0663, "step": 14110 }, { "epoch": 0.8289303745450276, "grad_norm": 1.9593634605407715, "learning_rate": 6.353552267570666e-05, "loss": 0.0558, "step": 14120 }, { "epoch": 0.8295174357168017, "grad_norm": 2.1373162269592285, "learning_rate": 6.349100016915703e-05, "loss": 0.0325, "step": 14130 }, { "epoch": 0.8301044968885758, "grad_norm": 3.5958569049835205, "learning_rate": 6.34464661226579e-05, "loss": 0.0589, "step": 14140 }, { "epoch": 0.8306915580603499, "grad_norm": 3.618833541870117, "learning_rate": 6.340192057430286e-05, "loss": 0.0363, "step": 14150 }, { "epoch": 0.831278619232124, "grad_norm": 1.9822224378585815, "learning_rate": 6.335736356219533e-05, "loss": 0.0396, "step": 14160 }, { "epoch": 0.8318656804038981, "grad_norm": 1.5001624822616577, "learning_rate": 6.331279512444855e-05, "loss": 0.0262, "step": 14170 }, { "epoch": 0.8324527415756722, "grad_norm": 1.801331639289856, "learning_rate": 6.326821529918553e-05, "loss": 0.0179, "step": 14180 }, { "epoch": 0.8330398027474463, "grad_norm": 2.5991337299346924, "learning_rate": 6.322362412453903e-05, "loss": 0.0414, "step": 14190 }, { "epoch": 0.8336268639192204, "grad_norm": 2.08478045463562, "learning_rate": 6.31790216386515e-05, "loss": 0.0524, "step": 14200 }, { "epoch": 0.8342139250909945, "grad_norm": 0.5593706369400024, "learning_rate": 6.313440787967506e-05, "loss": 0.0208, "step": 14210 }, { "epoch": 0.8348009862627686, "grad_norm": 3.2073302268981934, "learning_rate": 6.30897828857715e-05, "loss": 0.0308, "step": 14220 }, { "epoch": 0.8353880474345426, "grad_norm": 2.968762159347534, "learning_rate": 6.30451466951122e-05, "loss": 0.0235, "step": 14230 }, { "epoch": 0.8359751086063167, "grad_norm": 2.3763837814331055, "learning_rate": 6.300049934587812e-05, "loss": 0.0263, "step": 14240 }, { "epoch": 0.8365621697780908, "grad_norm": 1.7677310705184937, "learning_rate": 6.295584087625979e-05, "loss": 0.0459, "step": 14250 }, { "epoch": 0.8371492309498649, "grad_norm": 3.6045050621032715, "learning_rate": 6.291117132445722e-05, "loss": 0.0526, "step": 14260 }, { "epoch": 0.837736292121639, "grad_norm": 1.805864930152893, "learning_rate": 6.286649072867988e-05, "loss": 0.042, "step": 14270 }, { "epoch": 0.8383233532934131, "grad_norm": 2.14461088180542, "learning_rate": 6.282179912714677e-05, "loss": 0.0743, "step": 14280 }, { "epoch": 0.8389104144651873, "grad_norm": 3.040163993835449, "learning_rate": 6.277709655808622e-05, "loss": 0.0544, "step": 14290 }, { "epoch": 0.8394974756369614, "grad_norm": 3.3869519233703613, "learning_rate": 6.273238305973596e-05, "loss": 0.0436, "step": 14300 }, { "epoch": 0.8400845368087355, "grad_norm": 1.099915623664856, "learning_rate": 6.268765867034311e-05, "loss": 0.0608, "step": 14310 }, { "epoch": 0.8406715979805096, "grad_norm": 2.5395612716674805, "learning_rate": 6.264292342816407e-05, "loss": 0.0503, "step": 14320 }, { "epoch": 0.8412586591522837, "grad_norm": 1.3137449026107788, "learning_rate": 6.25981773714645e-05, "loss": 0.0483, "step": 14330 }, { "epoch": 0.8418457203240578, "grad_norm": 0.4606730341911316, "learning_rate": 6.255342053851938e-05, "loss": 0.0428, "step": 14340 }, { "epoch": 0.8424327814958319, "grad_norm": 1.6700111627578735, "learning_rate": 6.250865296761286e-05, "loss": 0.0399, "step": 14350 }, { "epoch": 0.843019842667606, "grad_norm": 2.4387354850769043, "learning_rate": 6.246387469703826e-05, "loss": 0.0672, "step": 14360 }, { "epoch": 0.8436069038393801, "grad_norm": 2.6700453758239746, "learning_rate": 6.241908576509812e-05, "loss": 0.0395, "step": 14370 }, { "epoch": 0.8441939650111542, "grad_norm": 2.5601587295532227, "learning_rate": 6.237428621010402e-05, "loss": 0.0535, "step": 14380 }, { "epoch": 0.8447810261829283, "grad_norm": 2.6522321701049805, "learning_rate": 6.232947607037666e-05, "loss": 0.039, "step": 14390 }, { "epoch": 0.8453680873547024, "grad_norm": 1.8198210000991821, "learning_rate": 6.228465538424583e-05, "loss": 0.0491, "step": 14400 }, { "epoch": 0.8459551485264765, "grad_norm": 3.20634388923645, "learning_rate": 6.223982419005027e-05, "loss": 0.0376, "step": 14410 }, { "epoch": 0.8465422096982506, "grad_norm": 0.9358623027801514, "learning_rate": 6.219498252613777e-05, "loss": 0.035, "step": 14420 }, { "epoch": 0.8471292708700247, "grad_norm": 3.459280014038086, "learning_rate": 6.215013043086504e-05, "loss": 0.0482, "step": 14430 }, { "epoch": 0.8477163320417987, "grad_norm": 1.3764972686767578, "learning_rate": 6.210526794259772e-05, "loss": 0.0482, "step": 14440 }, { "epoch": 0.8483033932135728, "grad_norm": 1.696393609046936, "learning_rate": 6.206039509971038e-05, "loss": 0.0322, "step": 14450 }, { "epoch": 0.8488904543853469, "grad_norm": 2.162684202194214, "learning_rate": 6.201551194058637e-05, "loss": 0.0415, "step": 14460 }, { "epoch": 0.849477515557121, "grad_norm": 0.39044174551963806, "learning_rate": 6.19706185036179e-05, "loss": 0.0551, "step": 14470 }, { "epoch": 0.8500645767288951, "grad_norm": 0.6007933616638184, "learning_rate": 6.192571482720601e-05, "loss": 0.0465, "step": 14480 }, { "epoch": 0.8506516379006692, "grad_norm": 2.2173054218292236, "learning_rate": 6.188080094976046e-05, "loss": 0.0406, "step": 14490 }, { "epoch": 0.8512386990724433, "grad_norm": 1.6457314491271973, "learning_rate": 6.183587690969974e-05, "loss": 0.0239, "step": 14500 }, { "epoch": 0.8518257602442174, "grad_norm": 0.3718281090259552, "learning_rate": 6.179094274545102e-05, "loss": 0.0467, "step": 14510 }, { "epoch": 0.8524128214159915, "grad_norm": 0.7005079388618469, "learning_rate": 6.174599849545015e-05, "loss": 0.0446, "step": 14520 }, { "epoch": 0.8529998825877656, "grad_norm": 2.837313652038574, "learning_rate": 6.170104419814162e-05, "loss": 0.0551, "step": 14530 }, { "epoch": 0.8535869437595397, "grad_norm": 1.405055046081543, "learning_rate": 6.165607989197847e-05, "loss": 0.0164, "step": 14540 }, { "epoch": 0.8541740049313138, "grad_norm": 2.049793243408203, "learning_rate": 6.161110561542235e-05, "loss": 0.038, "step": 14550 }, { "epoch": 0.854761066103088, "grad_norm": 2.076131582260132, "learning_rate": 6.156612140694339e-05, "loss": 0.0253, "step": 14560 }, { "epoch": 0.8553481272748621, "grad_norm": 1.3480819463729858, "learning_rate": 6.152112730502027e-05, "loss": 0.0466, "step": 14570 }, { "epoch": 0.8559351884466362, "grad_norm": 1.8923150300979614, "learning_rate": 6.147612334814008e-05, "loss": 0.0422, "step": 14580 }, { "epoch": 0.8565222496184103, "grad_norm": 1.127937912940979, "learning_rate": 6.143110957479839e-05, "loss": 0.0613, "step": 14590 }, { "epoch": 0.8571093107901844, "grad_norm": 1.5263972282409668, "learning_rate": 6.138608602349911e-05, "loss": 0.0564, "step": 14600 }, { "epoch": 0.8576963719619585, "grad_norm": 1.5901850461959839, "learning_rate": 6.134105273275457e-05, "loss": 0.0257, "step": 14610 }, { "epoch": 0.8582834331337326, "grad_norm": 1.3203537464141846, "learning_rate": 6.129600974108538e-05, "loss": 0.0313, "step": 14620 }, { "epoch": 0.8588704943055067, "grad_norm": 1.7751020193099976, "learning_rate": 6.125095708702052e-05, "loss": 0.0322, "step": 14630 }, { "epoch": 0.8594575554772808, "grad_norm": 3.6982996463775635, "learning_rate": 6.120589480909715e-05, "loss": 0.0394, "step": 14640 }, { "epoch": 0.8600446166490548, "grad_norm": 2.0997314453125, "learning_rate": 6.116082294586068e-05, "loss": 0.0485, "step": 14650 }, { "epoch": 0.8606316778208289, "grad_norm": 2.7278316020965576, "learning_rate": 6.11157415358648e-05, "loss": 0.0426, "step": 14660 }, { "epoch": 0.861218738992603, "grad_norm": 2.1180830001831055, "learning_rate": 6.107065061767127e-05, "loss": 0.022, "step": 14670 }, { "epoch": 0.8618058001643771, "grad_norm": 0.46879950165748596, "learning_rate": 6.1025550229850004e-05, "loss": 0.0513, "step": 14680 }, { "epoch": 0.8623928613361512, "grad_norm": 2.1595799922943115, "learning_rate": 6.098044041097907e-05, "loss": 0.0645, "step": 14690 }, { "epoch": 0.8629799225079253, "grad_norm": 2.693690061569214, "learning_rate": 6.0935321199644544e-05, "loss": 0.0573, "step": 14700 }, { "epoch": 0.8635669836796994, "grad_norm": 1.1986830234527588, "learning_rate": 6.0890192634440546e-05, "loss": 0.0305, "step": 14710 }, { "epoch": 0.8641540448514735, "grad_norm": 5.476376533508301, "learning_rate": 6.084505475396923e-05, "loss": 0.0507, "step": 14720 }, { "epoch": 0.8647411060232476, "grad_norm": 0.6578884720802307, "learning_rate": 6.079990759684068e-05, "loss": 0.0358, "step": 14730 }, { "epoch": 0.8653281671950217, "grad_norm": 7.568542003631592, "learning_rate": 6.075475120167293e-05, "loss": 0.0643, "step": 14740 }, { "epoch": 0.8659152283667958, "grad_norm": 2.21598482131958, "learning_rate": 6.070958560709194e-05, "loss": 0.0483, "step": 14750 }, { "epoch": 0.8665022895385699, "grad_norm": 0.31963619589805603, "learning_rate": 6.066441085173149e-05, "loss": 0.0254, "step": 14760 }, { "epoch": 0.867089350710344, "grad_norm": 2.6377768516540527, "learning_rate": 6.061922697423322e-05, "loss": 0.0329, "step": 14770 }, { "epoch": 0.8676764118821181, "grad_norm": 1.1218945980072021, "learning_rate": 6.057403401324659e-05, "loss": 0.047, "step": 14780 }, { "epoch": 0.8682634730538922, "grad_norm": 2.0679190158843994, "learning_rate": 6.052883200742883e-05, "loss": 0.0314, "step": 14790 }, { "epoch": 0.8688505342256663, "grad_norm": 3.9840517044067383, "learning_rate": 6.0483620995444835e-05, "loss": 0.0432, "step": 14800 }, { "epoch": 0.8694375953974404, "grad_norm": 3.364485502243042, "learning_rate": 6.043840101596731e-05, "loss": 0.0478, "step": 14810 }, { "epoch": 0.8700246565692145, "grad_norm": 3.0178751945495605, "learning_rate": 6.0393172107676576e-05, "loss": 0.0336, "step": 14820 }, { "epoch": 0.8706117177409887, "grad_norm": 2.6065449714660645, "learning_rate": 6.034793430926058e-05, "loss": 0.0677, "step": 14830 }, { "epoch": 0.8711987789127628, "grad_norm": 2.52195143699646, "learning_rate": 6.0302687659414904e-05, "loss": 0.0931, "step": 14840 }, { "epoch": 0.8717858400845369, "grad_norm": 0.792038083076477, "learning_rate": 6.025743219684267e-05, "loss": 0.0619, "step": 14850 }, { "epoch": 0.872372901256311, "grad_norm": 1.2015186548233032, "learning_rate": 6.021216796025456e-05, "loss": 0.036, "step": 14860 }, { "epoch": 0.872959962428085, "grad_norm": 3.2608089447021484, "learning_rate": 6.016689498836877e-05, "loss": 0.043, "step": 14870 }, { "epoch": 0.8735470235998591, "grad_norm": 1.4122695922851562, "learning_rate": 6.012161331991093e-05, "loss": 0.0475, "step": 14880 }, { "epoch": 0.8741340847716332, "grad_norm": 3.4364013671875, "learning_rate": 6.007632299361417e-05, "loss": 0.0325, "step": 14890 }, { "epoch": 0.8747211459434073, "grad_norm": 1.4639275074005127, "learning_rate": 6.003102404821895e-05, "loss": 0.0238, "step": 14900 }, { "epoch": 0.8753082071151814, "grad_norm": 2.4071452617645264, "learning_rate": 5.9985716522473166e-05, "loss": 0.0387, "step": 14910 }, { "epoch": 0.8758952682869555, "grad_norm": 1.1663764715194702, "learning_rate": 5.9940400455132025e-05, "loss": 0.0205, "step": 14920 }, { "epoch": 0.8764823294587296, "grad_norm": 0.5613025426864624, "learning_rate": 5.989507588495804e-05, "loss": 0.034, "step": 14930 }, { "epoch": 0.8770693906305037, "grad_norm": 0.57170569896698, "learning_rate": 5.984974285072099e-05, "loss": 0.0287, "step": 14940 }, { "epoch": 0.8776564518022778, "grad_norm": 1.890034794807434, "learning_rate": 5.980440139119794e-05, "loss": 0.0244, "step": 14950 }, { "epoch": 0.8782435129740519, "grad_norm": 1.29929780960083, "learning_rate": 5.975905154517309e-05, "loss": 0.0291, "step": 14960 }, { "epoch": 0.878830574145826, "grad_norm": 2.5182735919952393, "learning_rate": 5.971369335143787e-05, "loss": 0.0385, "step": 14970 }, { "epoch": 0.8794176353176001, "grad_norm": 0.026701239868998528, "learning_rate": 5.966832684879084e-05, "loss": 0.0481, "step": 14980 }, { "epoch": 0.8800046964893742, "grad_norm": 2.461606740951538, "learning_rate": 5.962295207603764e-05, "loss": 0.0479, "step": 14990 }, { "epoch": 0.8805917576611483, "grad_norm": 0.466767817735672, "learning_rate": 5.9577569071991e-05, "loss": 0.0394, "step": 15000 }, { "epoch": 0.8805917576611483, "eval_loss": 0.44330793619155884, "eval_runtime": 269.9247, "eval_samples_per_second": 3.501, "eval_steps_per_second": 3.501, "step": 15000 }, { "epoch": 0.8811788188329224, "grad_norm": 2.5875964164733887, "learning_rate": 5.953217787547072e-05, "loss": 0.061, "step": 15010 }, { "epoch": 0.8817658800046965, "grad_norm": 1.1449871063232422, "learning_rate": 5.9486778525303556e-05, "loss": 0.0284, "step": 15020 }, { "epoch": 0.8823529411764706, "grad_norm": 4.314147472381592, "learning_rate": 5.944137106032327e-05, "loss": 0.0322, "step": 15030 }, { "epoch": 0.8829400023482447, "grad_norm": 3.349551200866699, "learning_rate": 5.939595551937057e-05, "loss": 0.0516, "step": 15040 }, { "epoch": 0.8835270635200188, "grad_norm": 1.2026758193969727, "learning_rate": 5.9350531941293056e-05, "loss": 0.0286, "step": 15050 }, { "epoch": 0.8841141246917928, "grad_norm": 1.3250828981399536, "learning_rate": 5.93051003649452e-05, "loss": 0.0298, "step": 15060 }, { "epoch": 0.8847011858635669, "grad_norm": 0.25947925448417664, "learning_rate": 5.9259660829188337e-05, "loss": 0.059, "step": 15070 }, { "epoch": 0.885288247035341, "grad_norm": 1.218214750289917, "learning_rate": 5.921421337289059e-05, "loss": 0.0344, "step": 15080 }, { "epoch": 0.8858753082071151, "grad_norm": 3.7481465339660645, "learning_rate": 5.9168758034926855e-05, "loss": 0.0684, "step": 15090 }, { "epoch": 0.8864623693788892, "grad_norm": 1.504749059677124, "learning_rate": 5.912329485417879e-05, "loss": 0.0371, "step": 15100 }, { "epoch": 0.8870494305506634, "grad_norm": 1.3720250129699707, "learning_rate": 5.9077823869534745e-05, "loss": 0.0344, "step": 15110 }, { "epoch": 0.8876364917224375, "grad_norm": 0.861710250377655, "learning_rate": 5.9032345119889765e-05, "loss": 0.0347, "step": 15120 }, { "epoch": 0.8882235528942116, "grad_norm": 3.441455602645874, "learning_rate": 5.8986858644145505e-05, "loss": 0.0723, "step": 15130 }, { "epoch": 0.8888106140659857, "grad_norm": 0.7448044419288635, "learning_rate": 5.8941364481210245e-05, "loss": 0.0403, "step": 15140 }, { "epoch": 0.8893976752377598, "grad_norm": 1.0399250984191895, "learning_rate": 5.889586266999887e-05, "loss": 0.071, "step": 15150 }, { "epoch": 0.8899847364095339, "grad_norm": 3.521848440170288, "learning_rate": 5.8850353249432744e-05, "loss": 0.0444, "step": 15160 }, { "epoch": 0.890571797581308, "grad_norm": 4.5713300704956055, "learning_rate": 5.880483625843979e-05, "loss": 0.0485, "step": 15170 }, { "epoch": 0.8911588587530821, "grad_norm": 0.8455482721328735, "learning_rate": 5.8759311735954404e-05, "loss": 0.0323, "step": 15180 }, { "epoch": 0.8917459199248562, "grad_norm": 1.9206844568252563, "learning_rate": 5.8713779720917395e-05, "loss": 0.0379, "step": 15190 }, { "epoch": 0.8923329810966303, "grad_norm": 1.255924105644226, "learning_rate": 5.8668240252275995e-05, "loss": 0.0274, "step": 15200 }, { "epoch": 0.8929200422684044, "grad_norm": 2.0059471130371094, "learning_rate": 5.8622693368983847e-05, "loss": 0.0335, "step": 15210 }, { "epoch": 0.8935071034401785, "grad_norm": 1.881173849105835, "learning_rate": 5.857713911000087e-05, "loss": 0.0458, "step": 15220 }, { "epoch": 0.8940941646119526, "grad_norm": 0.7294260859489441, "learning_rate": 5.8531577514293324e-05, "loss": 0.0321, "step": 15230 }, { "epoch": 0.8946812257837267, "grad_norm": 1.6017183065414429, "learning_rate": 5.848600862083378e-05, "loss": 0.0436, "step": 15240 }, { "epoch": 0.8952682869555008, "grad_norm": 2.576051950454712, "learning_rate": 5.844043246860098e-05, "loss": 0.0787, "step": 15250 }, { "epoch": 0.8958553481272749, "grad_norm": 1.7845737934112549, "learning_rate": 5.839484909657993e-05, "loss": 0.0368, "step": 15260 }, { "epoch": 0.896442409299049, "grad_norm": 1.6344528198242188, "learning_rate": 5.834925854376181e-05, "loss": 0.032, "step": 15270 }, { "epoch": 0.897029470470823, "grad_norm": 2.0339598655700684, "learning_rate": 5.83036608491439e-05, "loss": 0.025, "step": 15280 }, { "epoch": 0.8976165316425971, "grad_norm": 0.42804691195487976, "learning_rate": 5.8258056051729634e-05, "loss": 0.0302, "step": 15290 }, { "epoch": 0.8982035928143712, "grad_norm": 3.1157166957855225, "learning_rate": 5.821244419052849e-05, "loss": 0.0761, "step": 15300 }, { "epoch": 0.8987906539861453, "grad_norm": 0.912639856338501, "learning_rate": 5.816682530455602e-05, "loss": 0.0411, "step": 15310 }, { "epoch": 0.8993777151579194, "grad_norm": 1.99317467212677, "learning_rate": 5.8121199432833754e-05, "loss": 0.0366, "step": 15320 }, { "epoch": 0.8999647763296935, "grad_norm": 1.7734209299087524, "learning_rate": 5.807556661438922e-05, "loss": 0.0253, "step": 15330 }, { "epoch": 0.9005518375014676, "grad_norm": 1.6674447059631348, "learning_rate": 5.802992688825587e-05, "loss": 0.048, "step": 15340 }, { "epoch": 0.9011388986732417, "grad_norm": 3.4875285625457764, "learning_rate": 5.798428029347306e-05, "loss": 0.0287, "step": 15350 }, { "epoch": 0.9017259598450158, "grad_norm": 1.6051019430160522, "learning_rate": 5.7938626869086066e-05, "loss": 0.0425, "step": 15360 }, { "epoch": 0.9023130210167899, "grad_norm": 0.6197221875190735, "learning_rate": 5.7892966654145944e-05, "loss": 0.0523, "step": 15370 }, { "epoch": 0.9029000821885641, "grad_norm": 3.6087498664855957, "learning_rate": 5.784729968770961e-05, "loss": 0.0339, "step": 15380 }, { "epoch": 0.9034871433603382, "grad_norm": 0.6627610325813293, "learning_rate": 5.780162600883974e-05, "loss": 0.0401, "step": 15390 }, { "epoch": 0.9040742045321123, "grad_norm": 1.812591552734375, "learning_rate": 5.775594565660472e-05, "loss": 0.0394, "step": 15400 }, { "epoch": 0.9046612657038864, "grad_norm": 3.026209592819214, "learning_rate": 5.771025867007868e-05, "loss": 0.0445, "step": 15410 }, { "epoch": 0.9052483268756605, "grad_norm": 4.8852386474609375, "learning_rate": 5.766456508834142e-05, "loss": 0.038, "step": 15420 }, { "epoch": 0.9058353880474346, "grad_norm": 1.1808998584747314, "learning_rate": 5.761886495047837e-05, "loss": 0.0547, "step": 15430 }, { "epoch": 0.9064224492192087, "grad_norm": 0.14771953225135803, "learning_rate": 5.757315829558057e-05, "loss": 0.0621, "step": 15440 }, { "epoch": 0.9070095103909828, "grad_norm": 1.789146900177002, "learning_rate": 5.752744516274465e-05, "loss": 0.035, "step": 15450 }, { "epoch": 0.9075965715627569, "grad_norm": 3.1927261352539062, "learning_rate": 5.748172559107277e-05, "loss": 0.0614, "step": 15460 }, { "epoch": 0.908183632734531, "grad_norm": 2.276498317718506, "learning_rate": 5.7435999619672595e-05, "loss": 0.0407, "step": 15470 }, { "epoch": 0.908770693906305, "grad_norm": 2.7692625522613525, "learning_rate": 5.739026728765726e-05, "loss": 0.0351, "step": 15480 }, { "epoch": 0.9093577550780791, "grad_norm": 2.171767473220825, "learning_rate": 5.7344528634145354e-05, "loss": 0.0444, "step": 15490 }, { "epoch": 0.9099448162498532, "grad_norm": 1.2514421939849854, "learning_rate": 5.7298783698260874e-05, "loss": 0.048, "step": 15500 }, { "epoch": 0.9105318774216273, "grad_norm": 2.918902635574341, "learning_rate": 5.725303251913317e-05, "loss": 0.0558, "step": 15510 }, { "epoch": 0.9111189385934014, "grad_norm": 1.763729214668274, "learning_rate": 5.7207275135896945e-05, "loss": 0.0392, "step": 15520 }, { "epoch": 0.9117059997651755, "grad_norm": 1.1538106203079224, "learning_rate": 5.7161511587692216e-05, "loss": 0.0369, "step": 15530 }, { "epoch": 0.9122930609369496, "grad_norm": 0.6280871033668518, "learning_rate": 5.7115741913664264e-05, "loss": 0.0412, "step": 15540 }, { "epoch": 0.9128801221087237, "grad_norm": 1.0035511255264282, "learning_rate": 5.7069966152963614e-05, "loss": 0.0227, "step": 15550 }, { "epoch": 0.9134671832804978, "grad_norm": 1.5030596256256104, "learning_rate": 5.702418434474601e-05, "loss": 0.0329, "step": 15560 }, { "epoch": 0.9140542444522719, "grad_norm": 2.701653003692627, "learning_rate": 5.6978396528172326e-05, "loss": 0.0426, "step": 15570 }, { "epoch": 0.914641305624046, "grad_norm": 0.8428555727005005, "learning_rate": 5.693260274240863e-05, "loss": 0.0689, "step": 15580 }, { "epoch": 0.9152283667958201, "grad_norm": 3.0564322471618652, "learning_rate": 5.688680302662607e-05, "loss": 0.0518, "step": 15590 }, { "epoch": 0.9158154279675942, "grad_norm": 3.0662670135498047, "learning_rate": 5.6840997420000865e-05, "loss": 0.0633, "step": 15600 }, { "epoch": 0.9164024891393683, "grad_norm": 3.3530144691467285, "learning_rate": 5.679518596171425e-05, "loss": 0.0302, "step": 15610 }, { "epoch": 0.9169895503111424, "grad_norm": 2.3576653003692627, "learning_rate": 5.674936869095252e-05, "loss": 0.0386, "step": 15620 }, { "epoch": 0.9175766114829165, "grad_norm": 0.9424792528152466, "learning_rate": 5.670354564690692e-05, "loss": 0.0478, "step": 15630 }, { "epoch": 0.9181636726546906, "grad_norm": 2.947878837585449, "learning_rate": 5.665771686877358e-05, "loss": 0.0501, "step": 15640 }, { "epoch": 0.9187507338264648, "grad_norm": 3.772454261779785, "learning_rate": 5.661188239575364e-05, "loss": 0.055, "step": 15650 }, { "epoch": 0.9193377949982389, "grad_norm": 5.333547592163086, "learning_rate": 5.6566042267052997e-05, "loss": 0.0297, "step": 15660 }, { "epoch": 0.919924856170013, "grad_norm": 1.9018349647521973, "learning_rate": 5.6520196521882475e-05, "loss": 0.0391, "step": 15670 }, { "epoch": 0.9205119173417871, "grad_norm": 2.1346988677978516, "learning_rate": 5.647434519945767e-05, "loss": 0.0399, "step": 15680 }, { "epoch": 0.9210989785135612, "grad_norm": 0.6692573428153992, "learning_rate": 5.642848833899891e-05, "loss": 0.0391, "step": 15690 }, { "epoch": 0.9216860396853352, "grad_norm": 1.264906883239746, "learning_rate": 5.638262597973134e-05, "loss": 0.0492, "step": 15700 }, { "epoch": 0.9222731008571093, "grad_norm": 2.9607551097869873, "learning_rate": 5.633675816088475e-05, "loss": 0.0456, "step": 15710 }, { "epoch": 0.9228601620288834, "grad_norm": 0.5963914394378662, "learning_rate": 5.62908849216936e-05, "loss": 0.0505, "step": 15720 }, { "epoch": 0.9234472232006575, "grad_norm": 1.0271310806274414, "learning_rate": 5.624500630139702e-05, "loss": 0.0284, "step": 15730 }, { "epoch": 0.9240342843724316, "grad_norm": 1.6457319259643555, "learning_rate": 5.619912233923872e-05, "loss": 0.042, "step": 15740 }, { "epoch": 0.9246213455442057, "grad_norm": 2.4600753784179688, "learning_rate": 5.615323307446697e-05, "loss": 0.0392, "step": 15750 }, { "epoch": 0.9252084067159798, "grad_norm": 0.8508187532424927, "learning_rate": 5.610733854633462e-05, "loss": 0.0353, "step": 15760 }, { "epoch": 0.9257954678877539, "grad_norm": 2.4094738960266113, "learning_rate": 5.6061438794098974e-05, "loss": 0.0413, "step": 15770 }, { "epoch": 0.926382529059528, "grad_norm": 0.9184495210647583, "learning_rate": 5.601553385702182e-05, "loss": 0.0243, "step": 15780 }, { "epoch": 0.9269695902313021, "grad_norm": 1.6502752304077148, "learning_rate": 5.5969623774369396e-05, "loss": 0.0335, "step": 15790 }, { "epoch": 0.9275566514030762, "grad_norm": 2.8988659381866455, "learning_rate": 5.592370858541232e-05, "loss": 0.0363, "step": 15800 }, { "epoch": 0.9281437125748503, "grad_norm": 2.271238327026367, "learning_rate": 5.587778832942556e-05, "loss": 0.0388, "step": 15810 }, { "epoch": 0.9287307737466244, "grad_norm": 0.7988091707229614, "learning_rate": 5.583186304568849e-05, "loss": 0.0301, "step": 15820 }, { "epoch": 0.9293178349183985, "grad_norm": 0.6879439949989319, "learning_rate": 5.578593277348473e-05, "loss": 0.0439, "step": 15830 }, { "epoch": 0.9299048960901726, "grad_norm": 2.1726632118225098, "learning_rate": 5.573999755210215e-05, "loss": 0.0326, "step": 15840 }, { "epoch": 0.9304919572619467, "grad_norm": 2.4020466804504395, "learning_rate": 5.56940574208329e-05, "loss": 0.0252, "step": 15850 }, { "epoch": 0.9310790184337208, "grad_norm": 2.6222338676452637, "learning_rate": 5.564811241897333e-05, "loss": 0.0317, "step": 15860 }, { "epoch": 0.9316660796054949, "grad_norm": 4.5372138023376465, "learning_rate": 5.5602162585823894e-05, "loss": 0.0366, "step": 15870 }, { "epoch": 0.932253140777269, "grad_norm": 3.0580427646636963, "learning_rate": 5.555620796068925e-05, "loss": 0.0551, "step": 15880 }, { "epoch": 0.932840201949043, "grad_norm": 0.9663577079772949, "learning_rate": 5.551024858287812e-05, "loss": 0.0382, "step": 15890 }, { "epoch": 0.9334272631208171, "grad_norm": 1.553074598312378, "learning_rate": 5.546428449170329e-05, "loss": 0.0356, "step": 15900 }, { "epoch": 0.9340143242925912, "grad_norm": 2.1263463497161865, "learning_rate": 5.54183157264816e-05, "loss": 0.0241, "step": 15910 }, { "epoch": 0.9346013854643654, "grad_norm": 1.7101470232009888, "learning_rate": 5.537234232653386e-05, "loss": 0.035, "step": 15920 }, { "epoch": 0.9351884466361395, "grad_norm": 1.838174819946289, "learning_rate": 5.532636433118484e-05, "loss": 0.0332, "step": 15930 }, { "epoch": 0.9357755078079136, "grad_norm": 1.9210201501846313, "learning_rate": 5.52803817797633e-05, "loss": 0.0464, "step": 15940 }, { "epoch": 0.9363625689796877, "grad_norm": 0.9225133657455444, "learning_rate": 5.523439471160181e-05, "loss": 0.0477, "step": 15950 }, { "epoch": 0.9369496301514618, "grad_norm": 2.7963900566101074, "learning_rate": 5.518840316603689e-05, "loss": 0.0268, "step": 15960 }, { "epoch": 0.9375366913232359, "grad_norm": 0.47619158029556274, "learning_rate": 5.514240718240884e-05, "loss": 0.0461, "step": 15970 }, { "epoch": 0.93812375249501, "grad_norm": 1.3008337020874023, "learning_rate": 5.509640680006175e-05, "loss": 0.0633, "step": 15980 }, { "epoch": 0.9387108136667841, "grad_norm": 1.382997989654541, "learning_rate": 5.5050402058343476e-05, "loss": 0.0182, "step": 15990 }, { "epoch": 0.9392978748385582, "grad_norm": 5.267919063568115, "learning_rate": 5.500439299660566e-05, "loss": 0.0502, "step": 16000 }, { "epoch": 0.9398849360103323, "grad_norm": 0.74673992395401, "learning_rate": 5.495837965420356e-05, "loss": 0.0636, "step": 16010 }, { "epoch": 0.9404719971821064, "grad_norm": 1.4157620668411255, "learning_rate": 5.491236207049613e-05, "loss": 0.0338, "step": 16020 }, { "epoch": 0.9410590583538805, "grad_norm": 1.6849260330200195, "learning_rate": 5.4866340284845955e-05, "loss": 0.0247, "step": 16030 }, { "epoch": 0.9416461195256546, "grad_norm": 1.1458992958068848, "learning_rate": 5.4820314336619214e-05, "loss": 0.0232, "step": 16040 }, { "epoch": 0.9422331806974287, "grad_norm": 3.4146482944488525, "learning_rate": 5.477428426518565e-05, "loss": 0.0171, "step": 16050 }, { "epoch": 0.9428202418692028, "grad_norm": 1.5777746438980103, "learning_rate": 5.472825010991852e-05, "loss": 0.0322, "step": 16060 }, { "epoch": 0.9434073030409769, "grad_norm": 0.5253183841705322, "learning_rate": 5.468221191019457e-05, "loss": 0.0307, "step": 16070 }, { "epoch": 0.943994364212751, "grad_norm": 1.2979775667190552, "learning_rate": 5.463616970539403e-05, "loss": 0.0506, "step": 16080 }, { "epoch": 0.944581425384525, "grad_norm": 4.567948818206787, "learning_rate": 5.459012353490054e-05, "loss": 0.0344, "step": 16090 }, { "epoch": 0.9451684865562991, "grad_norm": 0.19619053602218628, "learning_rate": 5.454407343810112e-05, "loss": 0.0458, "step": 16100 }, { "epoch": 0.9457555477280732, "grad_norm": 0.15248937904834747, "learning_rate": 5.449801945438619e-05, "loss": 0.0401, "step": 16110 }, { "epoch": 0.9463426088998473, "grad_norm": 4.874411582946777, "learning_rate": 5.445196162314944e-05, "loss": 0.0563, "step": 16120 }, { "epoch": 0.9469296700716214, "grad_norm": 1.524129867553711, "learning_rate": 5.440589998378788e-05, "loss": 0.0401, "step": 16130 }, { "epoch": 0.9475167312433955, "grad_norm": 2.1806161403656006, "learning_rate": 5.435983457570179e-05, "loss": 0.0593, "step": 16140 }, { "epoch": 0.9481037924151696, "grad_norm": 2.463407039642334, "learning_rate": 5.431376543829467e-05, "loss": 0.0321, "step": 16150 }, { "epoch": 0.9486908535869437, "grad_norm": 2.473222017288208, "learning_rate": 5.426769261097317e-05, "loss": 0.0257, "step": 16160 }, { "epoch": 0.9492779147587178, "grad_norm": 3.790813446044922, "learning_rate": 5.422161613314715e-05, "loss": 0.0402, "step": 16170 }, { "epoch": 0.9498649759304919, "grad_norm": 1.729813575744629, "learning_rate": 5.4175536044229555e-05, "loss": 0.0413, "step": 16180 }, { "epoch": 0.950452037102266, "grad_norm": 0.2650720477104187, "learning_rate": 5.412945238363643e-05, "loss": 0.0232, "step": 16190 }, { "epoch": 0.9510390982740402, "grad_norm": 0.5374074578285217, "learning_rate": 5.408336519078688e-05, "loss": 0.034, "step": 16200 }, { "epoch": 0.9516261594458143, "grad_norm": 2.8509881496429443, "learning_rate": 5.403727450510304e-05, "loss": 0.0506, "step": 16210 }, { "epoch": 0.9522132206175884, "grad_norm": 2.0335960388183594, "learning_rate": 5.399118036601001e-05, "loss": 0.031, "step": 16220 }, { "epoch": 0.9528002817893625, "grad_norm": 1.038758635520935, "learning_rate": 5.3945082812935857e-05, "loss": 0.0255, "step": 16230 }, { "epoch": 0.9533873429611366, "grad_norm": 2.2434442043304443, "learning_rate": 5.389898188531156e-05, "loss": 0.028, "step": 16240 }, { "epoch": 0.9539744041329107, "grad_norm": 2.159489631652832, "learning_rate": 5.385287762257101e-05, "loss": 0.0235, "step": 16250 }, { "epoch": 0.9545614653046848, "grad_norm": 2.3346493244171143, "learning_rate": 5.380677006415093e-05, "loss": 0.0437, "step": 16260 }, { "epoch": 0.9551485264764589, "grad_norm": 3.1966068744659424, "learning_rate": 5.376065924949083e-05, "loss": 0.0331, "step": 16270 }, { "epoch": 0.955735587648233, "grad_norm": 4.0267014503479, "learning_rate": 5.3714545218033076e-05, "loss": 0.03, "step": 16280 }, { "epoch": 0.9563226488200071, "grad_norm": 1.0968010425567627, "learning_rate": 5.366842800922274e-05, "loss": 0.0507, "step": 16290 }, { "epoch": 0.9569097099917812, "grad_norm": 4.610196113586426, "learning_rate": 5.362230766250761e-05, "loss": 0.049, "step": 16300 }, { "epoch": 0.9574967711635552, "grad_norm": 1.0601422786712646, "learning_rate": 5.3576184217338185e-05, "loss": 0.0947, "step": 16310 }, { "epoch": 0.9580838323353293, "grad_norm": 0.8651183247566223, "learning_rate": 5.35300577131676e-05, "loss": 0.0354, "step": 16320 }, { "epoch": 0.9586708935071034, "grad_norm": 3.9636476039886475, "learning_rate": 5.3483928189451585e-05, "loss": 0.0886, "step": 16330 }, { "epoch": 0.9592579546788775, "grad_norm": 0.9150936007499695, "learning_rate": 5.343779568564848e-05, "loss": 0.0393, "step": 16340 }, { "epoch": 0.9598450158506516, "grad_norm": 1.3083094358444214, "learning_rate": 5.339166024121919e-05, "loss": 0.0639, "step": 16350 }, { "epoch": 0.9604320770224257, "grad_norm": 2.6717019081115723, "learning_rate": 5.334552189562707e-05, "loss": 0.0364, "step": 16360 }, { "epoch": 0.9610191381941998, "grad_norm": 2.7528023719787598, "learning_rate": 5.329938068833803e-05, "loss": 0.0623, "step": 16370 }, { "epoch": 0.9616061993659739, "grad_norm": 2.5364022254943848, "learning_rate": 5.3253236658820396e-05, "loss": 0.0298, "step": 16380 }, { "epoch": 0.962193260537748, "grad_norm": 2.2441797256469727, "learning_rate": 5.320708984654489e-05, "loss": 0.0548, "step": 16390 }, { "epoch": 0.9627803217095221, "grad_norm": 1.7258532047271729, "learning_rate": 5.316094029098465e-05, "loss": 0.0597, "step": 16400 }, { "epoch": 0.9633673828812962, "grad_norm": 0.5922069549560547, "learning_rate": 5.311478803161513e-05, "loss": 0.0279, "step": 16410 }, { "epoch": 0.9639544440530703, "grad_norm": 1.452048420906067, "learning_rate": 5.306863310791411e-05, "loss": 0.0339, "step": 16420 }, { "epoch": 0.9645415052248444, "grad_norm": 3.686988353729248, "learning_rate": 5.302247555936168e-05, "loss": 0.0379, "step": 16430 }, { "epoch": 0.9651285663966185, "grad_norm": 1.7346909046173096, "learning_rate": 5.2976315425440123e-05, "loss": 0.0314, "step": 16440 }, { "epoch": 0.9657156275683926, "grad_norm": 1.4474844932556152, "learning_rate": 5.293015274563394e-05, "loss": 0.0395, "step": 16450 }, { "epoch": 0.9663026887401667, "grad_norm": 1.6750543117523193, "learning_rate": 5.288398755942985e-05, "loss": 0.0273, "step": 16460 }, { "epoch": 0.9668897499119409, "grad_norm": 2.921114683151245, "learning_rate": 5.283781990631668e-05, "loss": 0.059, "step": 16470 }, { "epoch": 0.967476811083715, "grad_norm": 1.5211082696914673, "learning_rate": 5.279164982578536e-05, "loss": 0.0475, "step": 16480 }, { "epoch": 0.9680638722554891, "grad_norm": 4.571354866027832, "learning_rate": 5.2745477357328955e-05, "loss": 0.0531, "step": 16490 }, { "epoch": 0.9686509334272632, "grad_norm": 4.393733978271484, "learning_rate": 5.26993025404425e-05, "loss": 0.046, "step": 16500 }, { "epoch": 0.9692379945990373, "grad_norm": 0.4008418321609497, "learning_rate": 5.265312541462308e-05, "loss": 0.0356, "step": 16510 }, { "epoch": 0.9698250557708114, "grad_norm": 0.5978420972824097, "learning_rate": 5.260694601936975e-05, "loss": 0.0358, "step": 16520 }, { "epoch": 0.9704121169425854, "grad_norm": 2.1604294776916504, "learning_rate": 5.2560764394183494e-05, "loss": 0.0228, "step": 16530 }, { "epoch": 0.9709991781143595, "grad_norm": 4.071254730224609, "learning_rate": 5.2514580578567216e-05, "loss": 0.0528, "step": 16540 }, { "epoch": 0.9715862392861336, "grad_norm": 0.4931686222553253, "learning_rate": 5.2468394612025686e-05, "loss": 0.0272, "step": 16550 }, { "epoch": 0.9721733004579077, "grad_norm": 1.4254976511001587, "learning_rate": 5.242220653406553e-05, "loss": 0.0317, "step": 16560 }, { "epoch": 0.9727603616296818, "grad_norm": 1.2035225629806519, "learning_rate": 5.2376016384195136e-05, "loss": 0.03, "step": 16570 }, { "epoch": 0.9733474228014559, "grad_norm": 1.90572988986969, "learning_rate": 5.232982420192474e-05, "loss": 0.0256, "step": 16580 }, { "epoch": 0.97393448397323, "grad_norm": 1.1137534379959106, "learning_rate": 5.2283630026766225e-05, "loss": 0.0269, "step": 16590 }, { "epoch": 0.9745215451450041, "grad_norm": 0.971361517906189, "learning_rate": 5.223743389823327e-05, "loss": 0.0362, "step": 16600 }, { "epoch": 0.9751086063167782, "grad_norm": 1.9118727445602417, "learning_rate": 5.2191235855841146e-05, "loss": 0.0564, "step": 16610 }, { "epoch": 0.9756956674885523, "grad_norm": 2.5001401901245117, "learning_rate": 5.21450359391068e-05, "loss": 0.0446, "step": 16620 }, { "epoch": 0.9762827286603264, "grad_norm": 1.0125675201416016, "learning_rate": 5.2098834187548805e-05, "loss": 0.0217, "step": 16630 }, { "epoch": 0.9768697898321005, "grad_norm": 1.999197006225586, "learning_rate": 5.205263064068725e-05, "loss": 0.0451, "step": 16640 }, { "epoch": 0.9774568510038746, "grad_norm": 1.5815180540084839, "learning_rate": 5.200642533804379e-05, "loss": 0.0504, "step": 16650 }, { "epoch": 0.9780439121756487, "grad_norm": 2.38808012008667, "learning_rate": 5.196021831914157e-05, "loss": 0.0469, "step": 16660 }, { "epoch": 0.9786309733474228, "grad_norm": 2.111520290374756, "learning_rate": 5.191400962350523e-05, "loss": 0.0289, "step": 16670 }, { "epoch": 0.9792180345191969, "grad_norm": 1.6514825820922852, "learning_rate": 5.1867799290660815e-05, "loss": 0.0291, "step": 16680 }, { "epoch": 0.979805095690971, "grad_norm": 2.216040849685669, "learning_rate": 5.182158736013577e-05, "loss": 0.0264, "step": 16690 }, { "epoch": 0.9803921568627451, "grad_norm": 2.7220072746276855, "learning_rate": 5.177537387145894e-05, "loss": 0.0279, "step": 16700 }, { "epoch": 0.9809792180345192, "grad_norm": 2.420001268386841, "learning_rate": 5.1729158864160466e-05, "loss": 0.0312, "step": 16710 }, { "epoch": 0.9815662792062932, "grad_norm": 2.6234045028686523, "learning_rate": 5.16829423777718e-05, "loss": 0.0498, "step": 16720 }, { "epoch": 0.9821533403780673, "grad_norm": 2.2999556064605713, "learning_rate": 5.163672445182568e-05, "loss": 0.0463, "step": 16730 }, { "epoch": 0.9827404015498415, "grad_norm": 2.363856077194214, "learning_rate": 5.1590505125856025e-05, "loss": 0.0437, "step": 16740 }, { "epoch": 0.9833274627216156, "grad_norm": 1.5514051914215088, "learning_rate": 5.1544284439398006e-05, "loss": 0.032, "step": 16750 }, { "epoch": 0.9839145238933897, "grad_norm": 1.6730340719223022, "learning_rate": 5.149806243198794e-05, "loss": 0.076, "step": 16760 }, { "epoch": 0.9845015850651638, "grad_norm": 1.1458053588867188, "learning_rate": 5.1451839143163226e-05, "loss": 0.0493, "step": 16770 }, { "epoch": 0.9850886462369379, "grad_norm": 3.1543195247650146, "learning_rate": 5.140561461246246e-05, "loss": 0.0583, "step": 16780 }, { "epoch": 0.985675707408712, "grad_norm": 0.5939778089523315, "learning_rate": 5.13593888794252e-05, "loss": 0.0395, "step": 16790 }, { "epoch": 0.9862627685804861, "grad_norm": 1.49458909034729, "learning_rate": 5.1313161983592096e-05, "loss": 0.023, "step": 16800 }, { "epoch": 0.9868498297522602, "grad_norm": 0.751722514629364, "learning_rate": 5.126693396450476e-05, "loss": 0.0366, "step": 16810 }, { "epoch": 0.9874368909240343, "grad_norm": 0.714433491230011, "learning_rate": 5.1220704861705774e-05, "loss": 0.034, "step": 16820 }, { "epoch": 0.9880239520958084, "grad_norm": 2.0831124782562256, "learning_rate": 5.117447471473865e-05, "loss": 0.0312, "step": 16830 }, { "epoch": 0.9886110132675825, "grad_norm": 0.7495170831680298, "learning_rate": 5.1128243563147816e-05, "loss": 0.0459, "step": 16840 }, { "epoch": 0.9891980744393566, "grad_norm": 0.08070006221532822, "learning_rate": 5.108201144647851e-05, "loss": 0.0382, "step": 16850 }, { "epoch": 0.9897851356111307, "grad_norm": 0.5520992875099182, "learning_rate": 5.1035778404276815e-05, "loss": 0.0108, "step": 16860 }, { "epoch": 0.9903721967829048, "grad_norm": 1.1832150220870972, "learning_rate": 5.098954447608964e-05, "loss": 0.0262, "step": 16870 }, { "epoch": 0.9909592579546789, "grad_norm": 0.3039027750492096, "learning_rate": 5.0943309701464584e-05, "loss": 0.0199, "step": 16880 }, { "epoch": 0.991546319126453, "grad_norm": 1.2417914867401123, "learning_rate": 5.089707411995005e-05, "loss": 0.0302, "step": 16890 }, { "epoch": 0.9921333802982271, "grad_norm": 2.827122688293457, "learning_rate": 5.0850837771095074e-05, "loss": 0.0477, "step": 16900 }, { "epoch": 0.9927204414700012, "grad_norm": 1.0996414422988892, "learning_rate": 5.080460069444936e-05, "loss": 0.0433, "step": 16910 }, { "epoch": 0.9933075026417753, "grad_norm": 3.985029935836792, "learning_rate": 5.0758362929563244e-05, "loss": 0.0477, "step": 16920 }, { "epoch": 0.9938945638135493, "grad_norm": 0.7686489820480347, "learning_rate": 5.071212451598765e-05, "loss": 0.0329, "step": 16930 }, { "epoch": 0.9944816249853234, "grad_norm": 1.4695439338684082, "learning_rate": 5.066588549327403e-05, "loss": 0.0344, "step": 16940 }, { "epoch": 0.9950686861570975, "grad_norm": 1.7345598936080933, "learning_rate": 5.061964590097442e-05, "loss": 0.0302, "step": 16950 }, { "epoch": 0.9956557473288716, "grad_norm": 2.0473780632019043, "learning_rate": 5.057340577864127e-05, "loss": 0.0268, "step": 16960 }, { "epoch": 0.9962428085006457, "grad_norm": 0.17342029511928558, "learning_rate": 5.052716516582753e-05, "loss": 0.0388, "step": 16970 }, { "epoch": 0.9968298696724198, "grad_norm": 0.6585134863853455, "learning_rate": 5.048092410208656e-05, "loss": 0.0257, "step": 16980 }, { "epoch": 0.9974169308441939, "grad_norm": 1.3544081449508667, "learning_rate": 5.0434682626972105e-05, "loss": 0.0342, "step": 16990 }, { "epoch": 0.998003992015968, "grad_norm": 1.9180796146392822, "learning_rate": 5.0388440780038235e-05, "loss": 0.0461, "step": 17000 }, { "epoch": 0.9985910531877422, "grad_norm": 1.034408450126648, "learning_rate": 5.0342198600839394e-05, "loss": 0.0334, "step": 17010 }, { "epoch": 0.9991781143595163, "grad_norm": 1.6787270307540894, "learning_rate": 5.029595612893027e-05, "loss": 0.0365, "step": 17020 }, { "epoch": 0.9997651755312904, "grad_norm": 3.2107155323028564, "learning_rate": 5.024971340386577e-05, "loss": 0.0163, "step": 17030 }, { "epoch": 1.0003522367030644, "grad_norm": 0.9043329954147339, "learning_rate": 5.020347046520112e-05, "loss": 0.0274, "step": 17040 }, { "epoch": 1.0009392978748386, "grad_norm": 2.0490527153015137, "learning_rate": 5.015722735249163e-05, "loss": 0.0196, "step": 17050 }, { "epoch": 1.0015263590466126, "grad_norm": 4.039851665496826, "learning_rate": 5.0110984105292793e-05, "loss": 0.0193, "step": 17060 }, { "epoch": 1.0021134202183868, "grad_norm": 2.4501867294311523, "learning_rate": 5.0064740763160224e-05, "loss": 0.0157, "step": 17070 }, { "epoch": 1.0027004813901608, "grad_norm": 0.29475656151771545, "learning_rate": 5.001849736564961e-05, "loss": 0.0168, "step": 17080 }, { "epoch": 1.003287542561935, "grad_norm": 1.3502825498580933, "learning_rate": 4.99722539523167e-05, "loss": 0.0347, "step": 17090 }, { "epoch": 1.003874603733709, "grad_norm": 0.9406032562255859, "learning_rate": 4.9926010562717255e-05, "loss": 0.0207, "step": 17100 }, { "epoch": 1.0044616649054832, "grad_norm": 0.1644798219203949, "learning_rate": 4.987976723640698e-05, "loss": 0.0248, "step": 17110 }, { "epoch": 1.0050487260772571, "grad_norm": 1.1704422235488892, "learning_rate": 4.983352401294157e-05, "loss": 0.0134, "step": 17120 }, { "epoch": 1.0056357872490314, "grad_norm": 1.6676406860351562, "learning_rate": 4.97872809318766e-05, "loss": 0.0212, "step": 17130 }, { "epoch": 1.0062228484208053, "grad_norm": 0.01679125428199768, "learning_rate": 4.974103803276756e-05, "loss": 0.0058, "step": 17140 }, { "epoch": 1.0068099095925795, "grad_norm": 1.9125401973724365, "learning_rate": 4.9694795355169734e-05, "loss": 0.0105, "step": 17150 }, { "epoch": 1.0073969707643537, "grad_norm": 2.4387991428375244, "learning_rate": 4.964855293863828e-05, "loss": 0.0275, "step": 17160 }, { "epoch": 1.0079840319361277, "grad_norm": 0.5251579880714417, "learning_rate": 4.960231082272805e-05, "loss": 0.023, "step": 17170 }, { "epoch": 1.008571093107902, "grad_norm": 0.6000058054924011, "learning_rate": 4.955606904699371e-05, "loss": 0.0163, "step": 17180 }, { "epoch": 1.009158154279676, "grad_norm": 1.5222712755203247, "learning_rate": 4.950982765098965e-05, "loss": 0.0244, "step": 17190 }, { "epoch": 1.0097452154514501, "grad_norm": 1.656982421875, "learning_rate": 4.946358667426984e-05, "loss": 0.0204, "step": 17200 }, { "epoch": 1.0103322766232241, "grad_norm": 3.2606256008148193, "learning_rate": 4.941734615638797e-05, "loss": 0.0187, "step": 17210 }, { "epoch": 1.0109193377949983, "grad_norm": 1.3270857334136963, "learning_rate": 4.93711061368973e-05, "loss": 0.0249, "step": 17220 }, { "epoch": 1.0115063989667723, "grad_norm": 3.2098591327667236, "learning_rate": 4.9324866655350706e-05, "loss": 0.0158, "step": 17230 }, { "epoch": 1.0120934601385465, "grad_norm": 1.1581672430038452, "learning_rate": 4.927862775130055e-05, "loss": 0.0248, "step": 17240 }, { "epoch": 1.0126805213103205, "grad_norm": 2.123234987258911, "learning_rate": 4.923238946429876e-05, "loss": 0.0154, "step": 17250 }, { "epoch": 1.0132675824820947, "grad_norm": 0.3895007073879242, "learning_rate": 4.918615183389665e-05, "loss": 0.039, "step": 17260 }, { "epoch": 1.0138546436538687, "grad_norm": 3.8976800441741943, "learning_rate": 4.9139914899645096e-05, "loss": 0.0328, "step": 17270 }, { "epoch": 1.0144417048256429, "grad_norm": 3.9558913707733154, "learning_rate": 4.909367870109424e-05, "loss": 0.0261, "step": 17280 }, { "epoch": 1.0150287659974169, "grad_norm": 1.230304479598999, "learning_rate": 4.90474432777937e-05, "loss": 0.021, "step": 17290 }, { "epoch": 1.015615827169191, "grad_norm": 0.29122626781463623, "learning_rate": 4.900120866929238e-05, "loss": 0.0199, "step": 17300 }, { "epoch": 1.016202888340965, "grad_norm": 3.575148820877075, "learning_rate": 4.89549749151385e-05, "loss": 0.0109, "step": 17310 }, { "epoch": 1.0167899495127393, "grad_norm": 1.0561134815216064, "learning_rate": 4.890874205487957e-05, "loss": 0.0112, "step": 17320 }, { "epoch": 1.0173770106845132, "grad_norm": 4.787125110626221, "learning_rate": 4.8862510128062284e-05, "loss": 0.0169, "step": 17330 }, { "epoch": 1.0179640718562875, "grad_norm": 0.4798748195171356, "learning_rate": 4.881627917423261e-05, "loss": 0.0181, "step": 17340 }, { "epoch": 1.0185511330280614, "grad_norm": 1.4769734144210815, "learning_rate": 4.8770049232935575e-05, "loss": 0.0075, "step": 17350 }, { "epoch": 1.0191381941998356, "grad_norm": 1.235856533050537, "learning_rate": 4.8723820343715484e-05, "loss": 0.037, "step": 17360 }, { "epoch": 1.0197252553716096, "grad_norm": 1.6534072160720825, "learning_rate": 4.867759254611561e-05, "loss": 0.0241, "step": 17370 }, { "epoch": 1.0203123165433838, "grad_norm": 0.18529169261455536, "learning_rate": 4.8631365879678384e-05, "loss": 0.0104, "step": 17380 }, { "epoch": 1.0208993777151578, "grad_norm": 0.885037362575531, "learning_rate": 4.85851403839452e-05, "loss": 0.0085, "step": 17390 }, { "epoch": 1.021486438886932, "grad_norm": 1.1856127977371216, "learning_rate": 4.85389160984565e-05, "loss": 0.0274, "step": 17400 }, { "epoch": 1.022073500058706, "grad_norm": 0.05116559565067291, "learning_rate": 4.8492693062751675e-05, "loss": 0.0163, "step": 17410 }, { "epoch": 1.0226605612304802, "grad_norm": 1.601496934890747, "learning_rate": 4.844647131636907e-05, "loss": 0.0267, "step": 17420 }, { "epoch": 1.0232476224022544, "grad_norm": 0.557055652141571, "learning_rate": 4.840025089884583e-05, "loss": 0.0335, "step": 17430 }, { "epoch": 1.0238346835740284, "grad_norm": 1.0247009992599487, "learning_rate": 4.8354031849718126e-05, "loss": 0.0112, "step": 17440 }, { "epoch": 1.0244217447458026, "grad_norm": 1.172916293144226, "learning_rate": 4.8307814208520806e-05, "loss": 0.0318, "step": 17450 }, { "epoch": 1.0250088059175766, "grad_norm": 2.2573323249816895, "learning_rate": 4.82615980147876e-05, "loss": 0.0174, "step": 17460 }, { "epoch": 1.0255958670893508, "grad_norm": 0.3718615472316742, "learning_rate": 4.821538330805098e-05, "loss": 0.0536, "step": 17470 }, { "epoch": 1.0261829282611248, "grad_norm": 1.2338478565216064, "learning_rate": 4.816917012784213e-05, "loss": 0.0206, "step": 17480 }, { "epoch": 1.026769989432899, "grad_norm": 0.7763340473175049, "learning_rate": 4.812295851369096e-05, "loss": 0.0238, "step": 17490 }, { "epoch": 1.027357050604673, "grad_norm": 0.7135237455368042, "learning_rate": 4.807674850512601e-05, "loss": 0.0201, "step": 17500 }, { "epoch": 1.0279441117764472, "grad_norm": 0.7816159725189209, "learning_rate": 4.803054014167447e-05, "loss": 0.0244, "step": 17510 }, { "epoch": 1.0285311729482212, "grad_norm": 0.604958713054657, "learning_rate": 4.7984333462862066e-05, "loss": 0.0132, "step": 17520 }, { "epoch": 1.0291182341199954, "grad_norm": 0.3558271825313568, "learning_rate": 4.793812850821319e-05, "loss": 0.016, "step": 17530 }, { "epoch": 1.0297052952917694, "grad_norm": 0.5346792340278625, "learning_rate": 4.789192531725066e-05, "loss": 0.0201, "step": 17540 }, { "epoch": 1.0302923564635436, "grad_norm": 1.6487033367156982, "learning_rate": 4.784572392949583e-05, "loss": 0.0243, "step": 17550 }, { "epoch": 1.0308794176353175, "grad_norm": 2.0586116313934326, "learning_rate": 4.77995243844685e-05, "loss": 0.0149, "step": 17560 }, { "epoch": 1.0314664788070917, "grad_norm": 1.5526201725006104, "learning_rate": 4.775332672168691e-05, "loss": 0.015, "step": 17570 }, { "epoch": 1.0320535399788657, "grad_norm": 0.4767589867115021, "learning_rate": 4.770713098066765e-05, "loss": 0.0218, "step": 17580 }, { "epoch": 1.03264060115064, "grad_norm": 1.7146309614181519, "learning_rate": 4.7660937200925726e-05, "loss": 0.0335, "step": 17590 }, { "epoch": 1.033227662322414, "grad_norm": 1.1817280054092407, "learning_rate": 4.7614745421974447e-05, "loss": 0.0274, "step": 17600 }, { "epoch": 1.0338147234941881, "grad_norm": 2.4363436698913574, "learning_rate": 4.7568555683325325e-05, "loss": 0.0161, "step": 17610 }, { "epoch": 1.034401784665962, "grad_norm": 1.1145620346069336, "learning_rate": 4.752236802448829e-05, "loss": 0.0119, "step": 17620 }, { "epoch": 1.0349888458377363, "grad_norm": 1.6344585418701172, "learning_rate": 4.747618248497134e-05, "loss": 0.0118, "step": 17630 }, { "epoch": 1.0355759070095103, "grad_norm": 6.80527400970459, "learning_rate": 4.742999910428075e-05, "loss": 0.0304, "step": 17640 }, { "epoch": 1.0361629681812845, "grad_norm": 0.6219860315322876, "learning_rate": 4.73838179219209e-05, "loss": 0.012, "step": 17650 }, { "epoch": 1.0367500293530585, "grad_norm": 0.8430576920509338, "learning_rate": 4.7337638977394336e-05, "loss": 0.02, "step": 17660 }, { "epoch": 1.0373370905248327, "grad_norm": 0.3212287425994873, "learning_rate": 4.729146231020164e-05, "loss": 0.02, "step": 17670 }, { "epoch": 1.0379241516966067, "grad_norm": 0.26386991143226624, "learning_rate": 4.724528795984151e-05, "loss": 0.0237, "step": 17680 }, { "epoch": 1.0385112128683809, "grad_norm": 2.561908721923828, "learning_rate": 4.719911596581057e-05, "loss": 0.012, "step": 17690 }, { "epoch": 1.039098274040155, "grad_norm": 3.6018483638763428, "learning_rate": 4.715294636760352e-05, "loss": 0.0431, "step": 17700 }, { "epoch": 1.039685335211929, "grad_norm": 0.2698362171649933, "learning_rate": 4.7106779204712946e-05, "loss": 0.0194, "step": 17710 }, { "epoch": 1.0402723963837033, "grad_norm": 2.6462647914886475, "learning_rate": 4.7060614516629396e-05, "loss": 0.0156, "step": 17720 }, { "epoch": 1.0408594575554773, "grad_norm": 0.2950989902019501, "learning_rate": 4.701445234284127e-05, "loss": 0.0138, "step": 17730 }, { "epoch": 1.0414465187272515, "grad_norm": 0.05109265819191933, "learning_rate": 4.696829272283483e-05, "loss": 0.0188, "step": 17740 }, { "epoch": 1.0420335798990255, "grad_norm": 0.556533932685852, "learning_rate": 4.6922135696094175e-05, "loss": 0.0247, "step": 17750 }, { "epoch": 1.0426206410707997, "grad_norm": 1.077636480331421, "learning_rate": 4.687598130210112e-05, "loss": 0.0175, "step": 17760 }, { "epoch": 1.0432077022425736, "grad_norm": 0.15768545866012573, "learning_rate": 4.682982958033533e-05, "loss": 0.0181, "step": 17770 }, { "epoch": 1.0437947634143478, "grad_norm": 1.1106746196746826, "learning_rate": 4.678368057027407e-05, "loss": 0.0176, "step": 17780 }, { "epoch": 1.0443818245861218, "grad_norm": 6.201788425445557, "learning_rate": 4.6737534311392375e-05, "loss": 0.0443, "step": 17790 }, { "epoch": 1.044968885757896, "grad_norm": 0.5797479152679443, "learning_rate": 4.669139084316286e-05, "loss": 0.0117, "step": 17800 }, { "epoch": 1.04555594692967, "grad_norm": 0.9684267044067383, "learning_rate": 4.664525020505582e-05, "loss": 0.0053, "step": 17810 }, { "epoch": 1.0461430081014442, "grad_norm": 0.24567517638206482, "learning_rate": 4.6599112436539075e-05, "loss": 0.0103, "step": 17820 }, { "epoch": 1.0467300692732182, "grad_norm": 1.7975558042526245, "learning_rate": 4.6552977577078035e-05, "loss": 0.0229, "step": 17830 }, { "epoch": 1.0473171304449924, "grad_norm": 0.5624830722808838, "learning_rate": 4.6506845666135546e-05, "loss": 0.0191, "step": 17840 }, { "epoch": 1.0479041916167664, "grad_norm": 1.1604492664337158, "learning_rate": 4.646071674317204e-05, "loss": 0.0345, "step": 17850 }, { "epoch": 1.0484912527885406, "grad_norm": 4.161646366119385, "learning_rate": 4.6414590847645305e-05, "loss": 0.0133, "step": 17860 }, { "epoch": 1.0490783139603146, "grad_norm": 0.6629294157028198, "learning_rate": 4.636846801901056e-05, "loss": 0.02, "step": 17870 }, { "epoch": 1.0496653751320888, "grad_norm": 0.8780168294906616, "learning_rate": 4.632234829672045e-05, "loss": 0.0263, "step": 17880 }, { "epoch": 1.0502524363038628, "grad_norm": 0.35935521125793457, "learning_rate": 4.6276231720224885e-05, "loss": 0.0078, "step": 17890 }, { "epoch": 1.050839497475637, "grad_norm": 2.422109365463257, "learning_rate": 4.6230118328971156e-05, "loss": 0.0176, "step": 17900 }, { "epoch": 1.051426558647411, "grad_norm": 2.739208698272705, "learning_rate": 4.618400816240376e-05, "loss": 0.0168, "step": 17910 }, { "epoch": 1.0520136198191852, "grad_norm": 2.1523261070251465, "learning_rate": 4.613790125996451e-05, "loss": 0.0143, "step": 17920 }, { "epoch": 1.0526006809909592, "grad_norm": 2.3159255981445312, "learning_rate": 4.609179766109236e-05, "loss": 0.0228, "step": 17930 }, { "epoch": 1.0531877421627334, "grad_norm": 1.3906899690628052, "learning_rate": 4.604569740522349e-05, "loss": 0.0219, "step": 17940 }, { "epoch": 1.0537748033345073, "grad_norm": 1.316084623336792, "learning_rate": 4.599960053179117e-05, "loss": 0.0155, "step": 17950 }, { "epoch": 1.0543618645062816, "grad_norm": 0.3568131625652313, "learning_rate": 4.595350708022583e-05, "loss": 0.007, "step": 17960 }, { "epoch": 1.0549489256780558, "grad_norm": 1.3302022218704224, "learning_rate": 4.5907417089954926e-05, "loss": 0.0254, "step": 17970 }, { "epoch": 1.0555359868498297, "grad_norm": 1.3200385570526123, "learning_rate": 4.5861330600403e-05, "loss": 0.0158, "step": 17980 }, { "epoch": 1.056123048021604, "grad_norm": 0.4094598591327667, "learning_rate": 4.581524765099154e-05, "loss": 0.0094, "step": 17990 }, { "epoch": 1.056710109193378, "grad_norm": 2.4014627933502197, "learning_rate": 4.5769168281139066e-05, "loss": 0.0369, "step": 18000 }, { "epoch": 1.056710109193378, "eval_loss": 0.47405582666397095, "eval_runtime": 270.036, "eval_samples_per_second": 3.5, "eval_steps_per_second": 3.5, "step": 18000 }, { "epoch": 1.0572971703651521, "grad_norm": 1.9064300060272217, "learning_rate": 4.572309253026101e-05, "loss": 0.0233, "step": 18010 }, { "epoch": 1.0578842315369261, "grad_norm": 0.7512912154197693, "learning_rate": 4.56770204377697e-05, "loss": 0.0149, "step": 18020 }, { "epoch": 1.0584712927087003, "grad_norm": 1.0954493284225464, "learning_rate": 4.5630952043074356e-05, "loss": 0.0164, "step": 18030 }, { "epoch": 1.0590583538804743, "grad_norm": 2.786641836166382, "learning_rate": 4.5584887385581e-05, "loss": 0.0127, "step": 18040 }, { "epoch": 1.0596454150522485, "grad_norm": 0.6778393387794495, "learning_rate": 4.5538826504692496e-05, "loss": 0.0119, "step": 18050 }, { "epoch": 1.0602324762240225, "grad_norm": 0.5568864941596985, "learning_rate": 4.549276943980845e-05, "loss": 0.0242, "step": 18060 }, { "epoch": 1.0608195373957967, "grad_norm": 1.0595320463180542, "learning_rate": 4.544671623032522e-05, "loss": 0.0092, "step": 18070 }, { "epoch": 1.0614065985675707, "grad_norm": 2.627183437347412, "learning_rate": 4.540066691563587e-05, "loss": 0.0374, "step": 18080 }, { "epoch": 1.061993659739345, "grad_norm": 0.6018410921096802, "learning_rate": 4.535462153513012e-05, "loss": 0.0117, "step": 18090 }, { "epoch": 1.0625807209111189, "grad_norm": 5.123389720916748, "learning_rate": 4.53085801281943e-05, "loss": 0.0296, "step": 18100 }, { "epoch": 1.063167782082893, "grad_norm": 0.7179359197616577, "learning_rate": 4.526254273421143e-05, "loss": 0.0107, "step": 18110 }, { "epoch": 1.063754843254667, "grad_norm": 1.0431585311889648, "learning_rate": 4.521650939256097e-05, "loss": 0.0092, "step": 18120 }, { "epoch": 1.0643419044264413, "grad_norm": 0.39003250002861023, "learning_rate": 4.517048014261902e-05, "loss": 0.0153, "step": 18130 }, { "epoch": 1.0649289655982153, "grad_norm": 0.7012884616851807, "learning_rate": 4.512445502375813e-05, "loss": 0.0265, "step": 18140 }, { "epoch": 1.0655160267699895, "grad_norm": 0.6719775199890137, "learning_rate": 4.507843407534732e-05, "loss": 0.0186, "step": 18150 }, { "epoch": 1.0661030879417634, "grad_norm": 1.4437520503997803, "learning_rate": 4.503241733675207e-05, "loss": 0.0187, "step": 18160 }, { "epoch": 1.0666901491135377, "grad_norm": 0.06713113188743591, "learning_rate": 4.498640484733421e-05, "loss": 0.0077, "step": 18170 }, { "epoch": 1.0672772102853116, "grad_norm": 1.4248532056808472, "learning_rate": 4.494039664645201e-05, "loss": 0.0156, "step": 18180 }, { "epoch": 1.0678642714570858, "grad_norm": 1.0497246980667114, "learning_rate": 4.4894392773459957e-05, "loss": 0.0183, "step": 18190 }, { "epoch": 1.0684513326288598, "grad_norm": 0.09055526554584503, "learning_rate": 4.4848393267708974e-05, "loss": 0.0212, "step": 18200 }, { "epoch": 1.069038393800634, "grad_norm": 0.8578062653541565, "learning_rate": 4.480239816854613e-05, "loss": 0.017, "step": 18210 }, { "epoch": 1.069625454972408, "grad_norm": 3.037128448486328, "learning_rate": 4.4756407515314804e-05, "loss": 0.0211, "step": 18220 }, { "epoch": 1.0702125161441822, "grad_norm": 1.2302640676498413, "learning_rate": 4.471042134735451e-05, "loss": 0.0336, "step": 18230 }, { "epoch": 1.0707995773159564, "grad_norm": 0.2871870696544647, "learning_rate": 4.466443970400099e-05, "loss": 0.0163, "step": 18240 }, { "epoch": 1.0713866384877304, "grad_norm": 2.075404167175293, "learning_rate": 4.461846262458606e-05, "loss": 0.0251, "step": 18250 }, { "epoch": 1.0719736996595046, "grad_norm": 0.4061993658542633, "learning_rate": 4.4572490148437686e-05, "loss": 0.0196, "step": 18260 }, { "epoch": 1.0725607608312786, "grad_norm": 0.8557636737823486, "learning_rate": 4.452652231487982e-05, "loss": 0.0154, "step": 18270 }, { "epoch": 1.0731478220030528, "grad_norm": 1.7022781372070312, "learning_rate": 4.448055916323249e-05, "loss": 0.0145, "step": 18280 }, { "epoch": 1.0737348831748268, "grad_norm": 2.739203929901123, "learning_rate": 4.443460073281178e-05, "loss": 0.0201, "step": 18290 }, { "epoch": 1.074321944346601, "grad_norm": 1.0256034135818481, "learning_rate": 4.43886470629296e-05, "loss": 0.0132, "step": 18300 }, { "epoch": 1.074909005518375, "grad_norm": 3.473208427429199, "learning_rate": 4.4342698192893904e-05, "loss": 0.0394, "step": 18310 }, { "epoch": 1.0754960666901492, "grad_norm": 0.4178438186645508, "learning_rate": 4.429675416200848e-05, "loss": 0.024, "step": 18320 }, { "epoch": 1.0760831278619232, "grad_norm": 3.310688018798828, "learning_rate": 4.4250815009573e-05, "loss": 0.0253, "step": 18330 }, { "epoch": 1.0766701890336974, "grad_norm": 0.2001148760318756, "learning_rate": 4.420488077488295e-05, "loss": 0.0126, "step": 18340 }, { "epoch": 1.0772572502054714, "grad_norm": 2.2187912464141846, "learning_rate": 4.415895149722964e-05, "loss": 0.0104, "step": 18350 }, { "epoch": 1.0778443113772456, "grad_norm": 2.495615005493164, "learning_rate": 4.411302721590007e-05, "loss": 0.0277, "step": 18360 }, { "epoch": 1.0784313725490196, "grad_norm": 0.6757046580314636, "learning_rate": 4.406710797017706e-05, "loss": 0.0139, "step": 18370 }, { "epoch": 1.0790184337207938, "grad_norm": 0.36000552773475647, "learning_rate": 4.402119379933904e-05, "loss": 0.0142, "step": 18380 }, { "epoch": 1.0796054948925677, "grad_norm": 0.8996804356575012, "learning_rate": 4.3975284742660153e-05, "loss": 0.0202, "step": 18390 }, { "epoch": 1.080192556064342, "grad_norm": 3.5434324741363525, "learning_rate": 4.392938083941014e-05, "loss": 0.0135, "step": 18400 }, { "epoch": 1.080779617236116, "grad_norm": 2.7697722911834717, "learning_rate": 4.388348212885435e-05, "loss": 0.0162, "step": 18410 }, { "epoch": 1.0813666784078901, "grad_norm": 0.8480291962623596, "learning_rate": 4.383758865025368e-05, "loss": 0.0233, "step": 18420 }, { "epoch": 1.0819537395796641, "grad_norm": 1.0872409343719482, "learning_rate": 4.379170044286454e-05, "loss": 0.0133, "step": 18430 }, { "epoch": 1.0825408007514383, "grad_norm": 1.362554907798767, "learning_rate": 4.3745817545938874e-05, "loss": 0.0348, "step": 18440 }, { "epoch": 1.0831278619232123, "grad_norm": 1.4147248268127441, "learning_rate": 4.369993999872402e-05, "loss": 0.0202, "step": 18450 }, { "epoch": 1.0837149230949865, "grad_norm": 0.13291633129119873, "learning_rate": 4.365406784046282e-05, "loss": 0.0117, "step": 18460 }, { "epoch": 1.0843019842667605, "grad_norm": 2.1015400886535645, "learning_rate": 4.360820111039341e-05, "loss": 0.0218, "step": 18470 }, { "epoch": 1.0848890454385347, "grad_norm": 1.3264888525009155, "learning_rate": 4.3562339847749376e-05, "loss": 0.0163, "step": 18480 }, { "epoch": 1.085476106610309, "grad_norm": 1.4007258415222168, "learning_rate": 4.3516484091759545e-05, "loss": 0.0271, "step": 18490 }, { "epoch": 1.086063167782083, "grad_norm": 2.8127200603485107, "learning_rate": 4.347063388164812e-05, "loss": 0.0152, "step": 18500 }, { "epoch": 1.0866502289538569, "grad_norm": 2.462484121322632, "learning_rate": 4.342478925663447e-05, "loss": 0.0244, "step": 18510 }, { "epoch": 1.087237290125631, "grad_norm": 3.2977025508880615, "learning_rate": 4.3378950255933284e-05, "loss": 0.0219, "step": 18520 }, { "epoch": 1.0878243512974053, "grad_norm": 1.0323644876480103, "learning_rate": 4.333311691875433e-05, "loss": 0.0185, "step": 18530 }, { "epoch": 1.0884114124691793, "grad_norm": 1.8792959451675415, "learning_rate": 4.3287289284302615e-05, "loss": 0.0139, "step": 18540 }, { "epoch": 1.0889984736409535, "grad_norm": 1.6511178016662598, "learning_rate": 4.324146739177824e-05, "loss": 0.013, "step": 18550 }, { "epoch": 1.0895855348127275, "grad_norm": 0.5358900427818298, "learning_rate": 4.319565128037639e-05, "loss": 0.0096, "step": 18560 }, { "epoch": 1.0901725959845017, "grad_norm": 1.784927487373352, "learning_rate": 4.314984098928733e-05, "loss": 0.0195, "step": 18570 }, { "epoch": 1.0907596571562757, "grad_norm": 0.06436686217784882, "learning_rate": 4.3104036557696295e-05, "loss": 0.0111, "step": 18580 }, { "epoch": 1.0913467183280499, "grad_norm": 0.18714739382266998, "learning_rate": 4.305823802478357e-05, "loss": 0.0188, "step": 18590 }, { "epoch": 1.0919337794998238, "grad_norm": 0.02658461034297943, "learning_rate": 4.301244542972435e-05, "loss": 0.0149, "step": 18600 }, { "epoch": 1.092520840671598, "grad_norm": 1.0946741104125977, "learning_rate": 4.2966658811688785e-05, "loss": 0.0223, "step": 18610 }, { "epoch": 1.093107901843372, "grad_norm": 0.9635413885116577, "learning_rate": 4.292087820984185e-05, "loss": 0.0129, "step": 18620 }, { "epoch": 1.0936949630151462, "grad_norm": 3.726040840148926, "learning_rate": 4.287510366334346e-05, "loss": 0.0198, "step": 18630 }, { "epoch": 1.0942820241869202, "grad_norm": 0.013850445859134197, "learning_rate": 4.282933521134827e-05, "loss": 0.0145, "step": 18640 }, { "epoch": 1.0948690853586944, "grad_norm": 0.5245586633682251, "learning_rate": 4.2783572893005794e-05, "loss": 0.015, "step": 18650 }, { "epoch": 1.0954561465304684, "grad_norm": 4.834798812866211, "learning_rate": 4.273781674746023e-05, "loss": 0.029, "step": 18660 }, { "epoch": 1.0960432077022426, "grad_norm": 1.9716501235961914, "learning_rate": 4.269206681385058e-05, "loss": 0.0176, "step": 18670 }, { "epoch": 1.0966302688740166, "grad_norm": 2.1880879402160645, "learning_rate": 4.264632313131041e-05, "loss": 0.0271, "step": 18680 }, { "epoch": 1.0972173300457908, "grad_norm": 0.4904673993587494, "learning_rate": 4.260058573896809e-05, "loss": 0.0131, "step": 18690 }, { "epoch": 1.0978043912175648, "grad_norm": 0.6214790940284729, "learning_rate": 4.255485467594647e-05, "loss": 0.0093, "step": 18700 }, { "epoch": 1.098391452389339, "grad_norm": 3.5769858360290527, "learning_rate": 4.250912998136307e-05, "loss": 0.0158, "step": 18710 }, { "epoch": 1.098978513561113, "grad_norm": 0.420706570148468, "learning_rate": 4.246341169432994e-05, "loss": 0.0173, "step": 18720 }, { "epoch": 1.0995655747328872, "grad_norm": 1.8304451704025269, "learning_rate": 4.241769985395365e-05, "loss": 0.0141, "step": 18730 }, { "epoch": 1.1001526359046612, "grad_norm": 0.7487080693244934, "learning_rate": 4.2371994499335264e-05, "loss": 0.0258, "step": 18740 }, { "epoch": 1.1007396970764354, "grad_norm": 3.707828998565674, "learning_rate": 4.232629566957026e-05, "loss": 0.0309, "step": 18750 }, { "epoch": 1.1013267582482094, "grad_norm": 1.8079040050506592, "learning_rate": 4.2280603403748606e-05, "loss": 0.0222, "step": 18760 }, { "epoch": 1.1019138194199836, "grad_norm": 1.2301052808761597, "learning_rate": 4.223491774095455e-05, "loss": 0.0189, "step": 18770 }, { "epoch": 1.1025008805917578, "grad_norm": 0.06131039187312126, "learning_rate": 4.2189238720266826e-05, "loss": 0.0198, "step": 18780 }, { "epoch": 1.1030879417635318, "grad_norm": 2.1937789916992188, "learning_rate": 4.214356638075836e-05, "loss": 0.0255, "step": 18790 }, { "epoch": 1.103675002935306, "grad_norm": 0.10104241222143173, "learning_rate": 4.2097900761496445e-05, "loss": 0.0178, "step": 18800 }, { "epoch": 1.10426206410708, "grad_norm": 0.29033026099205017, "learning_rate": 4.2052241901542576e-05, "loss": 0.0362, "step": 18810 }, { "epoch": 1.1048491252788541, "grad_norm": 0.2741455137729645, "learning_rate": 4.2006589839952526e-05, "loss": 0.0122, "step": 18820 }, { "epoch": 1.1054361864506281, "grad_norm": 0.2224307656288147, "learning_rate": 4.1960944615776175e-05, "loss": 0.0264, "step": 18830 }, { "epoch": 1.1060232476224023, "grad_norm": 1.2905281782150269, "learning_rate": 4.191530626805762e-05, "loss": 0.023, "step": 18840 }, { "epoch": 1.1066103087941763, "grad_norm": 1.4105613231658936, "learning_rate": 4.186967483583505e-05, "loss": 0.0301, "step": 18850 }, { "epoch": 1.1071973699659505, "grad_norm": 2.536943197250366, "learning_rate": 4.1824050358140724e-05, "loss": 0.0128, "step": 18860 }, { "epoch": 1.1077844311377245, "grad_norm": 0.912386953830719, "learning_rate": 4.1778432874001006e-05, "loss": 0.0164, "step": 18870 }, { "epoch": 1.1083714923094987, "grad_norm": 0.1990130990743637, "learning_rate": 4.173282242243618e-05, "loss": 0.0131, "step": 18880 }, { "epoch": 1.1089585534812727, "grad_norm": 2.3780107498168945, "learning_rate": 4.168721904246063e-05, "loss": 0.0147, "step": 18890 }, { "epoch": 1.109545614653047, "grad_norm": 1.3942874670028687, "learning_rate": 4.164162277308259e-05, "loss": 0.0128, "step": 18900 }, { "epoch": 1.110132675824821, "grad_norm": 2.336909055709839, "learning_rate": 4.15960336533043e-05, "loss": 0.0181, "step": 18910 }, { "epoch": 1.110719736996595, "grad_norm": 0.11453723162412643, "learning_rate": 4.1550451722121806e-05, "loss": 0.0066, "step": 18920 }, { "epoch": 1.111306798168369, "grad_norm": 1.5155072212219238, "learning_rate": 4.1504877018525065e-05, "loss": 0.0143, "step": 18930 }, { "epoch": 1.1118938593401433, "grad_norm": 0.6730071306228638, "learning_rate": 4.14593095814978e-05, "loss": 0.019, "step": 18940 }, { "epoch": 1.1124809205119173, "grad_norm": 0.03255538269877434, "learning_rate": 4.141374945001758e-05, "loss": 0.0185, "step": 18950 }, { "epoch": 1.1130679816836915, "grad_norm": 1.301459789276123, "learning_rate": 4.136819666305566e-05, "loss": 0.0413, "step": 18960 }, { "epoch": 1.1136550428554655, "grad_norm": 2.044764518737793, "learning_rate": 4.1322651259577064e-05, "loss": 0.012, "step": 18970 }, { "epoch": 1.1142421040272397, "grad_norm": 2.8077080249786377, "learning_rate": 4.1277113278540456e-05, "loss": 0.0259, "step": 18980 }, { "epoch": 1.1148291651990136, "grad_norm": 3.5825212001800537, "learning_rate": 4.123158275889819e-05, "loss": 0.0199, "step": 18990 }, { "epoch": 1.1154162263707879, "grad_norm": 2.8341922760009766, "learning_rate": 4.118605973959623e-05, "loss": 0.0197, "step": 19000 }, { "epoch": 1.1160032875425618, "grad_norm": 1.1807790994644165, "learning_rate": 4.11405442595741e-05, "loss": 0.0169, "step": 19010 }, { "epoch": 1.116590348714336, "grad_norm": 1.8640202283859253, "learning_rate": 4.1095036357764915e-05, "loss": 0.0267, "step": 19020 }, { "epoch": 1.11717740988611, "grad_norm": 0.8677136301994324, "learning_rate": 4.104953607309524e-05, "loss": 0.0181, "step": 19030 }, { "epoch": 1.1177644710578842, "grad_norm": 3.1262636184692383, "learning_rate": 4.100404344448522e-05, "loss": 0.0204, "step": 19040 }, { "epoch": 1.1183515322296582, "grad_norm": 0.575782835483551, "learning_rate": 4.095855851084836e-05, "loss": 0.0185, "step": 19050 }, { "epoch": 1.1189385934014324, "grad_norm": 0.1899765133857727, "learning_rate": 4.091308131109165e-05, "loss": 0.0403, "step": 19060 }, { "epoch": 1.1195256545732066, "grad_norm": 0.6368929743766785, "learning_rate": 4.086761188411541e-05, "loss": 0.0138, "step": 19070 }, { "epoch": 1.1201127157449806, "grad_norm": 1.2761796712875366, "learning_rate": 4.082215026881337e-05, "loss": 0.0144, "step": 19080 }, { "epoch": 1.1206997769167548, "grad_norm": 1.0666402578353882, "learning_rate": 4.0776696504072506e-05, "loss": 0.0136, "step": 19090 }, { "epoch": 1.1212868380885288, "grad_norm": 1.846991777420044, "learning_rate": 4.073125062877317e-05, "loss": 0.0178, "step": 19100 }, { "epoch": 1.121873899260303, "grad_norm": 1.860212802886963, "learning_rate": 4.068581268178886e-05, "loss": 0.019, "step": 19110 }, { "epoch": 1.122460960432077, "grad_norm": 0.2708970010280609, "learning_rate": 4.064038270198638e-05, "loss": 0.0162, "step": 19120 }, { "epoch": 1.1230480216038512, "grad_norm": 2.09175181388855, "learning_rate": 4.05949607282257e-05, "loss": 0.0248, "step": 19130 }, { "epoch": 1.1236350827756252, "grad_norm": 2.0685741901397705, "learning_rate": 4.054954679935988e-05, "loss": 0.0124, "step": 19140 }, { "epoch": 1.1242221439473994, "grad_norm": 0.9099451303482056, "learning_rate": 4.05041409542352e-05, "loss": 0.0256, "step": 19150 }, { "epoch": 1.1248092051191734, "grad_norm": 2.0168309211730957, "learning_rate": 4.0458743231690925e-05, "loss": 0.0166, "step": 19160 }, { "epoch": 1.1253962662909476, "grad_norm": 1.3179303407669067, "learning_rate": 4.041335367055945e-05, "loss": 0.0147, "step": 19170 }, { "epoch": 1.1259833274627216, "grad_norm": 0.3907220661640167, "learning_rate": 4.0367972309666145e-05, "loss": 0.0205, "step": 19180 }, { "epoch": 1.1265703886344958, "grad_norm": 0.7503154873847961, "learning_rate": 4.03225991878294e-05, "loss": 0.0101, "step": 19190 }, { "epoch": 1.1271574498062698, "grad_norm": 0.5562353730201721, "learning_rate": 4.027723434386049e-05, "loss": 0.0286, "step": 19200 }, { "epoch": 1.127744510978044, "grad_norm": 0.17754141986370087, "learning_rate": 4.0231877816563695e-05, "loss": 0.0204, "step": 19210 }, { "epoch": 1.128331572149818, "grad_norm": 2.8177473545074463, "learning_rate": 4.0186529644736114e-05, "loss": 0.0159, "step": 19220 }, { "epoch": 1.1289186333215921, "grad_norm": 0.18983376026153564, "learning_rate": 4.014118986716776e-05, "loss": 0.0174, "step": 19230 }, { "epoch": 1.1295056944933661, "grad_norm": 2.237233877182007, "learning_rate": 4.0095858522641394e-05, "loss": 0.0114, "step": 19240 }, { "epoch": 1.1300927556651403, "grad_norm": 1.1644716262817383, "learning_rate": 4.005053564993261e-05, "loss": 0.0343, "step": 19250 }, { "epoch": 1.1306798168369143, "grad_norm": 1.384375810623169, "learning_rate": 4.000522128780978e-05, "loss": 0.0301, "step": 19260 }, { "epoch": 1.1312668780086885, "grad_norm": 2.5286264419555664, "learning_rate": 3.995991547503392e-05, "loss": 0.0322, "step": 19270 }, { "epoch": 1.1318539391804625, "grad_norm": 2.0052082538604736, "learning_rate": 3.991461825035882e-05, "loss": 0.0263, "step": 19280 }, { "epoch": 1.1324410003522367, "grad_norm": 1.8916563987731934, "learning_rate": 3.986932965253081e-05, "loss": 0.0243, "step": 19290 }, { "epoch": 1.1330280615240107, "grad_norm": 0.11359672248363495, "learning_rate": 3.9824049720289e-05, "loss": 0.0131, "step": 19300 }, { "epoch": 1.133615122695785, "grad_norm": 2.318324565887451, "learning_rate": 3.9778778492364924e-05, "loss": 0.0482, "step": 19310 }, { "epoch": 1.134202183867559, "grad_norm": 0.8235112428665161, "learning_rate": 3.973351600748278e-05, "loss": 0.0125, "step": 19320 }, { "epoch": 1.134789245039333, "grad_norm": 0.7303930521011353, "learning_rate": 3.968826230435923e-05, "loss": 0.0175, "step": 19330 }, { "epoch": 1.135376306211107, "grad_norm": 1.6614584922790527, "learning_rate": 3.964301742170349e-05, "loss": 0.0142, "step": 19340 }, { "epoch": 1.1359633673828813, "grad_norm": 0.1857631355524063, "learning_rate": 3.9597781398217135e-05, "loss": 0.0256, "step": 19350 }, { "epoch": 1.1365504285546555, "grad_norm": 1.6671332120895386, "learning_rate": 3.9552554272594256e-05, "loss": 0.0181, "step": 19360 }, { "epoch": 1.1371374897264295, "grad_norm": 0.8466367721557617, "learning_rate": 3.9507336083521256e-05, "loss": 0.0166, "step": 19370 }, { "epoch": 1.1377245508982037, "grad_norm": 3.6741080284118652, "learning_rate": 3.946212686967696e-05, "loss": 0.0116, "step": 19380 }, { "epoch": 1.1383116120699777, "grad_norm": 0.9576346278190613, "learning_rate": 3.9416926669732454e-05, "loss": 0.0216, "step": 19390 }, { "epoch": 1.1388986732417519, "grad_norm": 2.505992889404297, "learning_rate": 3.937173552235117e-05, "loss": 0.0231, "step": 19400 }, { "epoch": 1.1394857344135259, "grad_norm": 1.0942327976226807, "learning_rate": 3.932655346618876e-05, "loss": 0.0328, "step": 19410 }, { "epoch": 1.1400727955853, "grad_norm": 0.9428576231002808, "learning_rate": 3.9281380539893114e-05, "loss": 0.0133, "step": 19420 }, { "epoch": 1.140659856757074, "grad_norm": 1.203651785850525, "learning_rate": 3.923621678210432e-05, "loss": 0.023, "step": 19430 }, { "epoch": 1.1412469179288482, "grad_norm": 0.5374916791915894, "learning_rate": 3.9191062231454586e-05, "loss": 0.0138, "step": 19440 }, { "epoch": 1.1418339791006222, "grad_norm": 0.11401000618934631, "learning_rate": 3.914591692656831e-05, "loss": 0.0147, "step": 19450 }, { "epoch": 1.1424210402723964, "grad_norm": 2.2636830806732178, "learning_rate": 3.9100780906061896e-05, "loss": 0.0159, "step": 19460 }, { "epoch": 1.1430081014441704, "grad_norm": 1.200324296951294, "learning_rate": 3.905565420854388e-05, "loss": 0.0391, "step": 19470 }, { "epoch": 1.1435951626159446, "grad_norm": 2.1873414516448975, "learning_rate": 3.901053687261479e-05, "loss": 0.0157, "step": 19480 }, { "epoch": 1.1441822237877186, "grad_norm": 0.5097103714942932, "learning_rate": 3.896542893686716e-05, "loss": 0.0149, "step": 19490 }, { "epoch": 1.1447692849594928, "grad_norm": 1.5123475790023804, "learning_rate": 3.892033043988547e-05, "loss": 0.0186, "step": 19500 }, { "epoch": 1.1453563461312668, "grad_norm": 0.18340526521205902, "learning_rate": 3.887524142024614e-05, "loss": 0.0209, "step": 19510 }, { "epoch": 1.145943407303041, "grad_norm": 3.1682090759277344, "learning_rate": 3.883016191651744e-05, "loss": 0.0201, "step": 19520 }, { "epoch": 1.146530468474815, "grad_norm": 1.0965871810913086, "learning_rate": 3.878509196725957e-05, "loss": 0.0224, "step": 19530 }, { "epoch": 1.1471175296465892, "grad_norm": 0.11458544433116913, "learning_rate": 3.874003161102453e-05, "loss": 0.0168, "step": 19540 }, { "epoch": 1.1477045908183632, "grad_norm": 1.2999083995819092, "learning_rate": 3.869498088635608e-05, "loss": 0.031, "step": 19550 }, { "epoch": 1.1482916519901374, "grad_norm": 0.534529983997345, "learning_rate": 3.864993983178978e-05, "loss": 0.0234, "step": 19560 }, { "epoch": 1.1488787131619116, "grad_norm": 0.2957916855812073, "learning_rate": 3.860490848585291e-05, "loss": 0.0212, "step": 19570 }, { "epoch": 1.1494657743336856, "grad_norm": 0.7371472120285034, "learning_rate": 3.8559886887064434e-05, "loss": 0.0077, "step": 19580 }, { "epoch": 1.1500528355054596, "grad_norm": 1.357395887374878, "learning_rate": 3.851487507393498e-05, "loss": 0.0274, "step": 19590 }, { "epoch": 1.1506398966772338, "grad_norm": 0.6381208300590515, "learning_rate": 3.846987308496686e-05, "loss": 0.0219, "step": 19600 }, { "epoch": 1.151226957849008, "grad_norm": 1.3650586605072021, "learning_rate": 3.8424880958653855e-05, "loss": 0.0252, "step": 19610 }, { "epoch": 1.151814019020782, "grad_norm": 0.9871792197227478, "learning_rate": 3.8379898733481455e-05, "loss": 0.0127, "step": 19620 }, { "epoch": 1.1524010801925562, "grad_norm": 2.2490391731262207, "learning_rate": 3.8334926447926576e-05, "loss": 0.0209, "step": 19630 }, { "epoch": 1.1529881413643301, "grad_norm": 0.5458321571350098, "learning_rate": 3.82899641404577e-05, "loss": 0.0158, "step": 19640 }, { "epoch": 1.1535752025361043, "grad_norm": 1.415770411491394, "learning_rate": 3.8245011849534724e-05, "loss": 0.0134, "step": 19650 }, { "epoch": 1.1541622637078783, "grad_norm": 1.3493916988372803, "learning_rate": 3.820006961360901e-05, "loss": 0.0123, "step": 19660 }, { "epoch": 1.1547493248796525, "grad_norm": 1.5763715505599976, "learning_rate": 3.8155137471123294e-05, "loss": 0.0262, "step": 19670 }, { "epoch": 1.1553363860514265, "grad_norm": 1.3639353513717651, "learning_rate": 3.8110215460511696e-05, "loss": 0.0311, "step": 19680 }, { "epoch": 1.1559234472232007, "grad_norm": 4.087533473968506, "learning_rate": 3.806530362019969e-05, "loss": 0.0336, "step": 19690 }, { "epoch": 1.1565105083949747, "grad_norm": 0.9178034663200378, "learning_rate": 3.802040198860397e-05, "loss": 0.017, "step": 19700 }, { "epoch": 1.157097569566749, "grad_norm": 0.9727124571800232, "learning_rate": 3.7975510604132626e-05, "loss": 0.0275, "step": 19710 }, { "epoch": 1.157684630738523, "grad_norm": 0.1609918177127838, "learning_rate": 3.793062950518484e-05, "loss": 0.0231, "step": 19720 }, { "epoch": 1.158271691910297, "grad_norm": 1.1793239116668701, "learning_rate": 3.788575873015111e-05, "loss": 0.0201, "step": 19730 }, { "epoch": 1.158858753082071, "grad_norm": 1.8278889656066895, "learning_rate": 3.7840898317413034e-05, "loss": 0.0196, "step": 19740 }, { "epoch": 1.1594458142538453, "grad_norm": 1.4205490350723267, "learning_rate": 3.7796048305343383e-05, "loss": 0.0151, "step": 19750 }, { "epoch": 1.1600328754256193, "grad_norm": 1.1149266958236694, "learning_rate": 3.7751208732306015e-05, "loss": 0.0153, "step": 19760 }, { "epoch": 1.1606199365973935, "grad_norm": 0.7259837985038757, "learning_rate": 3.770637963665589e-05, "loss": 0.0202, "step": 19770 }, { "epoch": 1.1612069977691675, "grad_norm": 2.9467718601226807, "learning_rate": 3.766156105673891e-05, "loss": 0.0214, "step": 19780 }, { "epoch": 1.1617940589409417, "grad_norm": 2.441718816757202, "learning_rate": 3.761675303089213e-05, "loss": 0.0264, "step": 19790 }, { "epoch": 1.1623811201127157, "grad_norm": 0.11856356263160706, "learning_rate": 3.757195559744345e-05, "loss": 0.0172, "step": 19800 }, { "epoch": 1.1629681812844899, "grad_norm": 1.5592923164367676, "learning_rate": 3.7527168794711764e-05, "loss": 0.0183, "step": 19810 }, { "epoch": 1.1635552424562638, "grad_norm": 0.534089207649231, "learning_rate": 3.748239266100689e-05, "loss": 0.0185, "step": 19820 }, { "epoch": 1.164142303628038, "grad_norm": 0.6166985630989075, "learning_rate": 3.7437627234629464e-05, "loss": 0.0205, "step": 19830 }, { "epoch": 1.164729364799812, "grad_norm": 0.7895412445068359, "learning_rate": 3.7392872553871025e-05, "loss": 0.0054, "step": 19840 }, { "epoch": 1.1653164259715862, "grad_norm": 1.0005983114242554, "learning_rate": 3.7348128657013864e-05, "loss": 0.0176, "step": 19850 }, { "epoch": 1.1659034871433605, "grad_norm": 0.9038300514221191, "learning_rate": 3.730339558233111e-05, "loss": 0.0188, "step": 19860 }, { "epoch": 1.1664905483151344, "grad_norm": 0.28302982449531555, "learning_rate": 3.7258673368086545e-05, "loss": 0.0106, "step": 19870 }, { "epoch": 1.1670776094869084, "grad_norm": 0.9288384318351746, "learning_rate": 3.721396205253478e-05, "loss": 0.0166, "step": 19880 }, { "epoch": 1.1676646706586826, "grad_norm": 1.2959872484207153, "learning_rate": 3.716926167392098e-05, "loss": 0.0098, "step": 19890 }, { "epoch": 1.1682517318304568, "grad_norm": 1.1480991840362549, "learning_rate": 3.7124572270481056e-05, "loss": 0.0202, "step": 19900 }, { "epoch": 1.1688387930022308, "grad_norm": 4.931199073791504, "learning_rate": 3.707989388044146e-05, "loss": 0.0165, "step": 19910 }, { "epoch": 1.169425854174005, "grad_norm": 0.030009958893060684, "learning_rate": 3.7035226542019275e-05, "loss": 0.0259, "step": 19920 }, { "epoch": 1.170012915345779, "grad_norm": 0.5939013957977295, "learning_rate": 3.699057029342209e-05, "loss": 0.0216, "step": 19930 }, { "epoch": 1.1705999765175532, "grad_norm": 1.6112911701202393, "learning_rate": 3.6945925172848054e-05, "loss": 0.0121, "step": 19940 }, { "epoch": 1.1711870376893272, "grad_norm": 0.5483613014221191, "learning_rate": 3.6901291218485725e-05, "loss": 0.0138, "step": 19950 }, { "epoch": 1.1717740988611014, "grad_norm": 1.590866208076477, "learning_rate": 3.685666846851417e-05, "loss": 0.0277, "step": 19960 }, { "epoch": 1.1723611600328754, "grad_norm": 1.124561071395874, "learning_rate": 3.6812056961102894e-05, "loss": 0.018, "step": 19970 }, { "epoch": 1.1729482212046496, "grad_norm": 1.2572624683380127, "learning_rate": 3.67674567344117e-05, "loss": 0.0203, "step": 19980 }, { "epoch": 1.1735352823764236, "grad_norm": 1.9352120161056519, "learning_rate": 3.672286782659081e-05, "loss": 0.0166, "step": 19990 }, { "epoch": 1.1741223435481978, "grad_norm": 1.4766746759414673, "learning_rate": 3.6678290275780724e-05, "loss": 0.0176, "step": 20000 }, { "epoch": 1.1747094047199718, "grad_norm": 1.764749526977539, "learning_rate": 3.6633724120112274e-05, "loss": 0.0145, "step": 20010 }, { "epoch": 1.175296465891746, "grad_norm": 3.0195517539978027, "learning_rate": 3.658916939770649e-05, "loss": 0.0238, "step": 20020 }, { "epoch": 1.17588352706352, "grad_norm": 1.7537109851837158, "learning_rate": 3.6544626146674685e-05, "loss": 0.0095, "step": 20030 }, { "epoch": 1.1764705882352942, "grad_norm": 0.3347777724266052, "learning_rate": 3.650009440511828e-05, "loss": 0.0118, "step": 20040 }, { "epoch": 1.1770576494070681, "grad_norm": 2.6815648078918457, "learning_rate": 3.645557421112893e-05, "loss": 0.0143, "step": 20050 }, { "epoch": 1.1776447105788423, "grad_norm": 0.5392882823944092, "learning_rate": 3.641106560278834e-05, "loss": 0.0136, "step": 20060 }, { "epoch": 1.1782317717506163, "grad_norm": 0.43846121430397034, "learning_rate": 3.636656861816838e-05, "loss": 0.0186, "step": 20070 }, { "epoch": 1.1788188329223905, "grad_norm": 0.09271737188100815, "learning_rate": 3.632208329533092e-05, "loss": 0.022, "step": 20080 }, { "epoch": 1.1794058940941645, "grad_norm": 1.203433632850647, "learning_rate": 3.627760967232788e-05, "loss": 0.0114, "step": 20090 }, { "epoch": 1.1799929552659387, "grad_norm": 1.8334535360336304, "learning_rate": 3.6233147787201175e-05, "loss": 0.0196, "step": 20100 }, { "epoch": 1.180580016437713, "grad_norm": 1.0618259906768799, "learning_rate": 3.618869767798263e-05, "loss": 0.0185, "step": 20110 }, { "epoch": 1.181167077609487, "grad_norm": 2.4536964893341064, "learning_rate": 3.6144259382694114e-05, "loss": 0.0179, "step": 20120 }, { "epoch": 1.181754138781261, "grad_norm": 0.7509942650794983, "learning_rate": 3.6099832939347237e-05, "loss": 0.0153, "step": 20130 }, { "epoch": 1.182341199953035, "grad_norm": 1.69878089427948, "learning_rate": 3.605541838594359e-05, "loss": 0.0168, "step": 20140 }, { "epoch": 1.1829282611248093, "grad_norm": 2.670074701309204, "learning_rate": 3.6011015760474534e-05, "loss": 0.0124, "step": 20150 }, { "epoch": 1.1835153222965833, "grad_norm": 1.557386875152588, "learning_rate": 3.596662510092126e-05, "loss": 0.0234, "step": 20160 }, { "epoch": 1.1841023834683573, "grad_norm": 4.631673812866211, "learning_rate": 3.5922246445254706e-05, "loss": 0.0338, "step": 20170 }, { "epoch": 1.1846894446401315, "grad_norm": 2.1271257400512695, "learning_rate": 3.587787983143554e-05, "loss": 0.0265, "step": 20180 }, { "epoch": 1.1852765058119057, "grad_norm": 2.2365972995758057, "learning_rate": 3.583352529741413e-05, "loss": 0.0175, "step": 20190 }, { "epoch": 1.1858635669836797, "grad_norm": 0.07313284277915955, "learning_rate": 3.578918288113055e-05, "loss": 0.0369, "step": 20200 }, { "epoch": 1.1864506281554539, "grad_norm": 1.7829078435897827, "learning_rate": 3.5744852620514415e-05, "loss": 0.0167, "step": 20210 }, { "epoch": 1.1870376893272279, "grad_norm": 0.19565449655056, "learning_rate": 3.570053455348502e-05, "loss": 0.0091, "step": 20220 }, { "epoch": 1.187624750499002, "grad_norm": 0.5372190475463867, "learning_rate": 3.565622871795127e-05, "loss": 0.0149, "step": 20230 }, { "epoch": 1.188211811670776, "grad_norm": 3.2339234352111816, "learning_rate": 3.561193515181147e-05, "loss": 0.024, "step": 20240 }, { "epoch": 1.1887988728425503, "grad_norm": 0.9953827261924744, "learning_rate": 3.5567653892953564e-05, "loss": 0.0235, "step": 20250 }, { "epoch": 1.1893859340143242, "grad_norm": 0.4738207459449768, "learning_rate": 3.552338497925488e-05, "loss": 0.0253, "step": 20260 }, { "epoch": 1.1899729951860984, "grad_norm": 1.1432874202728271, "learning_rate": 3.5479128448582246e-05, "loss": 0.017, "step": 20270 }, { "epoch": 1.1905600563578724, "grad_norm": 2.495492458343506, "learning_rate": 3.543488433879184e-05, "loss": 0.0367, "step": 20280 }, { "epoch": 1.1911471175296466, "grad_norm": 3.612032175064087, "learning_rate": 3.539065268772929e-05, "loss": 0.0245, "step": 20290 }, { "epoch": 1.1917341787014206, "grad_norm": 1.7298696041107178, "learning_rate": 3.5346433533229474e-05, "loss": 0.0122, "step": 20300 }, { "epoch": 1.1923212398731948, "grad_norm": 1.522752285003662, "learning_rate": 3.530222691311666e-05, "loss": 0.0081, "step": 20310 }, { "epoch": 1.1929083010449688, "grad_norm": 4.213247776031494, "learning_rate": 3.525803286520437e-05, "loss": 0.0195, "step": 20320 }, { "epoch": 1.193495362216743, "grad_norm": 0.8256901502609253, "learning_rate": 3.521385142729535e-05, "loss": 0.0259, "step": 20330 }, { "epoch": 1.194082423388517, "grad_norm": 1.6951062679290771, "learning_rate": 3.516968263718159e-05, "loss": 0.0201, "step": 20340 }, { "epoch": 1.1946694845602912, "grad_norm": 0.9061385989189148, "learning_rate": 3.512552653264425e-05, "loss": 0.0105, "step": 20350 }, { "epoch": 1.1952565457320652, "grad_norm": 1.230067253112793, "learning_rate": 3.5081383151453604e-05, "loss": 0.0186, "step": 20360 }, { "epoch": 1.1958436069038394, "grad_norm": 1.266953468322754, "learning_rate": 3.5037252531369104e-05, "loss": 0.0238, "step": 20370 }, { "epoch": 1.1964306680756134, "grad_norm": 0.3283993601799011, "learning_rate": 3.499313471013928e-05, "loss": 0.0053, "step": 20380 }, { "epoch": 1.1970177292473876, "grad_norm": 6.5485310554504395, "learning_rate": 3.494902972550165e-05, "loss": 0.0243, "step": 20390 }, { "epoch": 1.1976047904191618, "grad_norm": 3.0127108097076416, "learning_rate": 3.490493761518281e-05, "loss": 0.0376, "step": 20400 }, { "epoch": 1.1981918515909358, "grad_norm": 5.080477714538574, "learning_rate": 3.486085841689832e-05, "loss": 0.0154, "step": 20410 }, { "epoch": 1.1987789127627098, "grad_norm": 0.7297155857086182, "learning_rate": 3.481679216835273e-05, "loss": 0.0341, "step": 20420 }, { "epoch": 1.199365973934484, "grad_norm": 2.8755810260772705, "learning_rate": 3.477273890723944e-05, "loss": 0.0321, "step": 20430 }, { "epoch": 1.1999530351062582, "grad_norm": 1.0556422472000122, "learning_rate": 3.4728698671240854e-05, "loss": 0.019, "step": 20440 }, { "epoch": 1.2005400962780322, "grad_norm": 3.1558666229248047, "learning_rate": 3.468467149802808e-05, "loss": 0.033, "step": 20450 }, { "epoch": 1.2011271574498064, "grad_norm": 2.6377081871032715, "learning_rate": 3.4640657425261224e-05, "loss": 0.0127, "step": 20460 }, { "epoch": 1.2017142186215803, "grad_norm": 2.073103666305542, "learning_rate": 3.459665649058904e-05, "loss": 0.0091, "step": 20470 }, { "epoch": 1.2023012797933545, "grad_norm": 0.430960476398468, "learning_rate": 3.455266873164914e-05, "loss": 0.0167, "step": 20480 }, { "epoch": 1.2028883409651285, "grad_norm": 1.621891975402832, "learning_rate": 3.45086941860678e-05, "loss": 0.019, "step": 20490 }, { "epoch": 1.2034754021369027, "grad_norm": 1.142160177230835, "learning_rate": 3.446473289146006e-05, "loss": 0.0089, "step": 20500 }, { "epoch": 1.2040624633086767, "grad_norm": 1.7501323223114014, "learning_rate": 3.442078488542957e-05, "loss": 0.0227, "step": 20510 }, { "epoch": 1.204649524480451, "grad_norm": 0.16087234020233154, "learning_rate": 3.437685020556864e-05, "loss": 0.009, "step": 20520 }, { "epoch": 1.205236585652225, "grad_norm": 0.8386476039886475, "learning_rate": 3.433292888945818e-05, "loss": 0.0155, "step": 20530 }, { "epoch": 1.2058236468239991, "grad_norm": 0.21038372814655304, "learning_rate": 3.428902097466764e-05, "loss": 0.0122, "step": 20540 }, { "epoch": 1.206410707995773, "grad_norm": 2.1462059020996094, "learning_rate": 3.424512649875506e-05, "loss": 0.0121, "step": 20550 }, { "epoch": 1.2069977691675473, "grad_norm": 3.1834864616394043, "learning_rate": 3.420124549926693e-05, "loss": 0.0188, "step": 20560 }, { "epoch": 1.2075848303393213, "grad_norm": 0.31415820121765137, "learning_rate": 3.4157378013738264e-05, "loss": 0.0207, "step": 20570 }, { "epoch": 1.2081718915110955, "grad_norm": 1.479813575744629, "learning_rate": 3.411352407969245e-05, "loss": 0.012, "step": 20580 }, { "epoch": 1.2087589526828695, "grad_norm": 3.589937925338745, "learning_rate": 3.406968373464137e-05, "loss": 0.0308, "step": 20590 }, { "epoch": 1.2093460138546437, "grad_norm": 1.240348219871521, "learning_rate": 3.402585701608519e-05, "loss": 0.0171, "step": 20600 }, { "epoch": 1.2099330750264177, "grad_norm": 0.276764452457428, "learning_rate": 3.398204396151251e-05, "loss": 0.0141, "step": 20610 }, { "epoch": 1.2105201361981919, "grad_norm": 1.3588885068893433, "learning_rate": 3.3938244608400164e-05, "loss": 0.0204, "step": 20620 }, { "epoch": 1.2111071973699659, "grad_norm": 4.115050792694092, "learning_rate": 3.389445899421332e-05, "loss": 0.0181, "step": 20630 }, { "epoch": 1.21169425854174, "grad_norm": 0.29594939947128296, "learning_rate": 3.385068715640536e-05, "loss": 0.0135, "step": 20640 }, { "epoch": 1.2122813197135143, "grad_norm": 0.11409210413694382, "learning_rate": 3.380692913241791e-05, "loss": 0.0214, "step": 20650 }, { "epoch": 1.2128683808852883, "grad_norm": 0.31874606013298035, "learning_rate": 3.376318495968076e-05, "loss": 0.0292, "step": 20660 }, { "epoch": 1.2134554420570622, "grad_norm": 1.7642158269882202, "learning_rate": 3.371945467561186e-05, "loss": 0.0362, "step": 20670 }, { "epoch": 1.2140425032288364, "grad_norm": 0.44985368847846985, "learning_rate": 3.367573831761728e-05, "loss": 0.016, "step": 20680 }, { "epoch": 1.2146295644006107, "grad_norm": 1.4707359075546265, "learning_rate": 3.363203592309117e-05, "loss": 0.0134, "step": 20690 }, { "epoch": 1.2152166255723846, "grad_norm": 0.029591698199510574, "learning_rate": 3.358834752941576e-05, "loss": 0.0106, "step": 20700 }, { "epoch": 1.2158036867441586, "grad_norm": 0.5049862265586853, "learning_rate": 3.354467317396124e-05, "loss": 0.0278, "step": 20710 }, { "epoch": 1.2163907479159328, "grad_norm": 1.4027550220489502, "learning_rate": 3.35010128940859e-05, "loss": 0.0331, "step": 20720 }, { "epoch": 1.216977809087707, "grad_norm": 0.664897084236145, "learning_rate": 3.345736672713588e-05, "loss": 0.0113, "step": 20730 }, { "epoch": 1.217564870259481, "grad_norm": 1.8764830827713013, "learning_rate": 3.341373471044531e-05, "loss": 0.0154, "step": 20740 }, { "epoch": 1.2181519314312552, "grad_norm": 2.603501081466675, "learning_rate": 3.33701168813362e-05, "loss": 0.0093, "step": 20750 }, { "epoch": 1.2187389926030292, "grad_norm": 0.622331440448761, "learning_rate": 3.3326513277118446e-05, "loss": 0.0207, "step": 20760 }, { "epoch": 1.2193260537748034, "grad_norm": 2.407594919204712, "learning_rate": 3.328292393508972e-05, "loss": 0.0255, "step": 20770 }, { "epoch": 1.2199131149465774, "grad_norm": 0.8337243795394897, "learning_rate": 3.323934889253556e-05, "loss": 0.0124, "step": 20780 }, { "epoch": 1.2205001761183516, "grad_norm": 1.6804401874542236, "learning_rate": 3.3195788186729245e-05, "loss": 0.0137, "step": 20790 }, { "epoch": 1.2210872372901256, "grad_norm": 5.784061908721924, "learning_rate": 3.315224185493176e-05, "loss": 0.0153, "step": 20800 }, { "epoch": 1.2216742984618998, "grad_norm": 3.385096311569214, "learning_rate": 3.310870993439187e-05, "loss": 0.0142, "step": 20810 }, { "epoch": 1.2222613596336738, "grad_norm": 1.1386430263519287, "learning_rate": 3.3065192462345915e-05, "loss": 0.0124, "step": 20820 }, { "epoch": 1.222848420805448, "grad_norm": 0.872481107711792, "learning_rate": 3.302168947601797e-05, "loss": 0.0105, "step": 20830 }, { "epoch": 1.223435481977222, "grad_norm": 3.004664897918701, "learning_rate": 3.297820101261964e-05, "loss": 0.0102, "step": 20840 }, { "epoch": 1.2240225431489962, "grad_norm": 0.5233016610145569, "learning_rate": 3.293472710935017e-05, "loss": 0.0112, "step": 20850 }, { "epoch": 1.2246096043207702, "grad_norm": 0.3687719702720642, "learning_rate": 3.289126780339631e-05, "loss": 0.0169, "step": 20860 }, { "epoch": 1.2251966654925444, "grad_norm": 1.0242693424224854, "learning_rate": 3.2847823131932365e-05, "loss": 0.0221, "step": 20870 }, { "epoch": 1.2257837266643183, "grad_norm": 1.5213781595230103, "learning_rate": 3.280439313212006e-05, "loss": 0.0084, "step": 20880 }, { "epoch": 1.2263707878360925, "grad_norm": 3.757567882537842, "learning_rate": 3.276097784110862e-05, "loss": 0.0273, "step": 20890 }, { "epoch": 1.2269578490078665, "grad_norm": 1.1027705669403076, "learning_rate": 3.271757729603467e-05, "loss": 0.0175, "step": 20900 }, { "epoch": 1.2275449101796407, "grad_norm": 0.06984422355890274, "learning_rate": 3.267419153402225e-05, "loss": 0.016, "step": 20910 }, { "epoch": 1.2281319713514147, "grad_norm": 0.2697144150733948, "learning_rate": 3.2630820592182696e-05, "loss": 0.0353, "step": 20920 }, { "epoch": 1.228719032523189, "grad_norm": 2.341513156890869, "learning_rate": 3.258746450761471e-05, "loss": 0.0103, "step": 20930 }, { "epoch": 1.2293060936949631, "grad_norm": 2.067079782485962, "learning_rate": 3.25441233174043e-05, "loss": 0.017, "step": 20940 }, { "epoch": 1.2298931548667371, "grad_norm": 0.19684366881847382, "learning_rate": 3.250079705862468e-05, "loss": 0.0151, "step": 20950 }, { "epoch": 1.230480216038511, "grad_norm": 0.21965119242668152, "learning_rate": 3.245748576833636e-05, "loss": 0.0333, "step": 20960 }, { "epoch": 1.2310672772102853, "grad_norm": 3.70943021774292, "learning_rate": 3.241418948358696e-05, "loss": 0.0145, "step": 20970 }, { "epoch": 1.2316543383820595, "grad_norm": 0.9915881752967834, "learning_rate": 3.237090824141134e-05, "loss": 0.0093, "step": 20980 }, { "epoch": 1.2322413995538335, "grad_norm": 0.19427676498889923, "learning_rate": 3.2327642078831466e-05, "loss": 0.0119, "step": 20990 }, { "epoch": 1.2328284607256077, "grad_norm": 2.175851821899414, "learning_rate": 3.228439103285641e-05, "loss": 0.0213, "step": 21000 }, { "epoch": 1.2328284607256077, "eval_loss": 0.4875093698501587, "eval_runtime": 269.7127, "eval_samples_per_second": 3.504, "eval_steps_per_second": 3.504, "step": 21000 }, { "epoch": 1.2334155218973817, "grad_norm": 1.7164874076843262, "learning_rate": 3.2241155140482294e-05, "loss": 0.0144, "step": 21010 }, { "epoch": 1.234002583069156, "grad_norm": 1.4230515956878662, "learning_rate": 3.2197934438692314e-05, "loss": 0.0163, "step": 21020 }, { "epoch": 1.2345896442409299, "grad_norm": 4.605391502380371, "learning_rate": 3.2154728964456605e-05, "loss": 0.0336, "step": 21030 }, { "epoch": 1.235176705412704, "grad_norm": 0.2723656892776489, "learning_rate": 3.211153875473239e-05, "loss": 0.0165, "step": 21040 }, { "epoch": 1.235763766584478, "grad_norm": 1.2724515199661255, "learning_rate": 3.206836384646371e-05, "loss": 0.0221, "step": 21050 }, { "epoch": 1.2363508277562523, "grad_norm": 0.3574167788028717, "learning_rate": 3.202520427658159e-05, "loss": 0.0478, "step": 21060 }, { "epoch": 1.2369378889280263, "grad_norm": 1.6354588270187378, "learning_rate": 3.1982060082003954e-05, "loss": 0.0106, "step": 21070 }, { "epoch": 1.2375249500998005, "grad_norm": 1.4375627040863037, "learning_rate": 3.1938931299635484e-05, "loss": 0.016, "step": 21080 }, { "epoch": 1.2381120112715744, "grad_norm": 1.8797725439071655, "learning_rate": 3.189581796636778e-05, "loss": 0.0093, "step": 21090 }, { "epoch": 1.2386990724433486, "grad_norm": 0.5121396780014038, "learning_rate": 3.185272011907915e-05, "loss": 0.0136, "step": 21100 }, { "epoch": 1.2392861336151226, "grad_norm": 3.526323080062866, "learning_rate": 3.180963779463472e-05, "loss": 0.0219, "step": 21110 }, { "epoch": 1.2398731947868968, "grad_norm": 1.7624855041503906, "learning_rate": 3.176657102988628e-05, "loss": 0.0192, "step": 21120 }, { "epoch": 1.2404602559586708, "grad_norm": 0.9396687150001526, "learning_rate": 3.1723519861672354e-05, "loss": 0.0117, "step": 21130 }, { "epoch": 1.241047317130445, "grad_norm": 1.5598224401474, "learning_rate": 3.168048432681808e-05, "loss": 0.0109, "step": 21140 }, { "epoch": 1.241634378302219, "grad_norm": 4.720143795013428, "learning_rate": 3.1637464462135286e-05, "loss": 0.0425, "step": 21150 }, { "epoch": 1.2422214394739932, "grad_norm": 1.2049460411071777, "learning_rate": 3.159446030442232e-05, "loss": 0.0125, "step": 21160 }, { "epoch": 1.2428085006457672, "grad_norm": 0.5285661816596985, "learning_rate": 3.155147189046418e-05, "loss": 0.0092, "step": 21170 }, { "epoch": 1.2433955618175414, "grad_norm": 1.140669822692871, "learning_rate": 3.1508499257032306e-05, "loss": 0.04, "step": 21180 }, { "epoch": 1.2439826229893154, "grad_norm": 4.023752689361572, "learning_rate": 3.1465542440884736e-05, "loss": 0.0304, "step": 21190 }, { "epoch": 1.2445696841610896, "grad_norm": 0.6173288822174072, "learning_rate": 3.1422601478765874e-05, "loss": 0.0118, "step": 21200 }, { "epoch": 1.2451567453328636, "grad_norm": 2.554194450378418, "learning_rate": 3.137967640740665e-05, "loss": 0.0118, "step": 21210 }, { "epoch": 1.2457438065046378, "grad_norm": 2.1060009002685547, "learning_rate": 3.133676726352438e-05, "loss": 0.0285, "step": 21220 }, { "epoch": 1.246330867676412, "grad_norm": 2.7011616230010986, "learning_rate": 3.12938740838227e-05, "loss": 0.0369, "step": 21230 }, { "epoch": 1.246917928848186, "grad_norm": 2.049039840698242, "learning_rate": 3.125099690499168e-05, "loss": 0.0099, "step": 21240 }, { "epoch": 1.24750499001996, "grad_norm": 0.8543749451637268, "learning_rate": 3.120813576370763e-05, "loss": 0.0118, "step": 21250 }, { "epoch": 1.2480920511917342, "grad_norm": 0.427012175321579, "learning_rate": 3.1165290696633185e-05, "loss": 0.0161, "step": 21260 }, { "epoch": 1.2486791123635084, "grad_norm": 2.7977850437164307, "learning_rate": 3.11224617404172e-05, "loss": 0.018, "step": 21270 }, { "epoch": 1.2492661735352824, "grad_norm": 0.6556240320205688, "learning_rate": 3.1079648931694796e-05, "loss": 0.015, "step": 21280 }, { "epoch": 1.2498532347070566, "grad_norm": 0.5119418501853943, "learning_rate": 3.1036852307087183e-05, "loss": 0.0165, "step": 21290 }, { "epoch": 1.2504402958788305, "grad_norm": 1.350526213645935, "learning_rate": 3.099407190320188e-05, "loss": 0.0185, "step": 21300 }, { "epoch": 1.2510273570506047, "grad_norm": 0.18741434812545776, "learning_rate": 3.095130775663237e-05, "loss": 0.0225, "step": 21310 }, { "epoch": 1.2516144182223787, "grad_norm": 0.7803360223770142, "learning_rate": 3.090855990395836e-05, "loss": 0.0138, "step": 21320 }, { "epoch": 1.252201479394153, "grad_norm": 2.625234365463257, "learning_rate": 3.086582838174551e-05, "loss": 0.0164, "step": 21330 }, { "epoch": 1.252788540565927, "grad_norm": 1.4775711297988892, "learning_rate": 3.082311322654562e-05, "loss": 0.0309, "step": 21340 }, { "epoch": 1.2533756017377011, "grad_norm": 1.3735824823379517, "learning_rate": 3.0780414474896414e-05, "loss": 0.0252, "step": 21350 }, { "epoch": 1.2539626629094751, "grad_norm": 1.6356611251831055, "learning_rate": 3.0737732163321596e-05, "loss": 0.0154, "step": 21360 }, { "epoch": 1.2545497240812493, "grad_norm": 1.5371419191360474, "learning_rate": 3.0695066328330845e-05, "loss": 0.0158, "step": 21370 }, { "epoch": 1.2551367852530233, "grad_norm": 0.9981690645217896, "learning_rate": 3.0652417006419674e-05, "loss": 0.0051, "step": 21380 }, { "epoch": 1.2557238464247975, "grad_norm": 0.13646307587623596, "learning_rate": 3.0609784234069575e-05, "loss": 0.0083, "step": 21390 }, { "epoch": 1.2563109075965715, "grad_norm": 1.6275250911712646, "learning_rate": 3.0567168047747776e-05, "loss": 0.0191, "step": 21400 }, { "epoch": 1.2568979687683457, "grad_norm": 1.1363624334335327, "learning_rate": 3.052456848390739e-05, "loss": 0.0228, "step": 21410 }, { "epoch": 1.2574850299401197, "grad_norm": 0.47851109504699707, "learning_rate": 3.048198557898727e-05, "loss": 0.0171, "step": 21420 }, { "epoch": 1.2580720911118939, "grad_norm": 1.8325543403625488, "learning_rate": 3.043941936941207e-05, "loss": 0.014, "step": 21430 }, { "epoch": 1.258659152283668, "grad_norm": 0.8486720323562622, "learning_rate": 3.0396869891592093e-05, "loss": 0.0142, "step": 21440 }, { "epoch": 1.259246213455442, "grad_norm": 0.4087095260620117, "learning_rate": 3.035433718192341e-05, "loss": 0.0087, "step": 21450 }, { "epoch": 1.259833274627216, "grad_norm": 0.012567095458507538, "learning_rate": 3.0311821276787654e-05, "loss": 0.0167, "step": 21460 }, { "epoch": 1.2604203357989903, "grad_norm": 0.21170192956924438, "learning_rate": 3.0269322212552153e-05, "loss": 0.0092, "step": 21470 }, { "epoch": 1.2610073969707645, "grad_norm": 3.670684814453125, "learning_rate": 3.0226840025569857e-05, "loss": 0.0248, "step": 21480 }, { "epoch": 1.2615944581425385, "grad_norm": 2.305076837539673, "learning_rate": 3.0184374752179183e-05, "loss": 0.0126, "step": 21490 }, { "epoch": 1.2621815193143124, "grad_norm": 2.400017499923706, "learning_rate": 3.014192642870416e-05, "loss": 0.0161, "step": 21500 }, { "epoch": 1.2627685804860866, "grad_norm": 1.1175122261047363, "learning_rate": 3.0099495091454268e-05, "loss": 0.0058, "step": 21510 }, { "epoch": 1.2633556416578609, "grad_norm": 2.3368163108825684, "learning_rate": 3.00570807767245e-05, "loss": 0.0058, "step": 21520 }, { "epoch": 1.2639427028296348, "grad_norm": 2.3618102073669434, "learning_rate": 3.0014683520795256e-05, "loss": 0.012, "step": 21530 }, { "epoch": 1.2645297640014088, "grad_norm": 4.362176895141602, "learning_rate": 2.9972303359932386e-05, "loss": 0.0089, "step": 21540 }, { "epoch": 1.265116825173183, "grad_norm": 0.6850021481513977, "learning_rate": 2.992994033038704e-05, "loss": 0.0079, "step": 21550 }, { "epoch": 1.2657038863449572, "grad_norm": 0.9558687210083008, "learning_rate": 2.9887594468395798e-05, "loss": 0.0143, "step": 21560 }, { "epoch": 1.2662909475167312, "grad_norm": 0.13914182782173157, "learning_rate": 2.984526581018049e-05, "loss": 0.0189, "step": 21570 }, { "epoch": 1.2668780086885054, "grad_norm": 0.7266558408737183, "learning_rate": 2.9802954391948294e-05, "loss": 0.0208, "step": 21580 }, { "epoch": 1.2674650698602794, "grad_norm": 0.47501394152641296, "learning_rate": 2.976066024989158e-05, "loss": 0.0113, "step": 21590 }, { "epoch": 1.2680521310320536, "grad_norm": 1.0535529851913452, "learning_rate": 2.9718383420187983e-05, "loss": 0.0143, "step": 21600 }, { "epoch": 1.2686391922038276, "grad_norm": 1.2893620729446411, "learning_rate": 2.96761239390003e-05, "loss": 0.0128, "step": 21610 }, { "epoch": 1.2692262533756018, "grad_norm": 4.001839637756348, "learning_rate": 2.963388184247651e-05, "loss": 0.0093, "step": 21620 }, { "epoch": 1.2698133145473758, "grad_norm": 2.0361053943634033, "learning_rate": 2.959165716674973e-05, "loss": 0.0207, "step": 21630 }, { "epoch": 1.27040037571915, "grad_norm": 0.2779838740825653, "learning_rate": 2.9549449947938108e-05, "loss": 0.014, "step": 21640 }, { "epoch": 1.270987436890924, "grad_norm": 2.0163493156433105, "learning_rate": 2.9507260222144973e-05, "loss": 0.0113, "step": 21650 }, { "epoch": 1.2715744980626982, "grad_norm": 0.9817140698432922, "learning_rate": 2.9465088025458586e-05, "loss": 0.0099, "step": 21660 }, { "epoch": 1.2721615592344722, "grad_norm": 0.3210301399230957, "learning_rate": 2.942293339395227e-05, "loss": 0.0152, "step": 21670 }, { "epoch": 1.2727486204062464, "grad_norm": 0.5245820879936218, "learning_rate": 2.9380796363684303e-05, "loss": 0.0109, "step": 21680 }, { "epoch": 1.2733356815780204, "grad_norm": 3.9654788970947266, "learning_rate": 2.9338676970697926e-05, "loss": 0.0389, "step": 21690 }, { "epoch": 1.2739227427497946, "grad_norm": 0.46540847420692444, "learning_rate": 2.9296575251021265e-05, "loss": 0.0132, "step": 21700 }, { "epoch": 1.2745098039215685, "grad_norm": 1.349572777748108, "learning_rate": 2.925449124066737e-05, "loss": 0.0224, "step": 21710 }, { "epoch": 1.2750968650933427, "grad_norm": 0.2008529156446457, "learning_rate": 2.9212424975634078e-05, "loss": 0.0081, "step": 21720 }, { "epoch": 1.275683926265117, "grad_norm": 0.6957343816757202, "learning_rate": 2.9170376491904127e-05, "loss": 0.0189, "step": 21730 }, { "epoch": 1.276270987436891, "grad_norm": 1.4542927742004395, "learning_rate": 2.912834582544497e-05, "loss": 0.0069, "step": 21740 }, { "epoch": 1.276858048608665, "grad_norm": 0.18256020545959473, "learning_rate": 2.9086333012208865e-05, "loss": 0.0127, "step": 21750 }, { "epoch": 1.2774451097804391, "grad_norm": 1.6553212404251099, "learning_rate": 2.9044338088132816e-05, "loss": 0.0152, "step": 21760 }, { "epoch": 1.2780321709522133, "grad_norm": 0.18057706952095032, "learning_rate": 2.9002361089138453e-05, "loss": 0.0261, "step": 21770 }, { "epoch": 1.2786192321239873, "grad_norm": 0.0635433942079544, "learning_rate": 2.896040205113214e-05, "loss": 0.007, "step": 21780 }, { "epoch": 1.2792062932957613, "grad_norm": 0.10675487667322159, "learning_rate": 2.8918461010004842e-05, "loss": 0.0068, "step": 21790 }, { "epoch": 1.2797933544675355, "grad_norm": 4.841487407684326, "learning_rate": 2.887653800163218e-05, "loss": 0.0171, "step": 21800 }, { "epoch": 1.2803804156393097, "grad_norm": 0.09284064918756485, "learning_rate": 2.8834633061874256e-05, "loss": 0.0172, "step": 21810 }, { "epoch": 1.2809674768110837, "grad_norm": 2.0839176177978516, "learning_rate": 2.87927462265758e-05, "loss": 0.0106, "step": 21820 }, { "epoch": 1.2815545379828577, "grad_norm": 0.8402649164199829, "learning_rate": 2.875087753156603e-05, "loss": 0.0188, "step": 21830 }, { "epoch": 1.2821415991546319, "grad_norm": 1.7124103307724, "learning_rate": 2.8709027012658663e-05, "loss": 0.0157, "step": 21840 }, { "epoch": 1.282728660326406, "grad_norm": 2.3623087406158447, "learning_rate": 2.8667194705651807e-05, "loss": 0.0157, "step": 21850 }, { "epoch": 1.28331572149818, "grad_norm": 0.08953145146369934, "learning_rate": 2.862538064632808e-05, "loss": 0.004, "step": 21860 }, { "epoch": 1.2839027826699543, "grad_norm": 1.0278525352478027, "learning_rate": 2.858358487045441e-05, "loss": 0.0226, "step": 21870 }, { "epoch": 1.2844898438417283, "grad_norm": 2.5860304832458496, "learning_rate": 2.854180741378214e-05, "loss": 0.0178, "step": 21880 }, { "epoch": 1.2850769050135025, "grad_norm": 0.4281832277774811, "learning_rate": 2.8500048312046927e-05, "loss": 0.0116, "step": 21890 }, { "epoch": 1.2856639661852765, "grad_norm": 0.7965812683105469, "learning_rate": 2.8458307600968725e-05, "loss": 0.0111, "step": 21900 }, { "epoch": 1.2862510273570507, "grad_norm": 0.49417802691459656, "learning_rate": 2.8416585316251776e-05, "loss": 0.0198, "step": 21910 }, { "epoch": 1.2868380885288246, "grad_norm": 1.8703595399856567, "learning_rate": 2.8374881493584516e-05, "loss": 0.013, "step": 21920 }, { "epoch": 1.2874251497005988, "grad_norm": 0.5230078101158142, "learning_rate": 2.8333196168639632e-05, "loss": 0.0092, "step": 21930 }, { "epoch": 1.2880122108723728, "grad_norm": 0.09066743403673172, "learning_rate": 2.8291529377073956e-05, "loss": 0.0182, "step": 21940 }, { "epoch": 1.288599272044147, "grad_norm": 0.2826184034347534, "learning_rate": 2.824988115452849e-05, "loss": 0.0263, "step": 21950 }, { "epoch": 1.289186333215921, "grad_norm": 0.26999109983444214, "learning_rate": 2.8208251536628344e-05, "loss": 0.0085, "step": 21960 }, { "epoch": 1.2897733943876952, "grad_norm": 1.7400518655776978, "learning_rate": 2.8166640558982743e-05, "loss": 0.0223, "step": 21970 }, { "epoch": 1.2903604555594694, "grad_norm": 2.5776426792144775, "learning_rate": 2.8125048257184896e-05, "loss": 0.0141, "step": 21980 }, { "epoch": 1.2909475167312434, "grad_norm": 1.6490485668182373, "learning_rate": 2.8083474666812127e-05, "loss": 0.0064, "step": 21990 }, { "epoch": 1.2915345779030174, "grad_norm": 0.6944535970687866, "learning_rate": 2.8041919823425633e-05, "loss": 0.0164, "step": 22000 }, { "epoch": 1.2921216390747916, "grad_norm": 0.12942366302013397, "learning_rate": 2.800038376257075e-05, "loss": 0.0139, "step": 22010 }, { "epoch": 1.2927087002465658, "grad_norm": 0.05547524243593216, "learning_rate": 2.7958866519776572e-05, "loss": 0.0065, "step": 22020 }, { "epoch": 1.2932957614183398, "grad_norm": 0.5519035458564758, "learning_rate": 2.791736813055621e-05, "loss": 0.0104, "step": 22030 }, { "epoch": 1.2938828225901138, "grad_norm": 0.9530815482139587, "learning_rate": 2.787588863040661e-05, "loss": 0.023, "step": 22040 }, { "epoch": 1.294469883761888, "grad_norm": 2.2111546993255615, "learning_rate": 2.7834428054808543e-05, "loss": 0.0174, "step": 22050 }, { "epoch": 1.2950569449336622, "grad_norm": 0.15427449345588684, "learning_rate": 2.7792986439226615e-05, "loss": 0.0099, "step": 22060 }, { "epoch": 1.2956440061054362, "grad_norm": 0.021044138818979263, "learning_rate": 2.7751563819109218e-05, "loss": 0.0191, "step": 22070 }, { "epoch": 1.2962310672772102, "grad_norm": 0.694969117641449, "learning_rate": 2.7710160229888504e-05, "loss": 0.0119, "step": 22080 }, { "epoch": 1.2968181284489844, "grad_norm": 0.4075547754764557, "learning_rate": 2.7668775706980288e-05, "loss": 0.0163, "step": 22090 }, { "epoch": 1.2974051896207586, "grad_norm": 2.8552281856536865, "learning_rate": 2.7627410285784163e-05, "loss": 0.0124, "step": 22100 }, { "epoch": 1.2979922507925326, "grad_norm": 2.519705057144165, "learning_rate": 2.7586064001683286e-05, "loss": 0.0235, "step": 22110 }, { "epoch": 1.2985793119643068, "grad_norm": 3.5238704681396484, "learning_rate": 2.754473689004453e-05, "loss": 0.0096, "step": 22120 }, { "epoch": 1.2991663731360807, "grad_norm": 1.1213005781173706, "learning_rate": 2.750342898621833e-05, "loss": 0.014, "step": 22130 }, { "epoch": 1.299753434307855, "grad_norm": 0.7825922966003418, "learning_rate": 2.7462140325538714e-05, "loss": 0.0114, "step": 22140 }, { "epoch": 1.300340495479629, "grad_norm": 0.6387771964073181, "learning_rate": 2.7420870943323197e-05, "loss": 0.017, "step": 22150 }, { "epoch": 1.3009275566514031, "grad_norm": 0.14254145324230194, "learning_rate": 2.7379620874872856e-05, "loss": 0.0306, "step": 22160 }, { "epoch": 1.3015146178231771, "grad_norm": 1.306911826133728, "learning_rate": 2.7338390155472215e-05, "loss": 0.0194, "step": 22170 }, { "epoch": 1.3021016789949513, "grad_norm": 0.15860214829444885, "learning_rate": 2.729717882038925e-05, "loss": 0.0106, "step": 22180 }, { "epoch": 1.3026887401667253, "grad_norm": 0.9113043546676636, "learning_rate": 2.725598690487543e-05, "loss": 0.0154, "step": 22190 }, { "epoch": 1.3032758013384995, "grad_norm": 0.9923615455627441, "learning_rate": 2.721481444416548e-05, "loss": 0.0106, "step": 22200 }, { "epoch": 1.3038628625102735, "grad_norm": 1.5613048076629639, "learning_rate": 2.7173661473477608e-05, "loss": 0.0162, "step": 22210 }, { "epoch": 1.3044499236820477, "grad_norm": 0.5515766739845276, "learning_rate": 2.7132528028013248e-05, "loss": 0.0148, "step": 22220 }, { "epoch": 1.3050369848538217, "grad_norm": 0.08424253016710281, "learning_rate": 2.7091414142957204e-05, "loss": 0.0052, "step": 22230 }, { "epoch": 1.305624046025596, "grad_norm": 0.14063885807991028, "learning_rate": 2.7050319853477522e-05, "loss": 0.013, "step": 22240 }, { "epoch": 1.3062111071973699, "grad_norm": 1.6327464580535889, "learning_rate": 2.7009245194725507e-05, "loss": 0.015, "step": 22250 }, { "epoch": 1.306798168369144, "grad_norm": 0.1495017409324646, "learning_rate": 2.6968190201835625e-05, "loss": 0.0086, "step": 22260 }, { "epoch": 1.3073852295409183, "grad_norm": 1.8175768852233887, "learning_rate": 2.6927154909925577e-05, "loss": 0.0157, "step": 22270 }, { "epoch": 1.3079722907126923, "grad_norm": 1.9871352910995483, "learning_rate": 2.6886139354096164e-05, "loss": 0.0178, "step": 22280 }, { "epoch": 1.3085593518844663, "grad_norm": 3.3310537338256836, "learning_rate": 2.684514356943132e-05, "loss": 0.0161, "step": 22290 }, { "epoch": 1.3091464130562405, "grad_norm": 1.8495787382125854, "learning_rate": 2.6804167590998096e-05, "loss": 0.0255, "step": 22300 }, { "epoch": 1.3097334742280147, "grad_norm": 0.10988368093967438, "learning_rate": 2.676321145384657e-05, "loss": 0.0078, "step": 22310 }, { "epoch": 1.3103205353997887, "grad_norm": 2.718132495880127, "learning_rate": 2.6722275193009872e-05, "loss": 0.0081, "step": 22320 }, { "epoch": 1.3109075965715626, "grad_norm": 1.8900574445724487, "learning_rate": 2.668135884350408e-05, "loss": 0.013, "step": 22330 }, { "epoch": 1.3114946577433368, "grad_norm": 1.063090443611145, "learning_rate": 2.664046244032832e-05, "loss": 0.0196, "step": 22340 }, { "epoch": 1.312081718915111, "grad_norm": 0.3889022171497345, "learning_rate": 2.659958601846454e-05, "loss": 0.012, "step": 22350 }, { "epoch": 1.312668780086885, "grad_norm": 0.4353967010974884, "learning_rate": 2.6558729612877753e-05, "loss": 0.0108, "step": 22360 }, { "epoch": 1.313255841258659, "grad_norm": 0.04097278043627739, "learning_rate": 2.6517893258515702e-05, "loss": 0.0315, "step": 22370 }, { "epoch": 1.3138429024304332, "grad_norm": 1.8794195652008057, "learning_rate": 2.647707699030909e-05, "loss": 0.0189, "step": 22380 }, { "epoch": 1.3144299636022074, "grad_norm": 1.039872646331787, "learning_rate": 2.6436280843171346e-05, "loss": 0.0064, "step": 22390 }, { "epoch": 1.3150170247739814, "grad_norm": 1.8896512985229492, "learning_rate": 2.639550485199874e-05, "loss": 0.0135, "step": 22400 }, { "epoch": 1.3156040859457556, "grad_norm": 0.401368647813797, "learning_rate": 2.635474905167032e-05, "loss": 0.0185, "step": 22410 }, { "epoch": 1.3161911471175296, "grad_norm": 2.171654224395752, "learning_rate": 2.631401347704783e-05, "loss": 0.0191, "step": 22420 }, { "epoch": 1.3167782082893038, "grad_norm": 0.8098046183586121, "learning_rate": 2.627329816297569e-05, "loss": 0.0131, "step": 22430 }, { "epoch": 1.3173652694610778, "grad_norm": 0.5069458484649658, "learning_rate": 2.6232603144281066e-05, "loss": 0.0116, "step": 22440 }, { "epoch": 1.317952330632852, "grad_norm": 0.17364555597305298, "learning_rate": 2.6191928455773662e-05, "loss": 0.0053, "step": 22450 }, { "epoch": 1.318539391804626, "grad_norm": 0.7691751718521118, "learning_rate": 2.615127413224588e-05, "loss": 0.0301, "step": 22460 }, { "epoch": 1.3191264529764002, "grad_norm": 0.03866483271121979, "learning_rate": 2.611064020847266e-05, "loss": 0.021, "step": 22470 }, { "epoch": 1.3197135141481742, "grad_norm": 1.2438933849334717, "learning_rate": 2.6070026719211505e-05, "loss": 0.0179, "step": 22480 }, { "epoch": 1.3203005753199484, "grad_norm": 2.419809341430664, "learning_rate": 2.6029433699202454e-05, "loss": 0.0193, "step": 22490 }, { "epoch": 1.3208876364917224, "grad_norm": 1.7950797080993652, "learning_rate": 2.598886118316798e-05, "loss": 0.0143, "step": 22500 }, { "epoch": 1.3214746976634966, "grad_norm": 1.6761778593063354, "learning_rate": 2.5948309205813094e-05, "loss": 0.0142, "step": 22510 }, { "epoch": 1.3220617588352708, "grad_norm": 4.547269344329834, "learning_rate": 2.590777780182515e-05, "loss": 0.0141, "step": 22520 }, { "epoch": 1.3226488200070448, "grad_norm": 2.2153279781341553, "learning_rate": 2.5867267005873996e-05, "loss": 0.0149, "step": 22530 }, { "epoch": 1.3232358811788187, "grad_norm": 1.2146892547607422, "learning_rate": 2.582677685261179e-05, "loss": 0.0124, "step": 22540 }, { "epoch": 1.323822942350593, "grad_norm": 2.6923255920410156, "learning_rate": 2.578630737667308e-05, "loss": 0.016, "step": 22550 }, { "epoch": 1.3244100035223672, "grad_norm": 0.5817638635635376, "learning_rate": 2.574585861267466e-05, "loss": 0.0115, "step": 22560 }, { "epoch": 1.3249970646941411, "grad_norm": 1.5570284128189087, "learning_rate": 2.570543059521569e-05, "loss": 0.0415, "step": 22570 }, { "epoch": 1.3255841258659151, "grad_norm": 0.1070183590054512, "learning_rate": 2.566502335887747e-05, "loss": 0.0259, "step": 22580 }, { "epoch": 1.3261711870376893, "grad_norm": 3.1985042095184326, "learning_rate": 2.5624636938223675e-05, "loss": 0.026, "step": 22590 }, { "epoch": 1.3267582482094635, "grad_norm": 0.05600825324654579, "learning_rate": 2.5584271367800072e-05, "loss": 0.0089, "step": 22600 }, { "epoch": 1.3273453093812375, "grad_norm": 0.8721899390220642, "learning_rate": 2.5543926682134588e-05, "loss": 0.0156, "step": 22610 }, { "epoch": 1.3279323705530115, "grad_norm": 0.1772165149450302, "learning_rate": 2.550360291573735e-05, "loss": 0.0185, "step": 22620 }, { "epoch": 1.3285194317247857, "grad_norm": 2.8990161418914795, "learning_rate": 2.546330010310052e-05, "loss": 0.0218, "step": 22630 }, { "epoch": 1.32910649289656, "grad_norm": 0.6351332068443298, "learning_rate": 2.5423018278698386e-05, "loss": 0.012, "step": 22640 }, { "epoch": 1.329693554068334, "grad_norm": 0.08211655914783478, "learning_rate": 2.5382757476987268e-05, "loss": 0.0055, "step": 22650 }, { "epoch": 1.330280615240108, "grad_norm": 0.531152606010437, "learning_rate": 2.5342517732405523e-05, "loss": 0.0239, "step": 22660 }, { "epoch": 1.330867676411882, "grad_norm": 7.08525276184082, "learning_rate": 2.530229907937344e-05, "loss": 0.0433, "step": 22670 }, { "epoch": 1.3314547375836563, "grad_norm": 0.765272855758667, "learning_rate": 2.5262101552293345e-05, "loss": 0.0232, "step": 22680 }, { "epoch": 1.3320417987554303, "grad_norm": 1.7761889696121216, "learning_rate": 2.52219251855494e-05, "loss": 0.0106, "step": 22690 }, { "epoch": 1.3326288599272045, "grad_norm": 1.9805822372436523, "learning_rate": 2.5181770013507754e-05, "loss": 0.0155, "step": 22700 }, { "epoch": 1.3332159210989785, "grad_norm": 0.1292826235294342, "learning_rate": 2.5141636070516382e-05, "loss": 0.0108, "step": 22710 }, { "epoch": 1.3338029822707527, "grad_norm": 0.33158397674560547, "learning_rate": 2.5101523390905112e-05, "loss": 0.0165, "step": 22720 }, { "epoch": 1.3343900434425267, "grad_norm": 0.04000745341181755, "learning_rate": 2.5061432008985598e-05, "loss": 0.007, "step": 22730 }, { "epoch": 1.3349771046143009, "grad_norm": 4.565971374511719, "learning_rate": 2.5021361959051226e-05, "loss": 0.021, "step": 22740 }, { "epoch": 1.3355641657860748, "grad_norm": 1.0022798776626587, "learning_rate": 2.4981313275377177e-05, "loss": 0.0312, "step": 22750 }, { "epoch": 1.336151226957849, "grad_norm": 1.3789433240890503, "learning_rate": 2.4941285992220354e-05, "loss": 0.0058, "step": 22760 }, { "epoch": 1.336738288129623, "grad_norm": 2.178168535232544, "learning_rate": 2.4901280143819368e-05, "loss": 0.0131, "step": 22770 }, { "epoch": 1.3373253493013972, "grad_norm": 1.021647572517395, "learning_rate": 2.4861295764394426e-05, "loss": 0.0151, "step": 22780 }, { "epoch": 1.3379124104731712, "grad_norm": 3.3021364212036133, "learning_rate": 2.482133288814747e-05, "loss": 0.0149, "step": 22790 }, { "epoch": 1.3384994716449454, "grad_norm": 0.023236090317368507, "learning_rate": 2.4781391549261955e-05, "loss": 0.0108, "step": 22800 }, { "epoch": 1.3390865328167196, "grad_norm": 0.8128157258033752, "learning_rate": 2.4741471781902975e-05, "loss": 0.0235, "step": 22810 }, { "epoch": 1.3396735939884936, "grad_norm": 4.04619836807251, "learning_rate": 2.470157362021715e-05, "loss": 0.0303, "step": 22820 }, { "epoch": 1.3402606551602676, "grad_norm": 0.10842420160770416, "learning_rate": 2.4661697098332648e-05, "loss": 0.0095, "step": 22830 }, { "epoch": 1.3408477163320418, "grad_norm": 1.3374766111373901, "learning_rate": 2.462184225035905e-05, "loss": 0.0094, "step": 22840 }, { "epoch": 1.341434777503816, "grad_norm": 3.2762277126312256, "learning_rate": 2.4582009110387506e-05, "loss": 0.0071, "step": 22850 }, { "epoch": 1.34202183867559, "grad_norm": 1.3600951433181763, "learning_rate": 2.4542197712490483e-05, "loss": 0.022, "step": 22860 }, { "epoch": 1.342608899847364, "grad_norm": 1.192181944847107, "learning_rate": 2.4502408090721934e-05, "loss": 0.0142, "step": 22870 }, { "epoch": 1.3431959610191382, "grad_norm": 1.137093424797058, "learning_rate": 2.446264027911716e-05, "loss": 0.0628, "step": 22880 }, { "epoch": 1.3437830221909124, "grad_norm": 0.13391529023647308, "learning_rate": 2.4422894311692807e-05, "loss": 0.0147, "step": 22890 }, { "epoch": 1.3443700833626864, "grad_norm": 0.5997415781021118, "learning_rate": 2.438317022244684e-05, "loss": 0.0156, "step": 22900 }, { "epoch": 1.3449571445344604, "grad_norm": 0.11458457261323929, "learning_rate": 2.4343468045358476e-05, "loss": 0.0253, "step": 22910 }, { "epoch": 1.3455442057062346, "grad_norm": 0.26020538806915283, "learning_rate": 2.4303787814388247e-05, "loss": 0.0163, "step": 22920 }, { "epoch": 1.3461312668780088, "grad_norm": 1.4215712547302246, "learning_rate": 2.4264129563477822e-05, "loss": 0.0319, "step": 22930 }, { "epoch": 1.3467183280497828, "grad_norm": 1.1173344850540161, "learning_rate": 2.4224493326550214e-05, "loss": 0.0258, "step": 22940 }, { "epoch": 1.347305389221557, "grad_norm": 2.754993200302124, "learning_rate": 2.418487913750946e-05, "loss": 0.0253, "step": 22950 }, { "epoch": 1.347892450393331, "grad_norm": 0.8712794184684753, "learning_rate": 2.4145287030240826e-05, "loss": 0.0161, "step": 22960 }, { "epoch": 1.3484795115651051, "grad_norm": 0.07789886742830276, "learning_rate": 2.410571703861063e-05, "loss": 0.0086, "step": 22970 }, { "epoch": 1.3490665727368791, "grad_norm": 1.6168535947799683, "learning_rate": 2.4066169196466326e-05, "loss": 0.0202, "step": 22980 }, { "epoch": 1.3496536339086533, "grad_norm": 0.6253584027290344, "learning_rate": 2.4026643537636395e-05, "loss": 0.0159, "step": 22990 }, { "epoch": 1.3502406950804273, "grad_norm": 4.4001054763793945, "learning_rate": 2.3987140095930343e-05, "loss": 0.0101, "step": 23000 }, { "epoch": 1.3508277562522015, "grad_norm": 0.6846688389778137, "learning_rate": 2.3947658905138702e-05, "loss": 0.0173, "step": 23010 }, { "epoch": 1.3514148174239755, "grad_norm": 0.027123354375362396, "learning_rate": 2.3908199999032904e-05, "loss": 0.0086, "step": 23020 }, { "epoch": 1.3520018785957497, "grad_norm": 0.6296704411506653, "learning_rate": 2.3868763411365396e-05, "loss": 0.0056, "step": 23030 }, { "epoch": 1.3525889397675237, "grad_norm": 1.7773135900497437, "learning_rate": 2.382934917586947e-05, "loss": 0.0161, "step": 23040 }, { "epoch": 1.353176000939298, "grad_norm": 0.052110861986875534, "learning_rate": 2.378995732625933e-05, "loss": 0.0159, "step": 23050 }, { "epoch": 1.353763062111072, "grad_norm": 1.0526155233383179, "learning_rate": 2.375058789623004e-05, "loss": 0.016, "step": 23060 }, { "epoch": 1.354350123282846, "grad_norm": 0.11346473544836044, "learning_rate": 2.3711240919457493e-05, "loss": 0.0073, "step": 23070 }, { "epoch": 1.35493718445462, "grad_norm": 0.9317972660064697, "learning_rate": 2.367191642959832e-05, "loss": 0.0294, "step": 23080 }, { "epoch": 1.3555242456263943, "grad_norm": 0.19858163595199585, "learning_rate": 2.3632614460289985e-05, "loss": 0.0306, "step": 23090 }, { "epoch": 1.3561113067981685, "grad_norm": 0.00909331627190113, "learning_rate": 2.3593335045150626e-05, "loss": 0.0038, "step": 23100 }, { "epoch": 1.3566983679699425, "grad_norm": 0.14268755912780762, "learning_rate": 2.3554078217779145e-05, "loss": 0.0063, "step": 23110 }, { "epoch": 1.3572854291417165, "grad_norm": 2.0616939067840576, "learning_rate": 2.3514844011755087e-05, "loss": 0.0178, "step": 23120 }, { "epoch": 1.3578724903134907, "grad_norm": 0.20650817453861237, "learning_rate": 2.3475632460638692e-05, "loss": 0.0124, "step": 23130 }, { "epoch": 1.3584595514852649, "grad_norm": 0.26865655183792114, "learning_rate": 2.3436443597970735e-05, "loss": 0.017, "step": 23140 }, { "epoch": 1.3590466126570389, "grad_norm": 0.7835626602172852, "learning_rate": 2.3397277457272665e-05, "loss": 0.0194, "step": 23150 }, { "epoch": 1.3596336738288128, "grad_norm": 0.1329191029071808, "learning_rate": 2.3358134072046466e-05, "loss": 0.0057, "step": 23160 }, { "epoch": 1.360220735000587, "grad_norm": 0.566769003868103, "learning_rate": 2.331901347577466e-05, "loss": 0.0114, "step": 23170 }, { "epoch": 1.3608077961723613, "grad_norm": 0.7775740027427673, "learning_rate": 2.327991570192029e-05, "loss": 0.0171, "step": 23180 }, { "epoch": 1.3613948573441352, "grad_norm": 1.4745599031448364, "learning_rate": 2.3240840783926827e-05, "loss": 0.0154, "step": 23190 }, { "epoch": 1.3619819185159094, "grad_norm": 0.4420107901096344, "learning_rate": 2.320178875521826e-05, "loss": 0.0183, "step": 23200 }, { "epoch": 1.3625689796876834, "grad_norm": 2.0607283115386963, "learning_rate": 2.3162759649198928e-05, "loss": 0.0315, "step": 23210 }, { "epoch": 1.3631560408594576, "grad_norm": 0.39873799681663513, "learning_rate": 2.3123753499253618e-05, "loss": 0.0089, "step": 23220 }, { "epoch": 1.3637431020312316, "grad_norm": 0.5348065495491028, "learning_rate": 2.3084770338747464e-05, "loss": 0.0039, "step": 23230 }, { "epoch": 1.3643301632030058, "grad_norm": 0.4253537654876709, "learning_rate": 2.3045810201025946e-05, "loss": 0.03, "step": 23240 }, { "epoch": 1.3649172243747798, "grad_norm": 0.5349234938621521, "learning_rate": 2.300687311941481e-05, "loss": 0.0091, "step": 23250 }, { "epoch": 1.365504285546554, "grad_norm": 0.1099616214632988, "learning_rate": 2.296795912722014e-05, "loss": 0.0091, "step": 23260 }, { "epoch": 1.366091346718328, "grad_norm": 1.0128790140151978, "learning_rate": 2.29290682577282e-05, "loss": 0.0247, "step": 23270 }, { "epoch": 1.3666784078901022, "grad_norm": 0.43558740615844727, "learning_rate": 2.2890200544205516e-05, "loss": 0.0044, "step": 23280 }, { "epoch": 1.3672654690618762, "grad_norm": 2.41221284866333, "learning_rate": 2.285135601989885e-05, "loss": 0.0311, "step": 23290 }, { "epoch": 1.3678525302336504, "grad_norm": 4.838248252868652, "learning_rate": 2.2812534718035046e-05, "loss": 0.0161, "step": 23300 }, { "epoch": 1.3684395914054244, "grad_norm": 2.3558995723724365, "learning_rate": 2.277373667182114e-05, "loss": 0.0353, "step": 23310 }, { "epoch": 1.3690266525771986, "grad_norm": 0.7227855920791626, "learning_rate": 2.2734961914444225e-05, "loss": 0.0087, "step": 23320 }, { "epoch": 1.3696137137489726, "grad_norm": 0.5397270917892456, "learning_rate": 2.2696210479071524e-05, "loss": 0.0107, "step": 23330 }, { "epoch": 1.3702007749207468, "grad_norm": 0.44219905138015747, "learning_rate": 2.2657482398850287e-05, "loss": 0.013, "step": 23340 }, { "epoch": 1.370787836092521, "grad_norm": 3.8944509029388428, "learning_rate": 2.261877770690781e-05, "loss": 0.0227, "step": 23350 }, { "epoch": 1.371374897264295, "grad_norm": 1.2879079580307007, "learning_rate": 2.2580096436351333e-05, "loss": 0.0126, "step": 23360 }, { "epoch": 1.371961958436069, "grad_norm": 0.7565488815307617, "learning_rate": 2.2541438620268124e-05, "loss": 0.0125, "step": 23370 }, { "epoch": 1.3725490196078431, "grad_norm": 0.37920913100242615, "learning_rate": 2.2502804291725315e-05, "loss": 0.0124, "step": 23380 }, { "epoch": 1.3731360807796174, "grad_norm": 1.009751319885254, "learning_rate": 2.246419348377001e-05, "loss": 0.0075, "step": 23390 }, { "epoch": 1.3737231419513913, "grad_norm": 0.5621313452720642, "learning_rate": 2.242560622942918e-05, "loss": 0.0449, "step": 23400 }, { "epoch": 1.3743102031231653, "grad_norm": 0.046052515506744385, "learning_rate": 2.2387042561709654e-05, "loss": 0.0149, "step": 23410 }, { "epoch": 1.3748972642949395, "grad_norm": 0.5175926685333252, "learning_rate": 2.2348502513598035e-05, "loss": 0.0184, "step": 23420 }, { "epoch": 1.3754843254667137, "grad_norm": 0.031084802001714706, "learning_rate": 2.2309986118060784e-05, "loss": 0.0153, "step": 23430 }, { "epoch": 1.3760713866384877, "grad_norm": 2.160630941390991, "learning_rate": 2.227149340804412e-05, "loss": 0.016, "step": 23440 }, { "epoch": 1.3766584478102617, "grad_norm": 1.9437071084976196, "learning_rate": 2.2233024416473948e-05, "loss": 0.0148, "step": 23450 }, { "epoch": 1.377245508982036, "grad_norm": 2.112046957015991, "learning_rate": 2.2194579176255954e-05, "loss": 0.0149, "step": 23460 }, { "epoch": 1.37783257015381, "grad_norm": 0.9080813527107239, "learning_rate": 2.215615772027546e-05, "loss": 0.0205, "step": 23470 }, { "epoch": 1.378419631325584, "grad_norm": 0.025609241798520088, "learning_rate": 2.2117760081397506e-05, "loss": 0.0099, "step": 23480 }, { "epoch": 1.3790066924973583, "grad_norm": 2.6546828746795654, "learning_rate": 2.2079386292466652e-05, "loss": 0.0272, "step": 23490 }, { "epoch": 1.3795937536691323, "grad_norm": 0.9885413646697998, "learning_rate": 2.2041036386307173e-05, "loss": 0.0115, "step": 23500 }, { "epoch": 1.3801808148409065, "grad_norm": 2.635192394256592, "learning_rate": 2.2002710395722805e-05, "loss": 0.0285, "step": 23510 }, { "epoch": 1.3807678760126805, "grad_norm": 0.9918876886367798, "learning_rate": 2.196440835349695e-05, "loss": 0.0194, "step": 23520 }, { "epoch": 1.3813549371844547, "grad_norm": 0.0278923436999321, "learning_rate": 2.192613029239241e-05, "loss": 0.0171, "step": 23530 }, { "epoch": 1.3819419983562287, "grad_norm": 0.4955386221408844, "learning_rate": 2.188787624515156e-05, "loss": 0.0066, "step": 23540 }, { "epoch": 1.3825290595280029, "grad_norm": 0.3491193950176239, "learning_rate": 2.184964624449617e-05, "loss": 0.0185, "step": 23550 }, { "epoch": 1.3831161206997769, "grad_norm": 1.2932661771774292, "learning_rate": 2.181144032312747e-05, "loss": 0.011, "step": 23560 }, { "epoch": 1.383703181871551, "grad_norm": 1.398934245109558, "learning_rate": 2.1773258513726098e-05, "loss": 0.0151, "step": 23570 }, { "epoch": 1.384290243043325, "grad_norm": 0.3947453498840332, "learning_rate": 2.173510084895206e-05, "loss": 0.0057, "step": 23580 }, { "epoch": 1.3848773042150992, "grad_norm": 1.3551521301269531, "learning_rate": 2.1696967361444733e-05, "loss": 0.0266, "step": 23590 }, { "epoch": 1.3854643653868732, "grad_norm": 1.1816303730010986, "learning_rate": 2.165885808382275e-05, "loss": 0.0166, "step": 23600 }, { "epoch": 1.3860514265586474, "grad_norm": 0.12572306394577026, "learning_rate": 2.16207730486841e-05, "loss": 0.0169, "step": 23610 }, { "epoch": 1.3866384877304214, "grad_norm": 4.830150604248047, "learning_rate": 2.1582712288605994e-05, "loss": 0.0065, "step": 23620 }, { "epoch": 1.3872255489021956, "grad_norm": 1.1418870687484741, "learning_rate": 2.1544675836144907e-05, "loss": 0.0227, "step": 23630 }, { "epoch": 1.3878126100739698, "grad_norm": 0.8437126278877258, "learning_rate": 2.1506663723836502e-05, "loss": 0.0271, "step": 23640 }, { "epoch": 1.3883996712457438, "grad_norm": 0.02604585886001587, "learning_rate": 2.146867598419565e-05, "loss": 0.015, "step": 23650 }, { "epoch": 1.3889867324175178, "grad_norm": 3.6496636867523193, "learning_rate": 2.1430712649716328e-05, "loss": 0.0263, "step": 23660 }, { "epoch": 1.389573793589292, "grad_norm": 1.259790301322937, "learning_rate": 2.1392773752871685e-05, "loss": 0.0111, "step": 23670 }, { "epoch": 1.3901608547610662, "grad_norm": 0.3626500964164734, "learning_rate": 2.13548593261139e-05, "loss": 0.0101, "step": 23680 }, { "epoch": 1.3907479159328402, "grad_norm": 2.913550853729248, "learning_rate": 2.1316969401874316e-05, "loss": 0.0116, "step": 23690 }, { "epoch": 1.3913349771046142, "grad_norm": 0.4137645959854126, "learning_rate": 2.1279104012563266e-05, "loss": 0.0151, "step": 23700 }, { "epoch": 1.3919220382763884, "grad_norm": 0.3663376271724701, "learning_rate": 2.1241263190570065e-05, "loss": 0.0032, "step": 23710 }, { "epoch": 1.3925090994481626, "grad_norm": 0.2907414734363556, "learning_rate": 2.120344696826308e-05, "loss": 0.0131, "step": 23720 }, { "epoch": 1.3930961606199366, "grad_norm": 0.5462959408760071, "learning_rate": 2.1165655377989557e-05, "loss": 0.0253, "step": 23730 }, { "epoch": 1.3936832217917106, "grad_norm": 0.25368309020996094, "learning_rate": 2.112788845207574e-05, "loss": 0.0133, "step": 23740 }, { "epoch": 1.3942702829634848, "grad_norm": 1.6873301267623901, "learning_rate": 2.1090146222826758e-05, "loss": 0.0089, "step": 23750 }, { "epoch": 1.394857344135259, "grad_norm": 1.7360011339187622, "learning_rate": 2.1052428722526614e-05, "loss": 0.0105, "step": 23760 }, { "epoch": 1.395444405307033, "grad_norm": 1.2504751682281494, "learning_rate": 2.1014735983438126e-05, "loss": 0.0188, "step": 23770 }, { "epoch": 1.3960314664788072, "grad_norm": 0.2804781198501587, "learning_rate": 2.0977068037802994e-05, "loss": 0.0131, "step": 23780 }, { "epoch": 1.3966185276505811, "grad_norm": 1.5608546733856201, "learning_rate": 2.093942491784164e-05, "loss": 0.0201, "step": 23790 }, { "epoch": 1.3972055888223553, "grad_norm": 1.196268916130066, "learning_rate": 2.090180665575329e-05, "loss": 0.0127, "step": 23800 }, { "epoch": 1.3977926499941293, "grad_norm": 1.2257963418960571, "learning_rate": 2.0864213283715927e-05, "loss": 0.0124, "step": 23810 }, { "epoch": 1.3983797111659035, "grad_norm": 0.7349857687950134, "learning_rate": 2.0826644833886215e-05, "loss": 0.0105, "step": 23820 }, { "epoch": 1.3989667723376775, "grad_norm": 0.45612555742263794, "learning_rate": 2.0789101338399485e-05, "loss": 0.0232, "step": 23830 }, { "epoch": 1.3995538335094517, "grad_norm": 0.11380612105131149, "learning_rate": 2.075158282936975e-05, "loss": 0.0119, "step": 23840 }, { "epoch": 1.4001408946812257, "grad_norm": 0.6474462151527405, "learning_rate": 2.0714089338889658e-05, "loss": 0.0163, "step": 23850 }, { "epoch": 1.400727955853, "grad_norm": 0.5076104998588562, "learning_rate": 2.067662089903039e-05, "loss": 0.0101, "step": 23860 }, { "epoch": 1.401315017024774, "grad_norm": 0.42162176966667175, "learning_rate": 2.063917754184182e-05, "loss": 0.0156, "step": 23870 }, { "epoch": 1.401902078196548, "grad_norm": 1.7585097551345825, "learning_rate": 2.0601759299352246e-05, "loss": 0.016, "step": 23880 }, { "epoch": 1.4024891393683223, "grad_norm": 0.926378607749939, "learning_rate": 2.056436620356857e-05, "loss": 0.0107, "step": 23890 }, { "epoch": 1.4030762005400963, "grad_norm": 1.3140865564346313, "learning_rate": 2.05269982864761e-05, "loss": 0.0071, "step": 23900 }, { "epoch": 1.4036632617118703, "grad_norm": 0.8328626751899719, "learning_rate": 2.048965558003869e-05, "loss": 0.0099, "step": 23910 }, { "epoch": 1.4042503228836445, "grad_norm": 3.2166967391967773, "learning_rate": 2.0452338116198576e-05, "loss": 0.0178, "step": 23920 }, { "epoch": 1.4048373840554187, "grad_norm": 0.309231698513031, "learning_rate": 2.041504592687645e-05, "loss": 0.0133, "step": 23930 }, { "epoch": 1.4054244452271927, "grad_norm": 0.9916130304336548, "learning_rate": 2.037777904397132e-05, "loss": 0.0125, "step": 23940 }, { "epoch": 1.4060115063989667, "grad_norm": 2.266409158706665, "learning_rate": 2.03405374993606e-05, "loss": 0.0073, "step": 23950 }, { "epoch": 1.4065985675707409, "grad_norm": 3.0142762660980225, "learning_rate": 2.0303321324899992e-05, "loss": 0.0148, "step": 23960 }, { "epoch": 1.407185628742515, "grad_norm": 0.2837011516094208, "learning_rate": 2.026613055242353e-05, "loss": 0.0119, "step": 23970 }, { "epoch": 1.407772689914289, "grad_norm": 0.03561149910092354, "learning_rate": 2.0228965213743506e-05, "loss": 0.0077, "step": 23980 }, { "epoch": 1.408359751086063, "grad_norm": 3.673842430114746, "learning_rate": 2.019182534065045e-05, "loss": 0.0126, "step": 23990 }, { "epoch": 1.4089468122578372, "grad_norm": 0.9790199995040894, "learning_rate": 2.0154710964913143e-05, "loss": 0.022, "step": 24000 }, { "epoch": 1.4089468122578372, "eval_loss": 0.4964119791984558, "eval_runtime": 269.6182, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 24000 }, { "epoch": 1.4095338734296115, "grad_norm": 2.58009934425354, "learning_rate": 2.0117622118278484e-05, "loss": 0.029, "step": 24010 }, { "epoch": 1.4101209346013854, "grad_norm": 1.0348402261734009, "learning_rate": 2.0080558832471625e-05, "loss": 0.01, "step": 24020 }, { "epoch": 1.4107079957731596, "grad_norm": 1.5390087366104126, "learning_rate": 2.0043521139195763e-05, "loss": 0.0073, "step": 24030 }, { "epoch": 1.4112950569449336, "grad_norm": 0.3770119845867157, "learning_rate": 2.000650907013228e-05, "loss": 0.0122, "step": 24040 }, { "epoch": 1.4118821181167078, "grad_norm": 0.5790906548500061, "learning_rate": 1.9969522656940593e-05, "loss": 0.0159, "step": 24050 }, { "epoch": 1.4124691792884818, "grad_norm": 2.1481640338897705, "learning_rate": 1.9932561931258213e-05, "loss": 0.0178, "step": 24060 }, { "epoch": 1.413056240460256, "grad_norm": 0.11280938982963562, "learning_rate": 1.9895626924700618e-05, "loss": 0.0279, "step": 24070 }, { "epoch": 1.41364330163203, "grad_norm": 2.8088388442993164, "learning_rate": 1.985871766886136e-05, "loss": 0.0196, "step": 24080 }, { "epoch": 1.4142303628038042, "grad_norm": 1.7610342502593994, "learning_rate": 1.982183419531188e-05, "loss": 0.0146, "step": 24090 }, { "epoch": 1.4148174239755782, "grad_norm": 0.11945214122533798, "learning_rate": 1.978497653560167e-05, "loss": 0.0222, "step": 24100 }, { "epoch": 1.4154044851473524, "grad_norm": 0.6273439526557922, "learning_rate": 1.9748144721258033e-05, "loss": 0.0194, "step": 24110 }, { "epoch": 1.4159915463191264, "grad_norm": 0.548652172088623, "learning_rate": 1.9711338783786237e-05, "loss": 0.0136, "step": 24120 }, { "epoch": 1.4165786074909006, "grad_norm": 0.7424293756484985, "learning_rate": 1.9674558754669413e-05, "loss": 0.017, "step": 24130 }, { "epoch": 1.4171656686626746, "grad_norm": 0.6504852771759033, "learning_rate": 1.963780466536847e-05, "loss": 0.0088, "step": 24140 }, { "epoch": 1.4177527298344488, "grad_norm": 0.025761183351278305, "learning_rate": 1.960107654732219e-05, "loss": 0.0252, "step": 24150 }, { "epoch": 1.4183397910062228, "grad_norm": 0.9394327998161316, "learning_rate": 1.956437443194712e-05, "loss": 0.0122, "step": 24160 }, { "epoch": 1.418926852177997, "grad_norm": 0.008449507877230644, "learning_rate": 1.952769835063758e-05, "loss": 0.0193, "step": 24170 }, { "epoch": 1.4195139133497712, "grad_norm": 0.12990213930606842, "learning_rate": 1.9491048334765566e-05, "loss": 0.0085, "step": 24180 }, { "epoch": 1.4201009745215452, "grad_norm": 3.595780611038208, "learning_rate": 1.9454424415680857e-05, "loss": 0.0172, "step": 24190 }, { "epoch": 1.4206880356933191, "grad_norm": 0.04440838843584061, "learning_rate": 1.9417826624710834e-05, "loss": 0.0062, "step": 24200 }, { "epoch": 1.4212750968650933, "grad_norm": 1.4020355939865112, "learning_rate": 1.938125499316058e-05, "loss": 0.0144, "step": 24210 }, { "epoch": 1.4218621580368676, "grad_norm": 1.042809247970581, "learning_rate": 1.9344709552312783e-05, "loss": 0.0166, "step": 24220 }, { "epoch": 1.4224492192086415, "grad_norm": 0.8555197715759277, "learning_rate": 1.930819033342775e-05, "loss": 0.0124, "step": 24230 }, { "epoch": 1.4230362803804155, "grad_norm": 1.24617338180542, "learning_rate": 1.9271697367743304e-05, "loss": 0.0175, "step": 24240 }, { "epoch": 1.4236233415521897, "grad_norm": 2.595890760421753, "learning_rate": 1.9235230686474864e-05, "loss": 0.0172, "step": 24250 }, { "epoch": 1.424210402723964, "grad_norm": 0.24006566405296326, "learning_rate": 1.9198790320815347e-05, "loss": 0.0148, "step": 24260 }, { "epoch": 1.424797463895738, "grad_norm": 2.9668405055999756, "learning_rate": 1.916237630193516e-05, "loss": 0.0095, "step": 24270 }, { "epoch": 1.425384525067512, "grad_norm": 2.8897407054901123, "learning_rate": 1.912598866098219e-05, "loss": 0.011, "step": 24280 }, { "epoch": 1.425971586239286, "grad_norm": 0.30364325642585754, "learning_rate": 1.908962742908172e-05, "loss": 0.0148, "step": 24290 }, { "epoch": 1.4265586474110603, "grad_norm": 0.215542733669281, "learning_rate": 1.905329263733649e-05, "loss": 0.0245, "step": 24300 }, { "epoch": 1.4271457085828343, "grad_norm": 0.6496206521987915, "learning_rate": 1.901698431682658e-05, "loss": 0.0142, "step": 24310 }, { "epoch": 1.4277327697546085, "grad_norm": 0.18006618320941925, "learning_rate": 1.8980702498609453e-05, "loss": 0.0177, "step": 24320 }, { "epoch": 1.4283198309263825, "grad_norm": 0.8954939842224121, "learning_rate": 1.8944447213719914e-05, "loss": 0.0144, "step": 24330 }, { "epoch": 1.4289068920981567, "grad_norm": 0.24192702770233154, "learning_rate": 1.890821849317006e-05, "loss": 0.0132, "step": 24340 }, { "epoch": 1.4294939532699307, "grad_norm": 0.33100923895835876, "learning_rate": 1.8872016367949237e-05, "loss": 0.0057, "step": 24350 }, { "epoch": 1.4300810144417049, "grad_norm": 0.46353206038475037, "learning_rate": 1.883584086902409e-05, "loss": 0.0122, "step": 24360 }, { "epoch": 1.4306680756134789, "grad_norm": 0.22060637176036835, "learning_rate": 1.8799692027338446e-05, "loss": 0.0176, "step": 24370 }, { "epoch": 1.431255136785253, "grad_norm": 2.436633825302124, "learning_rate": 1.8763569873813354e-05, "loss": 0.0124, "step": 24380 }, { "epoch": 1.431842197957027, "grad_norm": 0.6386492252349854, "learning_rate": 1.8727474439347027e-05, "loss": 0.0252, "step": 24390 }, { "epoch": 1.4324292591288013, "grad_norm": 0.03924378752708435, "learning_rate": 1.8691405754814833e-05, "loss": 0.0038, "step": 24400 }, { "epoch": 1.4330163203005752, "grad_norm": 0.5494101643562317, "learning_rate": 1.865536385106927e-05, "loss": 0.0107, "step": 24410 }, { "epoch": 1.4336033814723494, "grad_norm": 1.5377278327941895, "learning_rate": 1.861934875893987e-05, "loss": 0.0114, "step": 24420 }, { "epoch": 1.4341904426441237, "grad_norm": 0.10740471631288528, "learning_rate": 1.85833605092333e-05, "loss": 0.0196, "step": 24430 }, { "epoch": 1.4347775038158976, "grad_norm": 1.406084656715393, "learning_rate": 1.8547399132733195e-05, "loss": 0.0154, "step": 24440 }, { "epoch": 1.4353645649876716, "grad_norm": 0.4072183072566986, "learning_rate": 1.8511464660200307e-05, "loss": 0.0033, "step": 24450 }, { "epoch": 1.4359516261594458, "grad_norm": 0.1420377492904663, "learning_rate": 1.847555712237226e-05, "loss": 0.025, "step": 24460 }, { "epoch": 1.43653868733122, "grad_norm": 1.86223566532135, "learning_rate": 1.8439676549963737e-05, "loss": 0.0303, "step": 24470 }, { "epoch": 1.437125748502994, "grad_norm": 0.26109209656715393, "learning_rate": 1.840382297366626e-05, "loss": 0.0082, "step": 24480 }, { "epoch": 1.437712809674768, "grad_norm": 0.5081905126571655, "learning_rate": 1.8367996424148326e-05, "loss": 0.0096, "step": 24490 }, { "epoch": 1.4382998708465422, "grad_norm": 0.1958279013633728, "learning_rate": 1.8332196932055305e-05, "loss": 0.005, "step": 24500 }, { "epoch": 1.4388869320183164, "grad_norm": 3.2699451446533203, "learning_rate": 1.8296424528009425e-05, "loss": 0.0066, "step": 24510 }, { "epoch": 1.4394739931900904, "grad_norm": 0.007179723586887121, "learning_rate": 1.8260679242609703e-05, "loss": 0.0076, "step": 24520 }, { "epoch": 1.4400610543618644, "grad_norm": 2.6423394680023193, "learning_rate": 1.8224961106432003e-05, "loss": 0.0155, "step": 24530 }, { "epoch": 1.4406481155336386, "grad_norm": 0.479322612285614, "learning_rate": 1.818927015002897e-05, "loss": 0.0107, "step": 24540 }, { "epoch": 1.4412351767054128, "grad_norm": 1.1707197427749634, "learning_rate": 1.815360640392994e-05, "loss": 0.0154, "step": 24550 }, { "epoch": 1.4418222378771868, "grad_norm": 0.06962751597166061, "learning_rate": 1.8117969898641042e-05, "loss": 0.0132, "step": 24560 }, { "epoch": 1.442409299048961, "grad_norm": 1.9589660167694092, "learning_rate": 1.8082360664645065e-05, "loss": 0.0059, "step": 24570 }, { "epoch": 1.442996360220735, "grad_norm": 0.21840360760688782, "learning_rate": 1.8046778732401513e-05, "loss": 0.0248, "step": 24580 }, { "epoch": 1.4435834213925092, "grad_norm": 1.1731334924697876, "learning_rate": 1.8011224132346466e-05, "loss": 0.0153, "step": 24590 }, { "epoch": 1.4441704825642832, "grad_norm": 0.37557175755500793, "learning_rate": 1.7975696894892698e-05, "loss": 0.014, "step": 24600 }, { "epoch": 1.4447575437360574, "grad_norm": 1.0333017110824585, "learning_rate": 1.7940197050429492e-05, "loss": 0.0177, "step": 24610 }, { "epoch": 1.4453446049078313, "grad_norm": 0.058123670518398285, "learning_rate": 1.7904724629322817e-05, "loss": 0.0157, "step": 24620 }, { "epoch": 1.4459316660796055, "grad_norm": 1.601378321647644, "learning_rate": 1.7869279661915077e-05, "loss": 0.0056, "step": 24630 }, { "epoch": 1.4465187272513795, "grad_norm": 2.2808120250701904, "learning_rate": 1.7833862178525267e-05, "loss": 0.0192, "step": 24640 }, { "epoch": 1.4471057884231537, "grad_norm": 1.0641933679580688, "learning_rate": 1.77984722094488e-05, "loss": 0.0049, "step": 24650 }, { "epoch": 1.4476928495949277, "grad_norm": 0.20524010062217712, "learning_rate": 1.776310978495762e-05, "loss": 0.0119, "step": 24660 }, { "epoch": 1.448279910766702, "grad_norm": 0.057442184537649155, "learning_rate": 1.7727774935300078e-05, "loss": 0.0128, "step": 24670 }, { "epoch": 1.448866971938476, "grad_norm": 0.015572305768728256, "learning_rate": 1.769246769070095e-05, "loss": 0.0116, "step": 24680 }, { "epoch": 1.4494540331102501, "grad_norm": 0.7050707936286926, "learning_rate": 1.7657188081361402e-05, "loss": 0.0087, "step": 24690 }, { "epoch": 1.450041094282024, "grad_norm": 0.15074096620082855, "learning_rate": 1.762193613745893e-05, "loss": 0.0097, "step": 24700 }, { "epoch": 1.4506281554537983, "grad_norm": 0.49033835530281067, "learning_rate": 1.7586711889147407e-05, "loss": 0.0213, "step": 24710 }, { "epoch": 1.4512152166255725, "grad_norm": 1.2922837734222412, "learning_rate": 1.7551515366556975e-05, "loss": 0.0186, "step": 24720 }, { "epoch": 1.4518022777973465, "grad_norm": 1.3437488079071045, "learning_rate": 1.7516346599794092e-05, "loss": 0.0034, "step": 24730 }, { "epoch": 1.4523893389691205, "grad_norm": 2.089573621749878, "learning_rate": 1.748120561894147e-05, "loss": 0.0247, "step": 24740 }, { "epoch": 1.4529764001408947, "grad_norm": 0.4231824278831482, "learning_rate": 1.7446092454058066e-05, "loss": 0.0124, "step": 24750 }, { "epoch": 1.453563461312669, "grad_norm": 1.7569494247436523, "learning_rate": 1.7411007135178987e-05, "loss": 0.0256, "step": 24760 }, { "epoch": 1.4541505224844429, "grad_norm": 0.38075587153434753, "learning_rate": 1.7375949692315584e-05, "loss": 0.0145, "step": 24770 }, { "epoch": 1.4547375836562169, "grad_norm": 0.051655061542987823, "learning_rate": 1.7340920155455327e-05, "loss": 0.007, "step": 24780 }, { "epoch": 1.455324644827991, "grad_norm": 0.28470665216445923, "learning_rate": 1.7305918554561824e-05, "loss": 0.0074, "step": 24790 }, { "epoch": 1.4559117059997653, "grad_norm": 0.8261436223983765, "learning_rate": 1.72709449195748e-05, "loss": 0.0155, "step": 24800 }, { "epoch": 1.4564987671715393, "grad_norm": 0.9011455178260803, "learning_rate": 1.7235999280410047e-05, "loss": 0.0075, "step": 24810 }, { "epoch": 1.4570858283433132, "grad_norm": 0.8218479752540588, "learning_rate": 1.720108166695943e-05, "loss": 0.0075, "step": 24820 }, { "epoch": 1.4576728895150874, "grad_norm": 0.9791045188903809, "learning_rate": 1.716619210909079e-05, "loss": 0.0082, "step": 24830 }, { "epoch": 1.4582599506868617, "grad_norm": 0.06882752478122711, "learning_rate": 1.7131330636648014e-05, "loss": 0.0085, "step": 24840 }, { "epoch": 1.4588470118586356, "grad_norm": 0.153967022895813, "learning_rate": 1.709649727945096e-05, "loss": 0.0082, "step": 24850 }, { "epoch": 1.4594340730304098, "grad_norm": 1.007752776145935, "learning_rate": 1.7061692067295447e-05, "loss": 0.0101, "step": 24860 }, { "epoch": 1.4600211342021838, "grad_norm": 1.0477262735366821, "learning_rate": 1.7026915029953168e-05, "loss": 0.0139, "step": 24870 }, { "epoch": 1.460608195373958, "grad_norm": 0.3526778221130371, "learning_rate": 1.6992166197171787e-05, "loss": 0.0264, "step": 24880 }, { "epoch": 1.461195256545732, "grad_norm": 0.5737596750259399, "learning_rate": 1.695744559867477e-05, "loss": 0.0222, "step": 24890 }, { "epoch": 1.4617823177175062, "grad_norm": 0.39420345425605774, "learning_rate": 1.692275326416149e-05, "loss": 0.0419, "step": 24900 }, { "epoch": 1.4623693788892802, "grad_norm": 2.024820566177368, "learning_rate": 1.6888089223307113e-05, "loss": 0.0183, "step": 24910 }, { "epoch": 1.4629564400610544, "grad_norm": 0.5054715871810913, "learning_rate": 1.685345350576264e-05, "loss": 0.0106, "step": 24920 }, { "epoch": 1.4635435012328284, "grad_norm": 1.0659489631652832, "learning_rate": 1.681884614115477e-05, "loss": 0.0191, "step": 24930 }, { "epoch": 1.4641305624046026, "grad_norm": 0.8166112303733826, "learning_rate": 1.6784267159086026e-05, "loss": 0.0146, "step": 24940 }, { "epoch": 1.4647176235763766, "grad_norm": 1.2386140823364258, "learning_rate": 1.6749716589134627e-05, "loss": 0.0148, "step": 24950 }, { "epoch": 1.4653046847481508, "grad_norm": 0.14238373935222626, "learning_rate": 1.6715194460854468e-05, "loss": 0.0142, "step": 24960 }, { "epoch": 1.465891745919925, "grad_norm": 1.0595828294754028, "learning_rate": 1.6680700803775135e-05, "loss": 0.0098, "step": 24970 }, { "epoch": 1.466478807091699, "grad_norm": 0.2660886347293854, "learning_rate": 1.6646235647401863e-05, "loss": 0.0105, "step": 24980 }, { "epoch": 1.467065868263473, "grad_norm": 0.1357952356338501, "learning_rate": 1.6611799021215525e-05, "loss": 0.0114, "step": 24990 }, { "epoch": 1.4676529294352472, "grad_norm": 1.4781466722488403, "learning_rate": 1.6577390954672523e-05, "loss": 0.0161, "step": 25000 }, { "epoch": 1.4682399906070214, "grad_norm": 1.8186302185058594, "learning_rate": 1.6543011477204912e-05, "loss": 0.0095, "step": 25010 }, { "epoch": 1.4688270517787954, "grad_norm": 0.9752593040466309, "learning_rate": 1.650866061822021e-05, "loss": 0.0233, "step": 25020 }, { "epoch": 1.4694141129505693, "grad_norm": 0.6816263198852539, "learning_rate": 1.6474338407101564e-05, "loss": 0.0072, "step": 25030 }, { "epoch": 1.4700011741223435, "grad_norm": 0.04612405598163605, "learning_rate": 1.6440044873207494e-05, "loss": 0.0051, "step": 25040 }, { "epoch": 1.4705882352941178, "grad_norm": 1.4814155101776123, "learning_rate": 1.6405780045872092e-05, "loss": 0.012, "step": 25050 }, { "epoch": 1.4711752964658917, "grad_norm": 0.811060905456543, "learning_rate": 1.637154395440482e-05, "loss": 0.0145, "step": 25060 }, { "epoch": 1.4717623576376657, "grad_norm": 1.7140158414840698, "learning_rate": 1.63373366280906e-05, "loss": 0.007, "step": 25070 }, { "epoch": 1.47234941880944, "grad_norm": 0.8108124732971191, "learning_rate": 1.6303158096189734e-05, "loss": 0.0166, "step": 25080 }, { "epoch": 1.4729364799812141, "grad_norm": 0.11509488523006439, "learning_rate": 1.6269008387937917e-05, "loss": 0.0233, "step": 25090 }, { "epoch": 1.4735235411529881, "grad_norm": 2.0266237258911133, "learning_rate": 1.623488753254618e-05, "loss": 0.0143, "step": 25100 }, { "epoch": 1.4741106023247623, "grad_norm": 0.528995931148529, "learning_rate": 1.620079555920082e-05, "loss": 0.0107, "step": 25110 }, { "epoch": 1.4746976634965363, "grad_norm": 0.49367189407348633, "learning_rate": 1.6166732497063524e-05, "loss": 0.015, "step": 25120 }, { "epoch": 1.4752847246683105, "grad_norm": 0.20882639288902283, "learning_rate": 1.6132698375271164e-05, "loss": 0.0122, "step": 25130 }, { "epoch": 1.4758717858400845, "grad_norm": 0.6732875108718872, "learning_rate": 1.60986932229359e-05, "loss": 0.0048, "step": 25140 }, { "epoch": 1.4764588470118587, "grad_norm": 2.245265483856201, "learning_rate": 1.6064717069145114e-05, "loss": 0.0074, "step": 25150 }, { "epoch": 1.4770459081836327, "grad_norm": 0.05271900072693825, "learning_rate": 1.6030769942961378e-05, "loss": 0.0127, "step": 25160 }, { "epoch": 1.4776329693554069, "grad_norm": 0.015161091461777687, "learning_rate": 1.5996851873422403e-05, "loss": 0.0224, "step": 25170 }, { "epoch": 1.4782200305271809, "grad_norm": 0.0022124226670712233, "learning_rate": 1.5962962889541105e-05, "loss": 0.0105, "step": 25180 }, { "epoch": 1.478807091698955, "grad_norm": 0.467401921749115, "learning_rate": 1.592910302030544e-05, "loss": 0.0164, "step": 25190 }, { "epoch": 1.479394152870729, "grad_norm": 0.448667049407959, "learning_rate": 1.589527229467857e-05, "loss": 0.0096, "step": 25200 }, { "epoch": 1.4799812140425033, "grad_norm": 0.21955633163452148, "learning_rate": 1.5861470741598618e-05, "loss": 0.0076, "step": 25210 }, { "epoch": 1.4805682752142773, "grad_norm": 1.0934991836547852, "learning_rate": 1.582769838997882e-05, "loss": 0.0106, "step": 25220 }, { "epoch": 1.4811553363860515, "grad_norm": 0.0747188851237297, "learning_rate": 1.579395526870742e-05, "loss": 0.0043, "step": 25230 }, { "epoch": 1.4817423975578254, "grad_norm": 1.3697134256362915, "learning_rate": 1.576024140664764e-05, "loss": 0.0118, "step": 25240 }, { "epoch": 1.4823294587295996, "grad_norm": 0.2039581835269928, "learning_rate": 1.5726556832637686e-05, "loss": 0.0177, "step": 25250 }, { "epoch": 1.4829165199013739, "grad_norm": 0.10999488830566406, "learning_rate": 1.5692901575490725e-05, "loss": 0.0248, "step": 25260 }, { "epoch": 1.4835035810731478, "grad_norm": 0.8862336874008179, "learning_rate": 1.5659275663994842e-05, "loss": 0.0131, "step": 25270 }, { "epoch": 1.4840906422449218, "grad_norm": 1.6316841840744019, "learning_rate": 1.562567912691299e-05, "loss": 0.0294, "step": 25280 }, { "epoch": 1.484677703416696, "grad_norm": 0.7727767825126648, "learning_rate": 1.5592111992983042e-05, "loss": 0.0119, "step": 25290 }, { "epoch": 1.4852647645884702, "grad_norm": 0.19585594534873962, "learning_rate": 1.5558574290917676e-05, "loss": 0.0149, "step": 25300 }, { "epoch": 1.4858518257602442, "grad_norm": 1.503710150718689, "learning_rate": 1.5525066049404425e-05, "loss": 0.0242, "step": 25310 }, { "epoch": 1.4864388869320182, "grad_norm": 1.8903816938400269, "learning_rate": 1.5491587297105616e-05, "loss": 0.0131, "step": 25320 }, { "epoch": 1.4870259481037924, "grad_norm": 1.395975112915039, "learning_rate": 1.5458138062658362e-05, "loss": 0.0335, "step": 25330 }, { "epoch": 1.4876130092755666, "grad_norm": 1.6451828479766846, "learning_rate": 1.5424718374674478e-05, "loss": 0.0164, "step": 25340 }, { "epoch": 1.4882000704473406, "grad_norm": 2.0035738945007324, "learning_rate": 1.539132826174058e-05, "loss": 0.0139, "step": 25350 }, { "epoch": 1.4887871316191146, "grad_norm": 0.17400747537612915, "learning_rate": 1.5357967752417908e-05, "loss": 0.0036, "step": 25360 }, { "epoch": 1.4893741927908888, "grad_norm": 0.5486733913421631, "learning_rate": 1.5324636875242425e-05, "loss": 0.0051, "step": 25370 }, { "epoch": 1.489961253962663, "grad_norm": 0.12028662115335464, "learning_rate": 1.5291335658724787e-05, "loss": 0.0198, "step": 25380 }, { "epoch": 1.490548315134437, "grad_norm": 0.26788875460624695, "learning_rate": 1.5258064131350175e-05, "loss": 0.0259, "step": 25390 }, { "epoch": 1.4911353763062112, "grad_norm": 0.16083145141601562, "learning_rate": 1.522482232157848e-05, "loss": 0.017, "step": 25400 }, { "epoch": 1.4917224374779852, "grad_norm": 1.3062677383422852, "learning_rate": 1.519161025784408e-05, "loss": 0.0248, "step": 25410 }, { "epoch": 1.4923094986497594, "grad_norm": 0.6412332653999329, "learning_rate": 1.5158427968555977e-05, "loss": 0.0124, "step": 25420 }, { "epoch": 1.4928965598215334, "grad_norm": 0.12453275173902512, "learning_rate": 1.5125275482097678e-05, "loss": 0.0129, "step": 25430 }, { "epoch": 1.4934836209933076, "grad_norm": 0.048859767615795135, "learning_rate": 1.5092152826827216e-05, "loss": 0.0191, "step": 25440 }, { "epoch": 1.4940706821650815, "grad_norm": 0.10178660601377487, "learning_rate": 1.5059060031077066e-05, "loss": 0.0117, "step": 25450 }, { "epoch": 1.4946577433368557, "grad_norm": 0.8472589254379272, "learning_rate": 1.5025997123154211e-05, "loss": 0.0073, "step": 25460 }, { "epoch": 1.4952448045086297, "grad_norm": 0.290914386510849, "learning_rate": 1.4992964131340014e-05, "loss": 0.0125, "step": 25470 }, { "epoch": 1.495831865680404, "grad_norm": 1.5838569402694702, "learning_rate": 1.49599610838903e-05, "loss": 0.0202, "step": 25480 }, { "epoch": 1.496418926852178, "grad_norm": 1.999612808227539, "learning_rate": 1.4926988009035258e-05, "loss": 0.0374, "step": 25490 }, { "epoch": 1.4970059880239521, "grad_norm": 0.9647329449653625, "learning_rate": 1.4894044934979435e-05, "loss": 0.0058, "step": 25500 }, { "epoch": 1.4975930491957261, "grad_norm": 0.10217466950416565, "learning_rate": 1.4861131889901741e-05, "loss": 0.0112, "step": 25510 }, { "epoch": 1.4981801103675003, "grad_norm": 1.8786249160766602, "learning_rate": 1.4828248901955349e-05, "loss": 0.0109, "step": 25520 }, { "epoch": 1.4987671715392743, "grad_norm": 0.14015641808509827, "learning_rate": 1.4795395999267785e-05, "loss": 0.0089, "step": 25530 }, { "epoch": 1.4993542327110485, "grad_norm": 1.404290795326233, "learning_rate": 1.4762573209940761e-05, "loss": 0.0155, "step": 25540 }, { "epoch": 1.4999412938828227, "grad_norm": 0.03318443521857262, "learning_rate": 1.4729780562050333e-05, "loss": 0.0197, "step": 25550 }, { "epoch": 1.5005283550545967, "grad_norm": 0.7886251211166382, "learning_rate": 1.469701808364668e-05, "loss": 0.0148, "step": 25560 }, { "epoch": 1.5011154162263707, "grad_norm": 0.5714460015296936, "learning_rate": 1.466428580275424e-05, "loss": 0.0103, "step": 25570 }, { "epoch": 1.5017024773981449, "grad_norm": 0.2059546560049057, "learning_rate": 1.4631583747371568e-05, "loss": 0.0079, "step": 25580 }, { "epoch": 1.502289538569919, "grad_norm": 0.13180407881736755, "learning_rate": 1.459891194547141e-05, "loss": 0.009, "step": 25590 }, { "epoch": 1.502876599741693, "grad_norm": 1.722730040550232, "learning_rate": 1.4566270425000605e-05, "loss": 0.0214, "step": 25600 }, { "epoch": 1.503463660913467, "grad_norm": 2.9748826026916504, "learning_rate": 1.4533659213880124e-05, "loss": 0.0148, "step": 25610 }, { "epoch": 1.5040507220852413, "grad_norm": 1.225185513496399, "learning_rate": 1.4501078340004953e-05, "loss": 0.0254, "step": 25620 }, { "epoch": 1.5046377832570155, "grad_norm": 1.331903100013733, "learning_rate": 1.4468527831244188e-05, "loss": 0.0059, "step": 25630 }, { "epoch": 1.5052248444287895, "grad_norm": 0.04709402844309807, "learning_rate": 1.4436007715440908e-05, "loss": 0.0178, "step": 25640 }, { "epoch": 1.5058119056005634, "grad_norm": 0.08070889115333557, "learning_rate": 1.4403518020412221e-05, "loss": 0.0107, "step": 25650 }, { "epoch": 1.5063989667723376, "grad_norm": 0.9099550247192383, "learning_rate": 1.4371058773949204e-05, "loss": 0.0104, "step": 25660 }, { "epoch": 1.5069860279441119, "grad_norm": 1.0330448150634766, "learning_rate": 1.4338630003816889e-05, "loss": 0.0151, "step": 25670 }, { "epoch": 1.5075730891158858, "grad_norm": 0.6540343165397644, "learning_rate": 1.430623173775426e-05, "loss": 0.0077, "step": 25680 }, { "epoch": 1.5081601502876598, "grad_norm": 0.602891743183136, "learning_rate": 1.4273864003474157e-05, "loss": 0.0126, "step": 25690 }, { "epoch": 1.508747211459434, "grad_norm": 0.09980016201734543, "learning_rate": 1.4241526828663366e-05, "loss": 0.0084, "step": 25700 }, { "epoch": 1.5093342726312082, "grad_norm": 0.36208033561706543, "learning_rate": 1.4209220240982468e-05, "loss": 0.0074, "step": 25710 }, { "epoch": 1.5099213338029824, "grad_norm": 2.92419695854187, "learning_rate": 1.4176944268065928e-05, "loss": 0.012, "step": 25720 }, { "epoch": 1.5105083949747564, "grad_norm": 2.5434062480926514, "learning_rate": 1.4144698937522022e-05, "loss": 0.0137, "step": 25730 }, { "epoch": 1.5110954561465304, "grad_norm": 0.10986926406621933, "learning_rate": 1.4112484276932808e-05, "loss": 0.0143, "step": 25740 }, { "epoch": 1.5116825173183046, "grad_norm": 0.022298848256468773, "learning_rate": 1.4080300313854072e-05, "loss": 0.0067, "step": 25750 }, { "epoch": 1.5122695784900788, "grad_norm": 1.5028719902038574, "learning_rate": 1.404814707581542e-05, "loss": 0.0118, "step": 25760 }, { "epoch": 1.5128566396618528, "grad_norm": 0.846768856048584, "learning_rate": 1.401602459032007e-05, "loss": 0.0221, "step": 25770 }, { "epoch": 1.5134437008336268, "grad_norm": 1.592557430267334, "learning_rate": 1.3983932884845046e-05, "loss": 0.01, "step": 25780 }, { "epoch": 1.514030762005401, "grad_norm": 0.569669783115387, "learning_rate": 1.3951871986840997e-05, "loss": 0.0094, "step": 25790 }, { "epoch": 1.5146178231771752, "grad_norm": 1.4314780235290527, "learning_rate": 1.3919841923732186e-05, "loss": 0.0183, "step": 25800 }, { "epoch": 1.5152048843489492, "grad_norm": 0.031544461846351624, "learning_rate": 1.3887842722916555e-05, "loss": 0.0048, "step": 25810 }, { "epoch": 1.5157919455207232, "grad_norm": 0.037735532969236374, "learning_rate": 1.3855874411765602e-05, "loss": 0.0081, "step": 25820 }, { "epoch": 1.5163790066924974, "grad_norm": 0.6033819317817688, "learning_rate": 1.3823937017624427e-05, "loss": 0.0074, "step": 25830 }, { "epoch": 1.5169660678642716, "grad_norm": 0.6098164916038513, "learning_rate": 1.3792030567811687e-05, "loss": 0.0079, "step": 25840 }, { "epoch": 1.5175531290360456, "grad_norm": 1.4705510139465332, "learning_rate": 1.3760155089619575e-05, "loss": 0.0129, "step": 25850 }, { "epoch": 1.5181401902078195, "grad_norm": 0.5626569390296936, "learning_rate": 1.3728310610313755e-05, "loss": 0.009, "step": 25860 }, { "epoch": 1.5187272513795937, "grad_norm": 0.4385286867618561, "learning_rate": 1.369649715713342e-05, "loss": 0.0062, "step": 25870 }, { "epoch": 1.519314312551368, "grad_norm": 0.13057030737400055, "learning_rate": 1.366471475729118e-05, "loss": 0.0071, "step": 25880 }, { "epoch": 1.519901373723142, "grad_norm": 0.10085809230804443, "learning_rate": 1.3632963437973122e-05, "loss": 0.0145, "step": 25890 }, { "epoch": 1.520488434894916, "grad_norm": 0.1775263398885727, "learning_rate": 1.3601243226338734e-05, "loss": 0.0068, "step": 25900 }, { "epoch": 1.5210754960666901, "grad_norm": 1.0344690084457397, "learning_rate": 1.3569554149520886e-05, "loss": 0.0114, "step": 25910 }, { "epoch": 1.5216625572384643, "grad_norm": 0.1530085802078247, "learning_rate": 1.3537896234625835e-05, "loss": 0.0109, "step": 25920 }, { "epoch": 1.5222496184102383, "grad_norm": 1.2540932893753052, "learning_rate": 1.350626950873315e-05, "loss": 0.022, "step": 25930 }, { "epoch": 1.5228366795820123, "grad_norm": 2.722752809524536, "learning_rate": 1.3474673998895764e-05, "loss": 0.0089, "step": 25940 }, { "epoch": 1.5234237407537865, "grad_norm": 0.46137571334838867, "learning_rate": 1.3443109732139841e-05, "loss": 0.0181, "step": 25950 }, { "epoch": 1.5240108019255607, "grad_norm": 1.4344850778579712, "learning_rate": 1.3411576735464925e-05, "loss": 0.0193, "step": 25960 }, { "epoch": 1.5245978630973347, "grad_norm": 0.8635249137878418, "learning_rate": 1.3380075035843714e-05, "loss": 0.0113, "step": 25970 }, { "epoch": 1.5251849242691087, "grad_norm": 1.7438290119171143, "learning_rate": 1.3348604660222198e-05, "loss": 0.0258, "step": 25980 }, { "epoch": 1.5257719854408829, "grad_norm": 0.3975509703159332, "learning_rate": 1.3317165635519518e-05, "loss": 0.0149, "step": 25990 }, { "epoch": 1.526359046612657, "grad_norm": 1.7383859157562256, "learning_rate": 1.3285757988628045e-05, "loss": 0.0279, "step": 26000 }, { "epoch": 1.5269461077844313, "grad_norm": 0.4724438786506653, "learning_rate": 1.3254381746413291e-05, "loss": 0.013, "step": 26010 }, { "epoch": 1.5275331689562053, "grad_norm": 0.6692148447036743, "learning_rate": 1.3223036935713923e-05, "loss": 0.0224, "step": 26020 }, { "epoch": 1.5281202301279793, "grad_norm": 0.008498435840010643, "learning_rate": 1.3191723583341681e-05, "loss": 0.0307, "step": 26030 }, { "epoch": 1.5287072912997535, "grad_norm": 0.6550512909889221, "learning_rate": 1.3160441716081446e-05, "loss": 0.0156, "step": 26040 }, { "epoch": 1.5292943524715277, "grad_norm": 0.05668057128787041, "learning_rate": 1.3129191360691112e-05, "loss": 0.0086, "step": 26050 }, { "epoch": 1.5298814136433017, "grad_norm": 0.003640669398009777, "learning_rate": 1.309797254390167e-05, "loss": 0.0117, "step": 26060 }, { "epoch": 1.5304684748150756, "grad_norm": 1.0109336376190186, "learning_rate": 1.306678529241711e-05, "loss": 0.0055, "step": 26070 }, { "epoch": 1.5310555359868498, "grad_norm": 0.35992541909217834, "learning_rate": 1.3035629632914426e-05, "loss": 0.0069, "step": 26080 }, { "epoch": 1.531642597158624, "grad_norm": 0.16760732233524323, "learning_rate": 1.3004505592043598e-05, "loss": 0.0172, "step": 26090 }, { "epoch": 1.532229658330398, "grad_norm": 0.7822508811950684, "learning_rate": 1.2973413196427519e-05, "loss": 0.0155, "step": 26100 }, { "epoch": 1.532816719502172, "grad_norm": 1.6121885776519775, "learning_rate": 1.2942352472662078e-05, "loss": 0.0129, "step": 26110 }, { "epoch": 1.5334037806739462, "grad_norm": 1.1530961990356445, "learning_rate": 1.2911323447315993e-05, "loss": 0.0157, "step": 26120 }, { "epoch": 1.5339908418457204, "grad_norm": 2.8421037197113037, "learning_rate": 1.288032614693097e-05, "loss": 0.0151, "step": 26130 }, { "epoch": 1.5345779030174944, "grad_norm": 2.1462604999542236, "learning_rate": 1.2849360598021471e-05, "loss": 0.0069, "step": 26140 }, { "epoch": 1.5351649641892684, "grad_norm": 0.04612084850668907, "learning_rate": 1.2818426827074886e-05, "loss": 0.0184, "step": 26150 }, { "epoch": 1.5357520253610426, "grad_norm": 0.10645939409732819, "learning_rate": 1.2787524860551352e-05, "loss": 0.0112, "step": 26160 }, { "epoch": 1.5363390865328168, "grad_norm": 0.6683909296989441, "learning_rate": 1.2756654724883849e-05, "loss": 0.0072, "step": 26170 }, { "epoch": 1.5369261477045908, "grad_norm": 1.2476474046707153, "learning_rate": 1.2725816446478112e-05, "loss": 0.0117, "step": 26180 }, { "epoch": 1.5375132088763648, "grad_norm": 1.236338496208191, "learning_rate": 1.2695010051712625e-05, "loss": 0.0079, "step": 26190 }, { "epoch": 1.538100270048139, "grad_norm": 2.2373862266540527, "learning_rate": 1.2664235566938632e-05, "loss": 0.0095, "step": 26200 }, { "epoch": 1.5386873312199132, "grad_norm": 0.3796428143978119, "learning_rate": 1.2633493018480009e-05, "loss": 0.0054, "step": 26210 }, { "epoch": 1.5392743923916872, "grad_norm": 0.11374548822641373, "learning_rate": 1.2602782432633387e-05, "loss": 0.0106, "step": 26220 }, { "epoch": 1.5398614535634612, "grad_norm": 0.6043453812599182, "learning_rate": 1.2572103835668004e-05, "loss": 0.0124, "step": 26230 }, { "epoch": 1.5404485147352354, "grad_norm": 0.7459734082221985, "learning_rate": 1.2541457253825773e-05, "loss": 0.0186, "step": 26240 }, { "epoch": 1.5410355759070096, "grad_norm": 0.21875302493572235, "learning_rate": 1.2510842713321208e-05, "loss": 0.0096, "step": 26250 }, { "epoch": 1.5416226370787838, "grad_norm": 0.9383326172828674, "learning_rate": 1.248026024034143e-05, "loss": 0.0042, "step": 26260 }, { "epoch": 1.5422096982505578, "grad_norm": 3.49565052986145, "learning_rate": 1.2449709861046077e-05, "loss": 0.0154, "step": 26270 }, { "epoch": 1.5427967594223317, "grad_norm": 0.03930044546723366, "learning_rate": 1.2419191601567409e-05, "loss": 0.0069, "step": 26280 }, { "epoch": 1.543383820594106, "grad_norm": 0.2098531424999237, "learning_rate": 1.238870548801015e-05, "loss": 0.0105, "step": 26290 }, { "epoch": 1.5439708817658802, "grad_norm": 0.32755404710769653, "learning_rate": 1.235825154645156e-05, "loss": 0.0094, "step": 26300 }, { "epoch": 1.5445579429376541, "grad_norm": 0.014306893572211266, "learning_rate": 1.232782980294137e-05, "loss": 0.0096, "step": 26310 }, { "epoch": 1.5451450041094281, "grad_norm": 0.4917788505554199, "learning_rate": 1.2297440283501793e-05, "loss": 0.0141, "step": 26320 }, { "epoch": 1.5457320652812023, "grad_norm": 0.27238497138023376, "learning_rate": 1.2267083014127424e-05, "loss": 0.0143, "step": 26330 }, { "epoch": 1.5463191264529765, "grad_norm": 1.6253865957260132, "learning_rate": 1.2236758020785316e-05, "loss": 0.0097, "step": 26340 }, { "epoch": 1.5469061876247505, "grad_norm": 0.07669629901647568, "learning_rate": 1.2206465329414901e-05, "loss": 0.0054, "step": 26350 }, { "epoch": 1.5474932487965245, "grad_norm": 0.2042701095342636, "learning_rate": 1.217620496592799e-05, "loss": 0.0119, "step": 26360 }, { "epoch": 1.5480803099682987, "grad_norm": 0.7941452860832214, "learning_rate": 1.2145976956208738e-05, "loss": 0.0071, "step": 26370 }, { "epoch": 1.548667371140073, "grad_norm": 0.279593825340271, "learning_rate": 1.211578132611359e-05, "loss": 0.0208, "step": 26380 }, { "epoch": 1.549254432311847, "grad_norm": 0.46462684869766235, "learning_rate": 1.2085618101471363e-05, "loss": 0.0095, "step": 26390 }, { "epoch": 1.5498414934836209, "grad_norm": 0.02638358436524868, "learning_rate": 1.205548730808308e-05, "loss": 0.0221, "step": 26400 }, { "epoch": 1.550428554655395, "grad_norm": 0.19672612845897675, "learning_rate": 1.2025388971722068e-05, "loss": 0.0214, "step": 26410 }, { "epoch": 1.5510156158271693, "grad_norm": 3.860138416290283, "learning_rate": 1.1995323118133894e-05, "loss": 0.0256, "step": 26420 }, { "epoch": 1.5516026769989433, "grad_norm": 0.5015102028846741, "learning_rate": 1.196528977303633e-05, "loss": 0.0085, "step": 26430 }, { "epoch": 1.5521897381707173, "grad_norm": 0.4696909487247467, "learning_rate": 1.1935288962119317e-05, "loss": 0.0083, "step": 26440 }, { "epoch": 1.5527767993424915, "grad_norm": 1.9576561450958252, "learning_rate": 1.190532071104502e-05, "loss": 0.0203, "step": 26450 }, { "epoch": 1.5533638605142657, "grad_norm": 0.5189847946166992, "learning_rate": 1.1875385045447679e-05, "loss": 0.0038, "step": 26460 }, { "epoch": 1.5539509216860397, "grad_norm": 0.4064716696739197, "learning_rate": 1.1845481990933716e-05, "loss": 0.0147, "step": 26470 }, { "epoch": 1.5545379828578136, "grad_norm": 0.027080390602350235, "learning_rate": 1.1815611573081681e-05, "loss": 0.0145, "step": 26480 }, { "epoch": 1.5551250440295878, "grad_norm": 0.09110064059495926, "learning_rate": 1.1785773817442137e-05, "loss": 0.0109, "step": 26490 }, { "epoch": 1.555712105201362, "grad_norm": 2.345695734024048, "learning_rate": 1.1755968749537754e-05, "loss": 0.0311, "step": 26500 }, { "epoch": 1.556299166373136, "grad_norm": 0.7785966396331787, "learning_rate": 1.172619639486322e-05, "loss": 0.0075, "step": 26510 }, { "epoch": 1.55688622754491, "grad_norm": 1.9212803840637207, "learning_rate": 1.1696456778885262e-05, "loss": 0.0124, "step": 26520 }, { "epoch": 1.5574732887166842, "grad_norm": 0.225474551320076, "learning_rate": 1.166674992704258e-05, "loss": 0.0162, "step": 26530 }, { "epoch": 1.5580603498884584, "grad_norm": 1.1490429639816284, "learning_rate": 1.163707586474589e-05, "loss": 0.006, "step": 26540 }, { "epoch": 1.5586474110602326, "grad_norm": 0.9614079594612122, "learning_rate": 1.1607434617377788e-05, "loss": 0.0152, "step": 26550 }, { "epoch": 1.5592344722320066, "grad_norm": 0.6767802238464355, "learning_rate": 1.157782621029288e-05, "loss": 0.0115, "step": 26560 }, { "epoch": 1.5598215334037806, "grad_norm": 1.456064224243164, "learning_rate": 1.1548250668817612e-05, "loss": 0.0055, "step": 26570 }, { "epoch": 1.5604085945755548, "grad_norm": 1.0975860357284546, "learning_rate": 1.1518708018250369e-05, "loss": 0.0129, "step": 26580 }, { "epoch": 1.560995655747329, "grad_norm": 2.509904384613037, "learning_rate": 1.148919828386138e-05, "loss": 0.0148, "step": 26590 }, { "epoch": 1.561582716919103, "grad_norm": 0.10708583891391754, "learning_rate": 1.1459721490892732e-05, "loss": 0.0121, "step": 26600 }, { "epoch": 1.562169778090877, "grad_norm": 0.23943229019641876, "learning_rate": 1.1430277664558298e-05, "loss": 0.0047, "step": 26610 }, { "epoch": 1.5627568392626512, "grad_norm": 2.456552505493164, "learning_rate": 1.1400866830043789e-05, "loss": 0.0153, "step": 26620 }, { "epoch": 1.5633439004344254, "grad_norm": 2.6100728511810303, "learning_rate": 1.1371489012506698e-05, "loss": 0.0314, "step": 26630 }, { "epoch": 1.5639309616061994, "grad_norm": 0.43356645107269287, "learning_rate": 1.1342144237076236e-05, "loss": 0.0065, "step": 26640 }, { "epoch": 1.5645180227779734, "grad_norm": 0.14831143617630005, "learning_rate": 1.131283252885338e-05, "loss": 0.0137, "step": 26650 }, { "epoch": 1.5651050839497476, "grad_norm": 0.5145511627197266, "learning_rate": 1.1283553912910833e-05, "loss": 0.0057, "step": 26660 }, { "epoch": 1.5656921451215218, "grad_norm": 1.0553534030914307, "learning_rate": 1.1254308414292975e-05, "loss": 0.0113, "step": 26670 }, { "epoch": 1.5662792062932958, "grad_norm": 1.0165165662765503, "learning_rate": 1.1225096058015844e-05, "loss": 0.0088, "step": 26680 }, { "epoch": 1.5668662674650697, "grad_norm": 0.07155635952949524, "learning_rate": 1.1195916869067159e-05, "loss": 0.009, "step": 26690 }, { "epoch": 1.567453328636844, "grad_norm": 1.211666464805603, "learning_rate": 1.1166770872406223e-05, "loss": 0.0086, "step": 26700 }, { "epoch": 1.5680403898086182, "grad_norm": 0.4265386760234833, "learning_rate": 1.1137658092964026e-05, "loss": 0.0163, "step": 26710 }, { "epoch": 1.5686274509803921, "grad_norm": 1.1349551677703857, "learning_rate": 1.1108578555643056e-05, "loss": 0.0175, "step": 26720 }, { "epoch": 1.5692145121521661, "grad_norm": 0.4345262348651886, "learning_rate": 1.1079532285317435e-05, "loss": 0.0068, "step": 26730 }, { "epoch": 1.5698015733239403, "grad_norm": 0.5875182151794434, "learning_rate": 1.1050519306832768e-05, "loss": 0.0104, "step": 26740 }, { "epoch": 1.5703886344957145, "grad_norm": 0.3351311683654785, "learning_rate": 1.1021539645006229e-05, "loss": 0.0063, "step": 26750 }, { "epoch": 1.5709756956674885, "grad_norm": 0.3784141540527344, "learning_rate": 1.0992593324626488e-05, "loss": 0.0084, "step": 26760 }, { "epoch": 1.5715627568392625, "grad_norm": 3.271113872528076, "learning_rate": 1.0963680370453678e-05, "loss": 0.0175, "step": 26770 }, { "epoch": 1.5721498180110367, "grad_norm": 0.15095481276512146, "learning_rate": 1.0934800807219415e-05, "loss": 0.0176, "step": 26780 }, { "epoch": 1.572736879182811, "grad_norm": 1.0471665859222412, "learning_rate": 1.090595465962671e-05, "loss": 0.0032, "step": 26790 }, { "epoch": 1.573323940354585, "grad_norm": 1.388066291809082, "learning_rate": 1.0877141952350046e-05, "loss": 0.0117, "step": 26800 }, { "epoch": 1.573911001526359, "grad_norm": 0.2667527496814728, "learning_rate": 1.0848362710035253e-05, "loss": 0.0107, "step": 26810 }, { "epoch": 1.574498062698133, "grad_norm": 1.1280386447906494, "learning_rate": 1.0819616957299567e-05, "loss": 0.0071, "step": 26820 }, { "epoch": 1.5750851238699073, "grad_norm": 1.30109441280365, "learning_rate": 1.0790904718731565e-05, "loss": 0.0198, "step": 26830 }, { "epoch": 1.5756721850416815, "grad_norm": 0.5417740941047668, "learning_rate": 1.0762226018891175e-05, "loss": 0.0032, "step": 26840 }, { "epoch": 1.5762592462134555, "grad_norm": 2.700136661529541, "learning_rate": 1.0733580882309591e-05, "loss": 0.0186, "step": 26850 }, { "epoch": 1.5768463073852295, "grad_norm": 0.22245927155017853, "learning_rate": 1.0704969333489362e-05, "loss": 0.0077, "step": 26860 }, { "epoch": 1.5774333685570037, "grad_norm": 0.9244855642318726, "learning_rate": 1.0676391396904229e-05, "loss": 0.011, "step": 26870 }, { "epoch": 1.5780204297287779, "grad_norm": 1.6688724756240845, "learning_rate": 1.0647847096999276e-05, "loss": 0.015, "step": 26880 }, { "epoch": 1.5786074909005519, "grad_norm": 0.008767356164753437, "learning_rate": 1.0619336458190726e-05, "loss": 0.0107, "step": 26890 }, { "epoch": 1.5791945520723258, "grad_norm": 0.03255594149231911, "learning_rate": 1.0590859504866058e-05, "loss": 0.0051, "step": 26900 }, { "epoch": 1.5797816132441, "grad_norm": 0.20186622440814972, "learning_rate": 1.0562416261383945e-05, "loss": 0.0133, "step": 26910 }, { "epoch": 1.5803686744158743, "grad_norm": 0.10093547403812408, "learning_rate": 1.0534006752074171e-05, "loss": 0.0107, "step": 26920 }, { "epoch": 1.5809557355876482, "grad_norm": 0.24645297229290009, "learning_rate": 1.050563100123772e-05, "loss": 0.0183, "step": 26930 }, { "epoch": 1.5815427967594222, "grad_norm": 0.11304015666246414, "learning_rate": 1.0477289033146675e-05, "loss": 0.009, "step": 26940 }, { "epoch": 1.5821298579311964, "grad_norm": 1.0527604818344116, "learning_rate": 1.0448980872044239e-05, "loss": 0.013, "step": 26950 }, { "epoch": 1.5827169191029706, "grad_norm": 0.31132858991622925, "learning_rate": 1.0420706542144664e-05, "loss": 0.0129, "step": 26960 }, { "epoch": 1.5833039802747446, "grad_norm": 0.5216774344444275, "learning_rate": 1.03924660676333e-05, "loss": 0.0121, "step": 26970 }, { "epoch": 1.5838910414465186, "grad_norm": 0.040675628930330276, "learning_rate": 1.0364259472666504e-05, "loss": 0.0089, "step": 26980 }, { "epoch": 1.5844781026182928, "grad_norm": 1.0602043867111206, "learning_rate": 1.0336086781371679e-05, "loss": 0.0119, "step": 26990 }, { "epoch": 1.585065163790067, "grad_norm": 0.5476510524749756, "learning_rate": 1.030794801784722e-05, "loss": 0.0133, "step": 27000 }, { "epoch": 1.585065163790067, "eval_loss": 0.5030481219291687, "eval_runtime": 269.4941, "eval_samples_per_second": 3.507, "eval_steps_per_second": 3.507, "step": 27000 }, { "epoch": 1.585652224961841, "grad_norm": 0.3357763886451721, "learning_rate": 1.0279843206162509e-05, "loss": 0.009, "step": 27010 }, { "epoch": 1.586239286133615, "grad_norm": 0.47375836968421936, "learning_rate": 1.0251772370357854e-05, "loss": 0.0112, "step": 27020 }, { "epoch": 1.5868263473053892, "grad_norm": 0.6899937391281128, "learning_rate": 1.022373553444454e-05, "loss": 0.0112, "step": 27030 }, { "epoch": 1.5874134084771634, "grad_norm": 0.04963413625955582, "learning_rate": 1.019573272240476e-05, "loss": 0.0076, "step": 27040 }, { "epoch": 1.5880004696489374, "grad_norm": 0.4993203282356262, "learning_rate": 1.0167763958191556e-05, "loss": 0.0114, "step": 27050 }, { "epoch": 1.5885875308207114, "grad_norm": 0.4155748188495636, "learning_rate": 1.013982926572895e-05, "loss": 0.0086, "step": 27060 }, { "epoch": 1.5891745919924856, "grad_norm": 0.35768797993659973, "learning_rate": 1.0111928668911702e-05, "loss": 0.015, "step": 27070 }, { "epoch": 1.5897616531642598, "grad_norm": 1.7045468091964722, "learning_rate": 1.0084062191605498e-05, "loss": 0.0112, "step": 27080 }, { "epoch": 1.590348714336034, "grad_norm": 1.3340132236480713, "learning_rate": 1.0056229857646771e-05, "loss": 0.0196, "step": 27090 }, { "epoch": 1.590935775507808, "grad_norm": 0.4323832392692566, "learning_rate": 1.0028431690842793e-05, "loss": 0.0184, "step": 27100 }, { "epoch": 1.591522836679582, "grad_norm": 2.0426080226898193, "learning_rate": 1.00006677149716e-05, "loss": 0.0112, "step": 27110 }, { "epoch": 1.5921098978513561, "grad_norm": 1.3638851642608643, "learning_rate": 9.972937953781986e-06, "loss": 0.0161, "step": 27120 }, { "epoch": 1.5926969590231304, "grad_norm": 0.09961730241775513, "learning_rate": 9.945242430993446e-06, "loss": 0.0103, "step": 27130 }, { "epoch": 1.5932840201949043, "grad_norm": 4.037911891937256, "learning_rate": 9.917581170296241e-06, "loss": 0.0139, "step": 27140 }, { "epoch": 1.5938710813666783, "grad_norm": 0.10096555948257446, "learning_rate": 9.889954195351276e-06, "loss": 0.0058, "step": 27150 }, { "epoch": 1.5944581425384525, "grad_norm": 0.047120388597249985, "learning_rate": 9.862361529790149e-06, "loss": 0.003, "step": 27160 }, { "epoch": 1.5950452037102267, "grad_norm": 0.9101646542549133, "learning_rate": 9.83480319721512e-06, "loss": 0.0097, "step": 27170 }, { "epoch": 1.5956322648820007, "grad_norm": 2.343736171722412, "learning_rate": 9.807279221199067e-06, "loss": 0.0219, "step": 27180 }, { "epoch": 1.5962193260537747, "grad_norm": 0.00710842851549387, "learning_rate": 9.7797896252855e-06, "loss": 0.0058, "step": 27190 }, { "epoch": 1.596806387225549, "grad_norm": 0.5659670829772949, "learning_rate": 9.752334432988485e-06, "loss": 0.005, "step": 27200 }, { "epoch": 1.5973934483973231, "grad_norm": 0.3442319333553314, "learning_rate": 9.724913667792696e-06, "loss": 0.0044, "step": 27210 }, { "epoch": 1.597980509569097, "grad_norm": 0.1260579228401184, "learning_rate": 9.69752735315333e-06, "loss": 0.0093, "step": 27220 }, { "epoch": 1.598567570740871, "grad_norm": 0.743402898311615, "learning_rate": 9.670175512496127e-06, "loss": 0.0204, "step": 27230 }, { "epoch": 1.5991546319126453, "grad_norm": 0.012050657533109188, "learning_rate": 9.642858169217356e-06, "loss": 0.0248, "step": 27240 }, { "epoch": 1.5997416930844195, "grad_norm": 0.8421478867530823, "learning_rate": 9.615575346683758e-06, "loss": 0.0179, "step": 27250 }, { "epoch": 1.6003287542561935, "grad_norm": 0.0184626467525959, "learning_rate": 9.588327068232539e-06, "loss": 0.0115, "step": 27260 }, { "epoch": 1.6009158154279675, "grad_norm": 0.231166809797287, "learning_rate": 9.561113357171386e-06, "loss": 0.0049, "step": 27270 }, { "epoch": 1.6015028765997417, "grad_norm": 0.19372732937335968, "learning_rate": 9.533934236778364e-06, "loss": 0.0295, "step": 27280 }, { "epoch": 1.6020899377715159, "grad_norm": 0.2418680489063263, "learning_rate": 9.506789730302034e-06, "loss": 0.0183, "step": 27290 }, { "epoch": 1.6026769989432899, "grad_norm": 0.274222731590271, "learning_rate": 9.47967986096126e-06, "loss": 0.0057, "step": 27300 }, { "epoch": 1.6032640601150638, "grad_norm": 0.04827715456485748, "learning_rate": 9.45260465194533e-06, "loss": 0.0199, "step": 27310 }, { "epoch": 1.603851121286838, "grad_norm": 1.2527906894683838, "learning_rate": 9.425564126413889e-06, "loss": 0.0142, "step": 27320 }, { "epoch": 1.6044381824586122, "grad_norm": 0.2187308967113495, "learning_rate": 9.398558307496868e-06, "loss": 0.0074, "step": 27330 }, { "epoch": 1.6050252436303862, "grad_norm": 0.40605297684669495, "learning_rate": 9.37158721829456e-06, "loss": 0.0112, "step": 27340 }, { "epoch": 1.6056123048021604, "grad_norm": 3.947662591934204, "learning_rate": 9.344650881877515e-06, "loss": 0.0091, "step": 27350 }, { "epoch": 1.6061993659739344, "grad_norm": 0.09445164352655411, "learning_rate": 9.317749321286601e-06, "loss": 0.0195, "step": 27360 }, { "epoch": 1.6067864271457086, "grad_norm": 0.6971142888069153, "learning_rate": 9.290882559532877e-06, "loss": 0.0098, "step": 27370 }, { "epoch": 1.6073734883174828, "grad_norm": 2.9889259338378906, "learning_rate": 9.264050619597697e-06, "loss": 0.0086, "step": 27380 }, { "epoch": 1.6079605494892568, "grad_norm": 1.257043480873108, "learning_rate": 9.23725352443257e-06, "loss": 0.0119, "step": 27390 }, { "epoch": 1.6085476106610308, "grad_norm": 0.30998942255973816, "learning_rate": 9.210491296959256e-06, "loss": 0.0078, "step": 27400 }, { "epoch": 1.609134671832805, "grad_norm": 0.11986259371042252, "learning_rate": 9.183763960069652e-06, "loss": 0.0091, "step": 27410 }, { "epoch": 1.6097217330045792, "grad_norm": 1.7720203399658203, "learning_rate": 9.157071536625838e-06, "loss": 0.0111, "step": 27420 }, { "epoch": 1.6103087941763532, "grad_norm": 1.4282121658325195, "learning_rate": 9.130414049459995e-06, "loss": 0.0105, "step": 27430 }, { "epoch": 1.6108958553481272, "grad_norm": 0.14585615694522858, "learning_rate": 9.103791521374444e-06, "loss": 0.0161, "step": 27440 }, { "epoch": 1.6114829165199014, "grad_norm": 0.45412710309028625, "learning_rate": 9.077203975141607e-06, "loss": 0.0224, "step": 27450 }, { "epoch": 1.6120699776916756, "grad_norm": 0.833221435546875, "learning_rate": 9.050651433503965e-06, "loss": 0.0072, "step": 27460 }, { "epoch": 1.6126570388634496, "grad_norm": 0.06275784224271774, "learning_rate": 9.024133919174082e-06, "loss": 0.0073, "step": 27470 }, { "epoch": 1.6132441000352236, "grad_norm": 0.8522456288337708, "learning_rate": 8.997651454834527e-06, "loss": 0.033, "step": 27480 }, { "epoch": 1.6138311612069978, "grad_norm": 1.919840693473816, "learning_rate": 8.971204063137916e-06, "loss": 0.0105, "step": 27490 }, { "epoch": 1.614418222378772, "grad_norm": 1.106185793876648, "learning_rate": 8.944791766706844e-06, "loss": 0.0197, "step": 27500 }, { "epoch": 1.615005283550546, "grad_norm": 1.1418821811676025, "learning_rate": 8.918414588133894e-06, "loss": 0.0262, "step": 27510 }, { "epoch": 1.61559234472232, "grad_norm": 0.36705297231674194, "learning_rate": 8.892072549981622e-06, "loss": 0.0129, "step": 27520 }, { "epoch": 1.6161794058940941, "grad_norm": 2.395829677581787, "learning_rate": 8.865765674782528e-06, "loss": 0.0186, "step": 27530 }, { "epoch": 1.6167664670658684, "grad_norm": 0.783276379108429, "learning_rate": 8.839493985038988e-06, "loss": 0.0148, "step": 27540 }, { "epoch": 1.6173535282376423, "grad_norm": 2.0919573307037354, "learning_rate": 8.81325750322335e-06, "loss": 0.0105, "step": 27550 }, { "epoch": 1.6179405894094163, "grad_norm": 0.41778576374053955, "learning_rate": 8.78705625177777e-06, "loss": 0.0191, "step": 27560 }, { "epoch": 1.6185276505811905, "grad_norm": 0.07614164799451828, "learning_rate": 8.76089025311434e-06, "loss": 0.0056, "step": 27570 }, { "epoch": 1.6191147117529647, "grad_norm": 0.0933682918548584, "learning_rate": 8.734759529614956e-06, "loss": 0.0066, "step": 27580 }, { "epoch": 1.6197017729247387, "grad_norm": 0.5314574241638184, "learning_rate": 8.708664103631354e-06, "loss": 0.0114, "step": 27590 }, { "epoch": 1.6202888340965127, "grad_norm": 0.17074057459831238, "learning_rate": 8.682603997485078e-06, "loss": 0.0194, "step": 27600 }, { "epoch": 1.620875895268287, "grad_norm": 2.2320051193237305, "learning_rate": 8.656579233467443e-06, "loss": 0.0226, "step": 27610 }, { "epoch": 1.621462956440061, "grad_norm": 2.8677802085876465, "learning_rate": 8.63058983383957e-06, "loss": 0.0161, "step": 27620 }, { "epoch": 1.6220500176118353, "grad_norm": 2.6390633583068848, "learning_rate": 8.604635820832258e-06, "loss": 0.0304, "step": 27630 }, { "epoch": 1.6226370787836093, "grad_norm": 0.17321078479290009, "learning_rate": 8.578717216646143e-06, "loss": 0.0155, "step": 27640 }, { "epoch": 1.6232241399553833, "grad_norm": 0.2078569233417511, "learning_rate": 8.55283404345148e-06, "loss": 0.0057, "step": 27650 }, { "epoch": 1.6238112011271575, "grad_norm": 0.6215654015541077, "learning_rate": 8.526986323388263e-06, "loss": 0.0148, "step": 27660 }, { "epoch": 1.6243982622989317, "grad_norm": 0.284978985786438, "learning_rate": 8.501174078566143e-06, "loss": 0.0113, "step": 27670 }, { "epoch": 1.6249853234707057, "grad_norm": 0.7308915257453918, "learning_rate": 8.475397331064427e-06, "loss": 0.0065, "step": 27680 }, { "epoch": 1.6255723846424797, "grad_norm": 0.08350818604230881, "learning_rate": 8.449656102932075e-06, "loss": 0.0075, "step": 27690 }, { "epoch": 1.6261594458142539, "grad_norm": 0.8391403555870056, "learning_rate": 8.42395041618766e-06, "loss": 0.0047, "step": 27700 }, { "epoch": 1.626746506986028, "grad_norm": 0.010693466290831566, "learning_rate": 8.398280292819321e-06, "loss": 0.0215, "step": 27710 }, { "epoch": 1.627333568157802, "grad_norm": 0.9087989330291748, "learning_rate": 8.37264575478482e-06, "loss": 0.0181, "step": 27720 }, { "epoch": 1.627920629329576, "grad_norm": 1.3562119007110596, "learning_rate": 8.347046824011467e-06, "loss": 0.0159, "step": 27730 }, { "epoch": 1.6285076905013502, "grad_norm": 0.3279988467693329, "learning_rate": 8.321483522396084e-06, "loss": 0.0121, "step": 27740 }, { "epoch": 1.6290947516731245, "grad_norm": 1.1179426908493042, "learning_rate": 8.295955871805061e-06, "loss": 0.0123, "step": 27750 }, { "epoch": 1.6296818128448984, "grad_norm": 0.3496987521648407, "learning_rate": 8.27046389407427e-06, "loss": 0.0067, "step": 27760 }, { "epoch": 1.6302688740166724, "grad_norm": 1.15142023563385, "learning_rate": 8.245007611009087e-06, "loss": 0.0077, "step": 27770 }, { "epoch": 1.6308559351884466, "grad_norm": 0.46409231424331665, "learning_rate": 8.219587044384307e-06, "loss": 0.011, "step": 27780 }, { "epoch": 1.6314429963602208, "grad_norm": 0.42062243819236755, "learning_rate": 8.194202215944247e-06, "loss": 0.0252, "step": 27790 }, { "epoch": 1.6320300575319948, "grad_norm": 1.3170115947723389, "learning_rate": 8.168853147402566e-06, "loss": 0.0113, "step": 27800 }, { "epoch": 1.6326171187037688, "grad_norm": 0.15095262229442596, "learning_rate": 8.14353986044244e-06, "loss": 0.0195, "step": 27810 }, { "epoch": 1.633204179875543, "grad_norm": 0.6896628737449646, "learning_rate": 8.11826237671634e-06, "loss": 0.0104, "step": 27820 }, { "epoch": 1.6337912410473172, "grad_norm": 0.35963237285614014, "learning_rate": 8.093020717846177e-06, "loss": 0.0065, "step": 27830 }, { "epoch": 1.6343783022190912, "grad_norm": 0.4240446388721466, "learning_rate": 8.067814905423176e-06, "loss": 0.013, "step": 27840 }, { "epoch": 1.6349653633908652, "grad_norm": 0.8460161089897156, "learning_rate": 8.042644961007927e-06, "loss": 0.0116, "step": 27850 }, { "epoch": 1.6355524245626394, "grad_norm": 0.6835250854492188, "learning_rate": 8.017510906130332e-06, "loss": 0.0099, "step": 27860 }, { "epoch": 1.6361394857344136, "grad_norm": 0.409324049949646, "learning_rate": 7.992412762289592e-06, "loss": 0.0073, "step": 27870 }, { "epoch": 1.6367265469061876, "grad_norm": 0.9912254810333252, "learning_rate": 7.967350550954201e-06, "loss": 0.0042, "step": 27880 }, { "epoch": 1.6373136080779616, "grad_norm": 0.8225031495094299, "learning_rate": 7.942324293561876e-06, "loss": 0.0175, "step": 27890 }, { "epoch": 1.6379006692497358, "grad_norm": 0.7784579992294312, "learning_rate": 7.917334011519646e-06, "loss": 0.0086, "step": 27900 }, { "epoch": 1.63848773042151, "grad_norm": 0.9788603782653809, "learning_rate": 7.892379726203702e-06, "loss": 0.0111, "step": 27910 }, { "epoch": 1.6390747915932842, "grad_norm": 0.4499010443687439, "learning_rate": 7.86746145895948e-06, "loss": 0.0063, "step": 27920 }, { "epoch": 1.6396618527650582, "grad_norm": 0.7561082243919373, "learning_rate": 7.84257923110161e-06, "loss": 0.0132, "step": 27930 }, { "epoch": 1.6402489139368321, "grad_norm": 1.6390308141708374, "learning_rate": 7.81773306391389e-06, "loss": 0.0078, "step": 27940 }, { "epoch": 1.6408359751086063, "grad_norm": 0.8007199764251709, "learning_rate": 7.792922978649248e-06, "loss": 0.0064, "step": 27950 }, { "epoch": 1.6414230362803806, "grad_norm": 0.25197023153305054, "learning_rate": 7.768148996529789e-06, "loss": 0.0161, "step": 27960 }, { "epoch": 1.6420100974521545, "grad_norm": 2.500655174255371, "learning_rate": 7.743411138746686e-06, "loss": 0.0264, "step": 27970 }, { "epoch": 1.6425971586239285, "grad_norm": 1.3843352794647217, "learning_rate": 7.718709426460258e-06, "loss": 0.0192, "step": 27980 }, { "epoch": 1.6431842197957027, "grad_norm": 0.005819643381983042, "learning_rate": 7.694043880799889e-06, "loss": 0.0062, "step": 27990 }, { "epoch": 1.643771280967477, "grad_norm": 1.0938959121704102, "learning_rate": 7.669414522864028e-06, "loss": 0.0082, "step": 28000 }, { "epoch": 1.644358342139251, "grad_norm": 0.06952214986085892, "learning_rate": 7.644821373720168e-06, "loss": 0.0075, "step": 28010 }, { "epoch": 1.644945403311025, "grad_norm": 1.3579368591308594, "learning_rate": 7.620264454404819e-06, "loss": 0.0094, "step": 28020 }, { "epoch": 1.645532464482799, "grad_norm": 0.05804837495088577, "learning_rate": 7.595743785923515e-06, "loss": 0.0074, "step": 28030 }, { "epoch": 1.6461195256545733, "grad_norm": 1.416920781135559, "learning_rate": 7.571259389250779e-06, "loss": 0.0081, "step": 28040 }, { "epoch": 1.6467065868263473, "grad_norm": 0.14133776724338531, "learning_rate": 7.546811285330119e-06, "loss": 0.0111, "step": 28050 }, { "epoch": 1.6472936479981213, "grad_norm": 0.17169342935085297, "learning_rate": 7.522399495073962e-06, "loss": 0.0044, "step": 28060 }, { "epoch": 1.6478807091698955, "grad_norm": 0.22633597254753113, "learning_rate": 7.4980240393637216e-06, "loss": 0.0058, "step": 28070 }, { "epoch": 1.6484677703416697, "grad_norm": 2.8432278633117676, "learning_rate": 7.473684939049685e-06, "loss": 0.0147, "step": 28080 }, { "epoch": 1.6490548315134437, "grad_norm": 0.7656290531158447, "learning_rate": 7.449382214951073e-06, "loss": 0.0207, "step": 28090 }, { "epoch": 1.6496418926852177, "grad_norm": 0.3263416886329651, "learning_rate": 7.425115887855983e-06, "loss": 0.016, "step": 28100 }, { "epoch": 1.6502289538569919, "grad_norm": 0.5870920419692993, "learning_rate": 7.400885978521393e-06, "loss": 0.0145, "step": 28110 }, { "epoch": 1.650816015028766, "grad_norm": 2.001688241958618, "learning_rate": 7.376692507673083e-06, "loss": 0.0174, "step": 28120 }, { "epoch": 1.65140307620054, "grad_norm": 0.018592197448015213, "learning_rate": 7.3525354960057195e-06, "loss": 0.0076, "step": 28130 }, { "epoch": 1.651990137372314, "grad_norm": 1.0145572423934937, "learning_rate": 7.328414964182756e-06, "loss": 0.0047, "step": 28140 }, { "epoch": 1.6525771985440882, "grad_norm": 1.2617369890213013, "learning_rate": 7.304330932836434e-06, "loss": 0.0044, "step": 28150 }, { "epoch": 1.6531642597158624, "grad_norm": 0.3889461159706116, "learning_rate": 7.2802834225677905e-06, "loss": 0.0038, "step": 28160 }, { "epoch": 1.6537513208876367, "grad_norm": 1.0298255681991577, "learning_rate": 7.256272453946616e-06, "loss": 0.0267, "step": 28170 }, { "epoch": 1.6543383820594106, "grad_norm": 0.31365713477134705, "learning_rate": 7.23229804751146e-06, "loss": 0.0127, "step": 28180 }, { "epoch": 1.6549254432311846, "grad_norm": 0.2739083468914032, "learning_rate": 7.208360223769555e-06, "loss": 0.0156, "step": 28190 }, { "epoch": 1.6555125044029588, "grad_norm": 1.263667106628418, "learning_rate": 7.184459003196892e-06, "loss": 0.0195, "step": 28200 }, { "epoch": 1.656099565574733, "grad_norm": 3.5378830432891846, "learning_rate": 7.1605944062380916e-06, "loss": 0.0181, "step": 28210 }, { "epoch": 1.656686626746507, "grad_norm": 0.9924853444099426, "learning_rate": 7.136766453306537e-06, "loss": 0.0104, "step": 28220 }, { "epoch": 1.657273687918281, "grad_norm": 1.4844236373901367, "learning_rate": 7.112975164784175e-06, "loss": 0.0156, "step": 28230 }, { "epoch": 1.6578607490900552, "grad_norm": 0.9118503928184509, "learning_rate": 7.089220561021648e-06, "loss": 0.0095, "step": 28240 }, { "epoch": 1.6584478102618294, "grad_norm": 1.9157600402832031, "learning_rate": 7.065502662338186e-06, "loss": 0.0151, "step": 28250 }, { "epoch": 1.6590348714336034, "grad_norm": 0.635193943977356, "learning_rate": 7.041821489021639e-06, "loss": 0.0041, "step": 28260 }, { "epoch": 1.6596219326053774, "grad_norm": 2.84649920463562, "learning_rate": 7.018177061328451e-06, "loss": 0.0156, "step": 28270 }, { "epoch": 1.6602089937771516, "grad_norm": 0.9046671390533447, "learning_rate": 6.994569399483614e-06, "loss": 0.0065, "step": 28280 }, { "epoch": 1.6607960549489258, "grad_norm": 1.2423869371414185, "learning_rate": 6.9709985236807e-06, "loss": 0.0053, "step": 28290 }, { "epoch": 1.6613831161206998, "grad_norm": 0.06829379498958588, "learning_rate": 6.947464454081765e-06, "loss": 0.0298, "step": 28300 }, { "epoch": 1.6619701772924738, "grad_norm": 0.7184990644454956, "learning_rate": 6.923967210817439e-06, "loss": 0.0165, "step": 28310 }, { "epoch": 1.662557238464248, "grad_norm": 0.34448954463005066, "learning_rate": 6.900506813986806e-06, "loss": 0.0139, "step": 28320 }, { "epoch": 1.6631442996360222, "grad_norm": 1.090071439743042, "learning_rate": 6.8770832836574596e-06, "loss": 0.0158, "step": 28330 }, { "epoch": 1.6637313608077962, "grad_norm": 0.07841499894857407, "learning_rate": 6.853696639865448e-06, "loss": 0.0115, "step": 28340 }, { "epoch": 1.6643184219795701, "grad_norm": 0.96290123462677, "learning_rate": 6.830346902615281e-06, "loss": 0.0065, "step": 28350 }, { "epoch": 1.6649054831513443, "grad_norm": 0.41085779666900635, "learning_rate": 6.807034091879866e-06, "loss": 0.0064, "step": 28360 }, { "epoch": 1.6654925443231186, "grad_norm": 0.2559986412525177, "learning_rate": 6.783758227600567e-06, "loss": 0.0061, "step": 28370 }, { "epoch": 1.6660796054948925, "grad_norm": 0.7906329035758972, "learning_rate": 6.760519329687099e-06, "loss": 0.0251, "step": 28380 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0253071784973145, "learning_rate": 6.737317418017608e-06, "loss": 0.0212, "step": 28390 }, { "epoch": 1.6672537278384407, "grad_norm": 0.6800417900085449, "learning_rate": 6.7141525124385595e-06, "loss": 0.0096, "step": 28400 }, { "epoch": 1.667840789010215, "grad_norm": 0.6423472166061401, "learning_rate": 6.6910246327647864e-06, "loss": 0.0057, "step": 28410 }, { "epoch": 1.668427850181989, "grad_norm": 2.37115478515625, "learning_rate": 6.667933798779447e-06, "loss": 0.0332, "step": 28420 }, { "epoch": 1.669014911353763, "grad_norm": 0.19904451072216034, "learning_rate": 6.644880030234002e-06, "loss": 0.0067, "step": 28430 }, { "epoch": 1.669601972525537, "grad_norm": 1.2901134490966797, "learning_rate": 6.621863346848217e-06, "loss": 0.0116, "step": 28440 }, { "epoch": 1.6701890336973113, "grad_norm": 0.21249383687973022, "learning_rate": 6.598883768310133e-06, "loss": 0.0135, "step": 28450 }, { "epoch": 1.6707760948690855, "grad_norm": 0.33667126297950745, "learning_rate": 6.575941314276063e-06, "loss": 0.0046, "step": 28460 }, { "epoch": 1.6713631560408595, "grad_norm": 1.0641509294509888, "learning_rate": 6.553036004370533e-06, "loss": 0.0154, "step": 28470 }, { "epoch": 1.6719502172126335, "grad_norm": 1.4862817525863647, "learning_rate": 6.530167858186342e-06, "loss": 0.0084, "step": 28480 }, { "epoch": 1.6725372783844077, "grad_norm": 0.9070628881454468, "learning_rate": 6.507336895284449e-06, "loss": 0.0057, "step": 28490 }, { "epoch": 1.673124339556182, "grad_norm": 3.8373923301696777, "learning_rate": 6.484543135194043e-06, "loss": 0.0116, "step": 28500 }, { "epoch": 1.6737114007279559, "grad_norm": 2.346761465072632, "learning_rate": 6.461786597412489e-06, "loss": 0.017, "step": 28510 }, { "epoch": 1.6742984618997299, "grad_norm": 0.3607553541660309, "learning_rate": 6.439067301405305e-06, "loss": 0.0194, "step": 28520 }, { "epoch": 1.674885523071504, "grad_norm": 0.637547492980957, "learning_rate": 6.416385266606134e-06, "loss": 0.0172, "step": 28530 }, { "epoch": 1.6754725842432783, "grad_norm": 0.1487882435321808, "learning_rate": 6.393740512416785e-06, "loss": 0.0026, "step": 28540 }, { "epoch": 1.6760596454150523, "grad_norm": 0.23470908403396606, "learning_rate": 6.37113305820714e-06, "loss": 0.0146, "step": 28550 }, { "epoch": 1.6766467065868262, "grad_norm": 2.0933635234832764, "learning_rate": 6.348562923315194e-06, "loss": 0.0085, "step": 28560 }, { "epoch": 1.6772337677586004, "grad_norm": 0.010059098713099957, "learning_rate": 6.326030127047045e-06, "loss": 0.0031, "step": 28570 }, { "epoch": 1.6778208289303747, "grad_norm": 3.1899969577789307, "learning_rate": 6.303534688676799e-06, "loss": 0.0102, "step": 28580 }, { "epoch": 1.6784078901021486, "grad_norm": 0.803027868270874, "learning_rate": 6.281076627446652e-06, "loss": 0.0123, "step": 28590 }, { "epoch": 1.6789949512739226, "grad_norm": 2.3589420318603516, "learning_rate": 6.25865596256679e-06, "loss": 0.0248, "step": 28600 }, { "epoch": 1.6795820124456968, "grad_norm": 0.2844483256340027, "learning_rate": 6.236272713215441e-06, "loss": 0.0082, "step": 28610 }, { "epoch": 1.680169073617471, "grad_norm": 0.4956388771533966, "learning_rate": 6.213926898538825e-06, "loss": 0.0202, "step": 28620 }, { "epoch": 1.680756134789245, "grad_norm": 0.3350709080696106, "learning_rate": 6.1916185376511286e-06, "loss": 0.0208, "step": 28630 }, { "epoch": 1.681343195961019, "grad_norm": 0.4688805937767029, "learning_rate": 6.1693476496344996e-06, "loss": 0.0102, "step": 28640 }, { "epoch": 1.6819302571327932, "grad_norm": 0.46661171317100525, "learning_rate": 6.14711425353906e-06, "loss": 0.0055, "step": 28650 }, { "epoch": 1.6825173183045674, "grad_norm": 0.023438239470124245, "learning_rate": 6.124918368382815e-06, "loss": 0.0097, "step": 28660 }, { "epoch": 1.6831043794763414, "grad_norm": 0.024195751175284386, "learning_rate": 6.1027600131517205e-06, "loss": 0.0065, "step": 28670 }, { "epoch": 1.6836914406481154, "grad_norm": 0.900233805179596, "learning_rate": 6.080639206799626e-06, "loss": 0.0119, "step": 28680 }, { "epoch": 1.6842785018198896, "grad_norm": 0.1984151303768158, "learning_rate": 6.058555968248247e-06, "loss": 0.0068, "step": 28690 }, { "epoch": 1.6848655629916638, "grad_norm": 0.29011955857276917, "learning_rate": 6.036510316387195e-06, "loss": 0.0121, "step": 28700 }, { "epoch": 1.685452624163438, "grad_norm": 0.5753807425498962, "learning_rate": 6.014502270073874e-06, "loss": 0.0111, "step": 28710 }, { "epoch": 1.686039685335212, "grad_norm": 1.1721597909927368, "learning_rate": 5.9925318481335925e-06, "loss": 0.0216, "step": 28720 }, { "epoch": 1.686626746506986, "grad_norm": 2.9216232299804688, "learning_rate": 5.970599069359395e-06, "loss": 0.0171, "step": 28730 }, { "epoch": 1.6872138076787602, "grad_norm": 1.7761470079421997, "learning_rate": 5.948703952512214e-06, "loss": 0.0084, "step": 28740 }, { "epoch": 1.6878008688505344, "grad_norm": 0.9608684778213501, "learning_rate": 5.9268465163207e-06, "loss": 0.0078, "step": 28750 }, { "epoch": 1.6883879300223084, "grad_norm": 0.27691853046417236, "learning_rate": 5.9050267794813045e-06, "loss": 0.0117, "step": 28760 }, { "epoch": 1.6889749911940823, "grad_norm": 0.6340759992599487, "learning_rate": 5.883244760658213e-06, "loss": 0.013, "step": 28770 }, { "epoch": 1.6895620523658565, "grad_norm": 0.0834115594625473, "learning_rate": 5.861500478483362e-06, "loss": 0.0195, "step": 28780 }, { "epoch": 1.6901491135376308, "grad_norm": 1.4118295907974243, "learning_rate": 5.83979395155641e-06, "loss": 0.006, "step": 28790 }, { "epoch": 1.6907361747094047, "grad_norm": 0.10325326770544052, "learning_rate": 5.818125198444713e-06, "loss": 0.0085, "step": 28800 }, { "epoch": 1.6913232358811787, "grad_norm": 0.09219737350940704, "learning_rate": 5.796494237683309e-06, "loss": 0.0034, "step": 28810 }, { "epoch": 1.691910297052953, "grad_norm": 1.9999827146530151, "learning_rate": 5.774901087774937e-06, "loss": 0.0182, "step": 28820 }, { "epoch": 1.6924973582247271, "grad_norm": 1.1126513481140137, "learning_rate": 5.753345767189949e-06, "loss": 0.0162, "step": 28830 }, { "epoch": 1.6930844193965011, "grad_norm": 2.611462116241455, "learning_rate": 5.73182829436637e-06, "loss": 0.0096, "step": 28840 }, { "epoch": 1.693671480568275, "grad_norm": 0.024148166179656982, "learning_rate": 5.710348687709855e-06, "loss": 0.0141, "step": 28850 }, { "epoch": 1.6942585417400493, "grad_norm": 0.04645540937781334, "learning_rate": 5.688906965593649e-06, "loss": 0.0143, "step": 28860 }, { "epoch": 1.6948456029118235, "grad_norm": 0.011197218671441078, "learning_rate": 5.667503146358616e-06, "loss": 0.0175, "step": 28870 }, { "epoch": 1.6954326640835975, "grad_norm": 0.9446488618850708, "learning_rate": 5.64613724831316e-06, "loss": 0.0106, "step": 28880 }, { "epoch": 1.6960197252553715, "grad_norm": 0.8722050786018372, "learning_rate": 5.624809289733296e-06, "loss": 0.0092, "step": 28890 }, { "epoch": 1.6966067864271457, "grad_norm": 1.4375278949737549, "learning_rate": 5.603519288862536e-06, "loss": 0.0037, "step": 28900 }, { "epoch": 1.69719384759892, "grad_norm": 1.1370137929916382, "learning_rate": 5.582267263911961e-06, "loss": 0.0072, "step": 28910 }, { "epoch": 1.6977809087706939, "grad_norm": 0.3658529222011566, "learning_rate": 5.561053233060154e-06, "loss": 0.0102, "step": 28920 }, { "epoch": 1.6983679699424679, "grad_norm": 0.33324185013771057, "learning_rate": 5.539877214453215e-06, "loss": 0.0073, "step": 28930 }, { "epoch": 1.698955031114242, "grad_norm": 1.37355637550354, "learning_rate": 5.518739226204689e-06, "loss": 0.0138, "step": 28940 }, { "epoch": 1.6995420922860163, "grad_norm": 0.08746024966239929, "learning_rate": 5.497639286395645e-06, "loss": 0.0172, "step": 28950 }, { "epoch": 1.7001291534577903, "grad_norm": 0.009797824546694756, "learning_rate": 5.476577413074535e-06, "loss": 0.0034, "step": 28960 }, { "epoch": 1.7007162146295642, "grad_norm": 1.9778701066970825, "learning_rate": 5.455553624257331e-06, "loss": 0.0083, "step": 28970 }, { "epoch": 1.7013032758013384, "grad_norm": 0.04707961902022362, "learning_rate": 5.434567937927387e-06, "loss": 0.0107, "step": 28980 }, { "epoch": 1.7018903369731126, "grad_norm": 0.3633284270763397, "learning_rate": 5.413620372035449e-06, "loss": 0.0063, "step": 28990 }, { "epoch": 1.7024773981448869, "grad_norm": 0.024128960445523262, "learning_rate": 5.39271094449969e-06, "loss": 0.0146, "step": 29000 }, { "epoch": 1.7030644593166608, "grad_norm": 2.032116413116455, "learning_rate": 5.371839673205625e-06, "loss": 0.016, "step": 29010 }, { "epoch": 1.7036515204884348, "grad_norm": 0.2847971022129059, "learning_rate": 5.351006576006162e-06, "loss": 0.0113, "step": 29020 }, { "epoch": 1.704238581660209, "grad_norm": 0.08455371111631393, "learning_rate": 5.330211670721535e-06, "loss": 0.0041, "step": 29030 }, { "epoch": 1.7048256428319832, "grad_norm": 2.182123899459839, "learning_rate": 5.309454975139338e-06, "loss": 0.01, "step": 29040 }, { "epoch": 1.7054127040037572, "grad_norm": 8.205873489379883, "learning_rate": 5.288736507014435e-06, "loss": 0.0204, "step": 29050 }, { "epoch": 1.7059997651755312, "grad_norm": 0.3556675910949707, "learning_rate": 5.26805628406904e-06, "loss": 0.0164, "step": 29060 }, { "epoch": 1.7065868263473054, "grad_norm": 0.12249883264303207, "learning_rate": 5.247414323992605e-06, "loss": 0.0109, "step": 29070 }, { "epoch": 1.7071738875190796, "grad_norm": 3.531991481781006, "learning_rate": 5.2268106444418875e-06, "loss": 0.0104, "step": 29080 }, { "epoch": 1.7077609486908536, "grad_norm": 0.9957418441772461, "learning_rate": 5.206245263040893e-06, "loss": 0.0121, "step": 29090 }, { "epoch": 1.7083480098626276, "grad_norm": 0.01013266108930111, "learning_rate": 5.1857181973808735e-06, "loss": 0.0038, "step": 29100 }, { "epoch": 1.7089350710344018, "grad_norm": 0.007872414775192738, "learning_rate": 5.165229465020277e-06, "loss": 0.0031, "step": 29110 }, { "epoch": 1.709522132206176, "grad_norm": 0.21149253845214844, "learning_rate": 5.144779083484791e-06, "loss": 0.0191, "step": 29120 }, { "epoch": 1.71010919337795, "grad_norm": 0.1750185489654541, "learning_rate": 5.1243670702673e-06, "loss": 0.0128, "step": 29130 }, { "epoch": 1.710696254549724, "grad_norm": 1.096483588218689, "learning_rate": 5.103993442827831e-06, "loss": 0.0139, "step": 29140 }, { "epoch": 1.7112833157214982, "grad_norm": 0.05265704542398453, "learning_rate": 5.0836582185936456e-06, "loss": 0.0073, "step": 29150 }, { "epoch": 1.7118703768932724, "grad_norm": 1.1015551090240479, "learning_rate": 5.063361414959083e-06, "loss": 0.0192, "step": 29160 }, { "epoch": 1.7124574380650464, "grad_norm": 1.3283004760742188, "learning_rate": 5.043103049285663e-06, "loss": 0.007, "step": 29170 }, { "epoch": 1.7130444992368203, "grad_norm": 0.2930244207382202, "learning_rate": 5.022883138902007e-06, "loss": 0.0088, "step": 29180 }, { "epoch": 1.7136315604085945, "grad_norm": 1.6627315282821655, "learning_rate": 5.002701701103846e-06, "loss": 0.0125, "step": 29190 }, { "epoch": 1.7142186215803688, "grad_norm": 0.6000300049781799, "learning_rate": 4.982558753154009e-06, "loss": 0.0149, "step": 29200 }, { "epoch": 1.7148056827521427, "grad_norm": 0.060055505484342575, "learning_rate": 4.962454312282411e-06, "loss": 0.0093, "step": 29210 }, { "epoch": 1.7153927439239167, "grad_norm": 3.2035300731658936, "learning_rate": 4.942388395685993e-06, "loss": 0.0133, "step": 29220 }, { "epoch": 1.715979805095691, "grad_norm": 0.2768530547618866, "learning_rate": 4.922361020528782e-06, "loss": 0.0068, "step": 29230 }, { "epoch": 1.7165668662674651, "grad_norm": 0.14047828316688538, "learning_rate": 4.9023722039418015e-06, "loss": 0.0133, "step": 29240 }, { "epoch": 1.7171539274392391, "grad_norm": 0.46277543902397156, "learning_rate": 4.882421963023126e-06, "loss": 0.0212, "step": 29250 }, { "epoch": 1.7177409886110133, "grad_norm": 1.4688807725906372, "learning_rate": 4.86251031483782e-06, "loss": 0.0202, "step": 29260 }, { "epoch": 1.7183280497827873, "grad_norm": 0.06642159074544907, "learning_rate": 4.842637276417927e-06, "loss": 0.0137, "step": 29270 }, { "epoch": 1.7189151109545615, "grad_norm": 1.3905795812606812, "learning_rate": 4.822802864762488e-06, "loss": 0.0143, "step": 29280 }, { "epoch": 1.7195021721263357, "grad_norm": 1.2005339860916138, "learning_rate": 4.80300709683747e-06, "loss": 0.0045, "step": 29290 }, { "epoch": 1.7200892332981097, "grad_norm": 1.708850622177124, "learning_rate": 4.7832499895758166e-06, "loss": 0.0078, "step": 29300 }, { "epoch": 1.7206762944698837, "grad_norm": 2.6543662548065186, "learning_rate": 4.76353155987736e-06, "loss": 0.0172, "step": 29310 }, { "epoch": 1.7212633556416579, "grad_norm": 0.02566930092871189, "learning_rate": 4.7438518246089245e-06, "loss": 0.0132, "step": 29320 }, { "epoch": 1.721850416813432, "grad_norm": 0.25859344005584717, "learning_rate": 4.724210800604151e-06, "loss": 0.002, "step": 29330 }, { "epoch": 1.722437477985206, "grad_norm": 0.5459009408950806, "learning_rate": 4.704608504663627e-06, "loss": 0.0084, "step": 29340 }, { "epoch": 1.72302453915698, "grad_norm": 1.8527095317840576, "learning_rate": 4.685044953554768e-06, "loss": 0.0055, "step": 29350 }, { "epoch": 1.7236116003287543, "grad_norm": 0.3627167344093323, "learning_rate": 4.6655201640118775e-06, "loss": 0.0046, "step": 29360 }, { "epoch": 1.7241986615005285, "grad_norm": 0.9517664313316345, "learning_rate": 4.646034152736101e-06, "loss": 0.0117, "step": 29370 }, { "epoch": 1.7247857226723025, "grad_norm": 0.36534351110458374, "learning_rate": 4.626586936395411e-06, "loss": 0.0089, "step": 29380 }, { "epoch": 1.7253727838440764, "grad_norm": 0.4141369163990021, "learning_rate": 4.607178531624595e-06, "loss": 0.0093, "step": 29390 }, { "epoch": 1.7259598450158506, "grad_norm": 1.4190118312835693, "learning_rate": 4.5878089550252246e-06, "loss": 0.0163, "step": 29400 }, { "epoch": 1.7265469061876249, "grad_norm": 0.055241331458091736, "learning_rate": 4.568478223165696e-06, "loss": 0.0265, "step": 29410 }, { "epoch": 1.7271339673593988, "grad_norm": 0.7223270535469055, "learning_rate": 4.549186352581131e-06, "loss": 0.0096, "step": 29420 }, { "epoch": 1.7277210285311728, "grad_norm": 0.5983698964118958, "learning_rate": 4.529933359773447e-06, "loss": 0.0109, "step": 29430 }, { "epoch": 1.728308089702947, "grad_norm": 0.3652210831642151, "learning_rate": 4.510719261211293e-06, "loss": 0.0051, "step": 29440 }, { "epoch": 1.7288951508747212, "grad_norm": 0.18875166773796082, "learning_rate": 4.491544073330062e-06, "loss": 0.0103, "step": 29450 }, { "epoch": 1.7294822120464952, "grad_norm": 0.7531063556671143, "learning_rate": 4.472407812531831e-06, "loss": 0.0131, "step": 29460 }, { "epoch": 1.7300692732182692, "grad_norm": 1.282982587814331, "learning_rate": 4.4533104951854255e-06, "loss": 0.0115, "step": 29470 }, { "epoch": 1.7306563343900434, "grad_norm": 0.8157762289047241, "learning_rate": 4.434252137626305e-06, "loss": 0.0179, "step": 29480 }, { "epoch": 1.7312433955618176, "grad_norm": 2.062001943588257, "learning_rate": 4.4152327561566455e-06, "loss": 0.0118, "step": 29490 }, { "epoch": 1.7318304567335916, "grad_norm": 0.1468178629875183, "learning_rate": 4.3962523670452725e-06, "loss": 0.0056, "step": 29500 }, { "epoch": 1.7324175179053656, "grad_norm": 0.04338105395436287, "learning_rate": 4.37731098652766e-06, "loss": 0.0057, "step": 29510 }, { "epoch": 1.7330045790771398, "grad_norm": 0.2398417741060257, "learning_rate": 4.358408630805905e-06, "loss": 0.0088, "step": 29520 }, { "epoch": 1.733591640248914, "grad_norm": 2.732269287109375, "learning_rate": 4.339545316048721e-06, "loss": 0.0253, "step": 29530 }, { "epoch": 1.7341787014206882, "grad_norm": 2.1497623920440674, "learning_rate": 4.320721058391453e-06, "loss": 0.0125, "step": 29540 }, { "epoch": 1.7347657625924622, "grad_norm": 2.46779465675354, "learning_rate": 4.301935873936003e-06, "loss": 0.0129, "step": 29550 }, { "epoch": 1.7353528237642362, "grad_norm": 0.759006917476654, "learning_rate": 4.28318977875089e-06, "loss": 0.034, "step": 29560 }, { "epoch": 1.7359398849360104, "grad_norm": 2.555346965789795, "learning_rate": 4.264482788871149e-06, "loss": 0.0301, "step": 29570 }, { "epoch": 1.7365269461077846, "grad_norm": 0.46556320786476135, "learning_rate": 4.245814920298402e-06, "loss": 0.0122, "step": 29580 }, { "epoch": 1.7371140072795586, "grad_norm": 1.6327601671218872, "learning_rate": 4.227186189000787e-06, "loss": 0.004, "step": 29590 }, { "epoch": 1.7377010684513325, "grad_norm": 0.442588746547699, "learning_rate": 4.2085966109129796e-06, "loss": 0.0051, "step": 29600 }, { "epoch": 1.7382881296231067, "grad_norm": 2.899655342102051, "learning_rate": 4.190046201936154e-06, "loss": 0.0136, "step": 29610 }, { "epoch": 1.738875190794881, "grad_norm": 0.918697714805603, "learning_rate": 4.171534977937991e-06, "loss": 0.0201, "step": 29620 }, { "epoch": 1.739462251966655, "grad_norm": 0.8825867772102356, "learning_rate": 4.153062954752635e-06, "loss": 0.0133, "step": 29630 }, { "epoch": 1.740049313138429, "grad_norm": 0.8073718547821045, "learning_rate": 4.134630148180724e-06, "loss": 0.0119, "step": 29640 }, { "epoch": 1.7406363743102031, "grad_norm": 1.0322785377502441, "learning_rate": 4.1162365739893125e-06, "loss": 0.0037, "step": 29650 }, { "epoch": 1.7412234354819773, "grad_norm": 0.2558544874191284, "learning_rate": 4.0978822479119325e-06, "loss": 0.0197, "step": 29660 }, { "epoch": 1.7418104966537513, "grad_norm": 0.6548084616661072, "learning_rate": 4.0795671856485475e-06, "loss": 0.0152, "step": 29670 }, { "epoch": 1.7423975578255253, "grad_norm": 1.7351124286651611, "learning_rate": 4.061291402865497e-06, "loss": 0.0138, "step": 29680 }, { "epoch": 1.7429846189972995, "grad_norm": 0.7203704714775085, "learning_rate": 4.043054915195566e-06, "loss": 0.0085, "step": 29690 }, { "epoch": 1.7435716801690737, "grad_norm": 0.4105221927165985, "learning_rate": 4.024857738237875e-06, "loss": 0.0063, "step": 29700 }, { "epoch": 1.7441587413408477, "grad_norm": 2.8076388835906982, "learning_rate": 4.006699887557974e-06, "loss": 0.0254, "step": 29710 }, { "epoch": 1.7447458025126217, "grad_norm": 0.5886854529380798, "learning_rate": 3.988581378687739e-06, "loss": 0.0213, "step": 29720 }, { "epoch": 1.7453328636843959, "grad_norm": 0.8291465640068054, "learning_rate": 3.970502227125417e-06, "loss": 0.0025, "step": 29730 }, { "epoch": 1.74591992485617, "grad_norm": 0.7809314131736755, "learning_rate": 3.952462448335553e-06, "loss": 0.0067, "step": 29740 }, { "epoch": 1.746506986027944, "grad_norm": 1.4025253057479858, "learning_rate": 3.934462057749067e-06, "loss": 0.0466, "step": 29750 }, { "epoch": 1.747094047199718, "grad_norm": 0.5499942898750305, "learning_rate": 3.916501070763124e-06, "loss": 0.0026, "step": 29760 }, { "epoch": 1.7476811083714923, "grad_norm": 0.9017033576965332, "learning_rate": 3.898579502741234e-06, "loss": 0.0095, "step": 29770 }, { "epoch": 1.7482681695432665, "grad_norm": 0.8160563111305237, "learning_rate": 3.88069736901317e-06, "loss": 0.0075, "step": 29780 }, { "epoch": 1.7488552307150405, "grad_norm": 0.9270476698875427, "learning_rate": 3.8628546848749895e-06, "loss": 0.013, "step": 29790 }, { "epoch": 1.7494422918868147, "grad_norm": 0.7246671319007874, "learning_rate": 3.845051465588962e-06, "loss": 0.0119, "step": 29800 }, { "epoch": 1.7500293530585886, "grad_norm": 1.147682547569275, "learning_rate": 3.827287726383644e-06, "loss": 0.0063, "step": 29810 }, { "epoch": 1.7506164142303628, "grad_norm": 0.7257166504859924, "learning_rate": 3.809563482453815e-06, "loss": 0.0203, "step": 29820 }, { "epoch": 1.751203475402137, "grad_norm": 1.0267794132232666, "learning_rate": 3.7918787489604477e-06, "loss": 0.0231, "step": 29830 }, { "epoch": 1.751790536573911, "grad_norm": 1.2328356504440308, "learning_rate": 3.7742335410307306e-06, "loss": 0.004, "step": 29840 }, { "epoch": 1.752377597745685, "grad_norm": 0.29348883032798767, "learning_rate": 3.7566278737580563e-06, "loss": 0.016, "step": 29850 }, { "epoch": 1.7529646589174592, "grad_norm": 0.30717048048973083, "learning_rate": 3.7390617622019897e-06, "loss": 0.0102, "step": 29860 }, { "epoch": 1.7535517200892334, "grad_norm": 3.2439353466033936, "learning_rate": 3.7215352213882338e-06, "loss": 0.0221, "step": 29870 }, { "epoch": 1.7541387812610074, "grad_norm": 0.08938045054674149, "learning_rate": 3.704048266308685e-06, "loss": 0.013, "step": 29880 }, { "epoch": 1.7547258424327814, "grad_norm": 0.4849737584590912, "learning_rate": 3.6866009119213283e-06, "loss": 0.0067, "step": 29890 }, { "epoch": 1.7553129036045556, "grad_norm": 0.029423721134662628, "learning_rate": 3.6691931731503425e-06, "loss": 0.0115, "step": 29900 }, { "epoch": 1.7558999647763298, "grad_norm": 2.5846798419952393, "learning_rate": 3.651825064885955e-06, "loss": 0.0299, "step": 29910 }, { "epoch": 1.7564870259481038, "grad_norm": 1.6304051876068115, "learning_rate": 3.6344966019845385e-06, "loss": 0.0113, "step": 29920 }, { "epoch": 1.7570740871198778, "grad_norm": 0.7822657227516174, "learning_rate": 3.6172077992685182e-06, "loss": 0.0103, "step": 29930 }, { "epoch": 1.757661148291652, "grad_norm": 0.09422504901885986, "learning_rate": 3.5999586715264267e-06, "loss": 0.0263, "step": 29940 }, { "epoch": 1.7582482094634262, "grad_norm": 0.8968960642814636, "learning_rate": 3.5827492335128333e-06, "loss": 0.0057, "step": 29950 }, { "epoch": 1.7588352706352002, "grad_norm": 0.48576679825782776, "learning_rate": 3.5655794999483847e-06, "loss": 0.0071, "step": 29960 }, { "epoch": 1.7594223318069742, "grad_norm": 0.06283847242593765, "learning_rate": 3.5484494855197505e-06, "loss": 0.0096, "step": 29970 }, { "epoch": 1.7600093929787484, "grad_norm": 0.8090189099311829, "learning_rate": 3.5313592048796086e-06, "loss": 0.0057, "step": 29980 }, { "epoch": 1.7605964541505226, "grad_norm": 0.1381601095199585, "learning_rate": 3.514308672646682e-06, "loss": 0.0049, "step": 29990 }, { "epoch": 1.7611835153222966, "grad_norm": 1.2781418561935425, "learning_rate": 3.497297903405666e-06, "loss": 0.0037, "step": 30000 }, { "epoch": 1.7611835153222966, "eval_loss": 0.504189670085907, "eval_runtime": 269.4439, "eval_samples_per_second": 3.507, "eval_steps_per_second": 3.507, "step": 30000 }, { "epoch": 1.7617705764940705, "grad_norm": 0.25365105271339417, "learning_rate": 3.4803269117072546e-06, "loss": 0.0116, "step": 30010 }, { "epoch": 1.7623576376658447, "grad_norm": 0.43163204193115234, "learning_rate": 3.4633957120681293e-06, "loss": 0.0084, "step": 30020 }, { "epoch": 1.762944698837619, "grad_norm": 0.04039360210299492, "learning_rate": 3.4465043189709168e-06, "loss": 0.0029, "step": 30030 }, { "epoch": 1.763531760009393, "grad_norm": 0.12317579239606857, "learning_rate": 3.429652746864187e-06, "loss": 0.0151, "step": 30040 }, { "epoch": 1.764118821181167, "grad_norm": 0.3732117712497711, "learning_rate": 3.4128410101624817e-06, "loss": 0.0166, "step": 30050 }, { "epoch": 1.7647058823529411, "grad_norm": 0.13820882141590118, "learning_rate": 3.396069123246226e-06, "loss": 0.0116, "step": 30060 }, { "epoch": 1.7652929435247153, "grad_norm": 0.7048861384391785, "learning_rate": 3.379337100461788e-06, "loss": 0.0139, "step": 30070 }, { "epoch": 1.7658800046964895, "grad_norm": 0.7680193781852722, "learning_rate": 3.3626449561214245e-06, "loss": 0.0113, "step": 30080 }, { "epoch": 1.7664670658682635, "grad_norm": 0.4921130836009979, "learning_rate": 3.3459927045032867e-06, "loss": 0.0047, "step": 30090 }, { "epoch": 1.7670541270400375, "grad_norm": 0.3525088429450989, "learning_rate": 3.3293803598514086e-06, "loss": 0.0246, "step": 30100 }, { "epoch": 1.7676411882118117, "grad_norm": 0.4365999400615692, "learning_rate": 3.312807936375656e-06, "loss": 0.0378, "step": 30110 }, { "epoch": 1.768228249383586, "grad_norm": 0.027743898332118988, "learning_rate": 3.29627544825179e-06, "loss": 0.0132, "step": 30120 }, { "epoch": 1.76881531055536, "grad_norm": 0.7587908506393433, "learning_rate": 3.2797829096213818e-06, "loss": 0.0054, "step": 30130 }, { "epoch": 1.7694023717271339, "grad_norm": 0.06924723088741302, "learning_rate": 3.263330334591852e-06, "loss": 0.0103, "step": 30140 }, { "epoch": 1.769989432898908, "grad_norm": 0.3710141181945801, "learning_rate": 3.246917737236416e-06, "loss": 0.0097, "step": 30150 }, { "epoch": 1.7705764940706823, "grad_norm": 1.4772133827209473, "learning_rate": 3.2305451315941095e-06, "loss": 0.0071, "step": 30160 }, { "epoch": 1.7711635552424563, "grad_norm": 0.717305600643158, "learning_rate": 3.2142125316697467e-06, "loss": 0.0179, "step": 30170 }, { "epoch": 1.7717506164142303, "grad_norm": 0.45245057344436646, "learning_rate": 3.1979199514339307e-06, "loss": 0.0147, "step": 30180 }, { "epoch": 1.7723376775860045, "grad_norm": 2.0297160148620605, "learning_rate": 3.18166740482303e-06, "loss": 0.011, "step": 30190 }, { "epoch": 1.7729247387577787, "grad_norm": 0.17766569554805756, "learning_rate": 3.1654549057391737e-06, "loss": 0.0046, "step": 30200 }, { "epoch": 1.7735117999295527, "grad_norm": 1.2223155498504639, "learning_rate": 3.1492824680502244e-06, "loss": 0.0112, "step": 30210 }, { "epoch": 1.7740988611013266, "grad_norm": 0.12885530292987823, "learning_rate": 3.1331501055897883e-06, "loss": 0.0055, "step": 30220 }, { "epoch": 1.7746859222731008, "grad_norm": 0.05095207691192627, "learning_rate": 3.1170578321571887e-06, "loss": 0.0026, "step": 30230 }, { "epoch": 1.775272983444875, "grad_norm": 0.1780925989151001, "learning_rate": 3.1010056615174365e-06, "loss": 0.006, "step": 30240 }, { "epoch": 1.775860044616649, "grad_norm": 0.5828750729560852, "learning_rate": 3.084993607401293e-06, "loss": 0.01, "step": 30250 }, { "epoch": 1.776447105788423, "grad_norm": 2.3321797847747803, "learning_rate": 3.069021683505141e-06, "loss": 0.0106, "step": 30260 }, { "epoch": 1.7770341669601972, "grad_norm": 0.4269194006919861, "learning_rate": 3.05308990349108e-06, "loss": 0.016, "step": 30270 }, { "epoch": 1.7776212281319714, "grad_norm": 0.34413063526153564, "learning_rate": 3.0371982809868527e-06, "loss": 0.0116, "step": 30280 }, { "epoch": 1.7782082893037454, "grad_norm": 0.5998708605766296, "learning_rate": 3.021346829585847e-06, "loss": 0.0192, "step": 30290 }, { "epoch": 1.7787953504755194, "grad_norm": 0.20596611499786377, "learning_rate": 3.005535562847117e-06, "loss": 0.0265, "step": 30300 }, { "epoch": 1.7793824116472936, "grad_norm": 1.3025153875350952, "learning_rate": 2.9897644942953162e-06, "loss": 0.0065, "step": 30310 }, { "epoch": 1.7799694728190678, "grad_norm": 1.1851646900177002, "learning_rate": 2.9740336374207147e-06, "loss": 0.0164, "step": 30320 }, { "epoch": 1.7805565339908418, "grad_norm": 0.005901661701500416, "learning_rate": 2.9583430056792096e-06, "loss": 0.0082, "step": 30330 }, { "epoch": 1.7811435951626158, "grad_norm": 2.596597909927368, "learning_rate": 2.9426926124922592e-06, "loss": 0.007, "step": 30340 }, { "epoch": 1.78173065633439, "grad_norm": 1.2857441902160645, "learning_rate": 2.927082471246917e-06, "loss": 0.0115, "step": 30350 }, { "epoch": 1.7823177175061642, "grad_norm": 0.17682231962680817, "learning_rate": 2.911512595295818e-06, "loss": 0.0144, "step": 30360 }, { "epoch": 1.7829047786779384, "grad_norm": 0.19923894107341766, "learning_rate": 2.8959829979571306e-06, "loss": 0.0094, "step": 30370 }, { "epoch": 1.7834918398497124, "grad_norm": 0.5574448108673096, "learning_rate": 2.880493692514602e-06, "loss": 0.0127, "step": 30380 }, { "epoch": 1.7840789010214864, "grad_norm": 2.2101352214813232, "learning_rate": 2.8650446922174723e-06, "loss": 0.0167, "step": 30390 }, { "epoch": 1.7846659621932606, "grad_norm": 0.2382199764251709, "learning_rate": 2.849636010280543e-06, "loss": 0.0305, "step": 30400 }, { "epoch": 1.7852530233650348, "grad_norm": 1.3032515048980713, "learning_rate": 2.8342676598841044e-06, "loss": 0.0076, "step": 30410 }, { "epoch": 1.7858400845368088, "grad_norm": 0.4751865267753601, "learning_rate": 2.818939654173952e-06, "loss": 0.021, "step": 30420 }, { "epoch": 1.7864271457085827, "grad_norm": 0.2897246778011322, "learning_rate": 2.803652006261387e-06, "loss": 0.0103, "step": 30430 }, { "epoch": 1.787014206880357, "grad_norm": 0.14007450640201569, "learning_rate": 2.7884047292231817e-06, "loss": 0.0099, "step": 30440 }, { "epoch": 1.7876012680521312, "grad_norm": 1.1003576517105103, "learning_rate": 2.7731978361015543e-06, "loss": 0.0056, "step": 30450 }, { "epoch": 1.7881883292239051, "grad_norm": 1.2715568542480469, "learning_rate": 2.75803133990421e-06, "loss": 0.0169, "step": 30460 }, { "epoch": 1.7887753903956791, "grad_norm": 0.03126873821020126, "learning_rate": 2.742905253604272e-06, "loss": 0.0028, "step": 30470 }, { "epoch": 1.7893624515674533, "grad_norm": 0.9643771052360535, "learning_rate": 2.727819590140335e-06, "loss": 0.0108, "step": 30480 }, { "epoch": 1.7899495127392275, "grad_norm": 0.12926580011844635, "learning_rate": 2.712774362416376e-06, "loss": 0.0081, "step": 30490 }, { "epoch": 1.7905365739110015, "grad_norm": 0.16461752355098724, "learning_rate": 2.6977695833018014e-06, "loss": 0.0039, "step": 30500 }, { "epoch": 1.7911236350827755, "grad_norm": 2.5110087394714355, "learning_rate": 2.6828052656314384e-06, "loss": 0.0093, "step": 30510 }, { "epoch": 1.7917106962545497, "grad_norm": 0.11830391734838486, "learning_rate": 2.6678814222054593e-06, "loss": 0.0096, "step": 30520 }, { "epoch": 1.792297757426324, "grad_norm": 0.12954285740852356, "learning_rate": 2.652998065789453e-06, "loss": 0.0178, "step": 30530 }, { "epoch": 1.792884818598098, "grad_norm": 2.564725875854492, "learning_rate": 2.638155209114368e-06, "loss": 0.0127, "step": 30540 }, { "epoch": 1.7934718797698719, "grad_norm": 2.254889726638794, "learning_rate": 2.623352864876505e-06, "loss": 0.0358, "step": 30550 }, { "epoch": 1.794058940941646, "grad_norm": 0.4922681152820587, "learning_rate": 2.6085910457375073e-06, "loss": 0.0108, "step": 30560 }, { "epoch": 1.7946460021134203, "grad_norm": 0.5215016603469849, "learning_rate": 2.5938697643243635e-06, "loss": 0.005, "step": 30570 }, { "epoch": 1.7952330632851943, "grad_norm": 2.470322608947754, "learning_rate": 2.5791890332293788e-06, "loss": 0.0133, "step": 30580 }, { "epoch": 1.7958201244569683, "grad_norm": 0.9131485223770142, "learning_rate": 2.56454886501018e-06, "loss": 0.0116, "step": 30590 }, { "epoch": 1.7964071856287425, "grad_norm": 0.3151148855686188, "learning_rate": 2.5499492721896887e-06, "loss": 0.0052, "step": 30600 }, { "epoch": 1.7969942468005167, "grad_norm": 0.25771021842956543, "learning_rate": 2.535390267256138e-06, "loss": 0.012, "step": 30610 }, { "epoch": 1.7975813079722909, "grad_norm": 0.8276041746139526, "learning_rate": 2.5208718626630045e-06, "loss": 0.006, "step": 30620 }, { "epoch": 1.7981683691440649, "grad_norm": 0.8665030002593994, "learning_rate": 2.5063940708290823e-06, "loss": 0.0104, "step": 30630 }, { "epoch": 1.7987554303158388, "grad_norm": 2.59692645072937, "learning_rate": 2.491956904138393e-06, "loss": 0.0092, "step": 30640 }, { "epoch": 1.799342491487613, "grad_norm": 0.012434203177690506, "learning_rate": 2.4775603749402187e-06, "loss": 0.0019, "step": 30650 }, { "epoch": 1.7999295526593873, "grad_norm": 0.13764256238937378, "learning_rate": 2.4632044955490983e-06, "loss": 0.0062, "step": 30660 }, { "epoch": 1.8005166138311612, "grad_norm": 0.14008191227912903, "learning_rate": 2.4488892782447593e-06, "loss": 0.0087, "step": 30670 }, { "epoch": 1.8011036750029352, "grad_norm": 1.9909189939498901, "learning_rate": 2.4346147352721836e-06, "loss": 0.017, "step": 30680 }, { "epoch": 1.8016907361747094, "grad_norm": 0.3221103549003601, "learning_rate": 2.4203808788415438e-06, "loss": 0.0187, "step": 30690 }, { "epoch": 1.8022777973464836, "grad_norm": 1.4656686782836914, "learning_rate": 2.406187721128217e-06, "loss": 0.0147, "step": 30700 }, { "epoch": 1.8028648585182576, "grad_norm": 0.8037199974060059, "learning_rate": 2.3920352742727636e-06, "loss": 0.0049, "step": 30710 }, { "epoch": 1.8034519196900316, "grad_norm": 2.6417436599731445, "learning_rate": 2.377923550380934e-06, "loss": 0.0116, "step": 30720 }, { "epoch": 1.8040389808618058, "grad_norm": 0.24850361049175262, "learning_rate": 2.3638525615236164e-06, "loss": 0.0066, "step": 30730 }, { "epoch": 1.80462604203358, "grad_norm": 0.5551864504814148, "learning_rate": 2.3498223197368828e-06, "loss": 0.0131, "step": 30740 }, { "epoch": 1.805213103205354, "grad_norm": 0.48247092962265015, "learning_rate": 2.3358328370219286e-06, "loss": 0.0126, "step": 30750 }, { "epoch": 1.805800164377128, "grad_norm": 0.9977641701698303, "learning_rate": 2.3218841253451084e-06, "loss": 0.0055, "step": 30760 }, { "epoch": 1.8063872255489022, "grad_norm": 1.0749900341033936, "learning_rate": 2.3079761966378787e-06, "loss": 0.0206, "step": 30770 }, { "epoch": 1.8069742867206764, "grad_norm": 2.4534342288970947, "learning_rate": 2.2941090627968287e-06, "loss": 0.0194, "step": 30780 }, { "epoch": 1.8075613478924504, "grad_norm": 0.1504058539867401, "learning_rate": 2.280282735683653e-06, "loss": 0.0149, "step": 30790 }, { "epoch": 1.8081484090642244, "grad_norm": 0.7853707671165466, "learning_rate": 2.266497227125114e-06, "loss": 0.0244, "step": 30800 }, { "epoch": 1.8087354702359986, "grad_norm": 0.20653456449508667, "learning_rate": 2.2527525489131008e-06, "loss": 0.0087, "step": 30810 }, { "epoch": 1.8093225314077728, "grad_norm": 0.6104238033294678, "learning_rate": 2.2390487128045256e-06, "loss": 0.0461, "step": 30820 }, { "epoch": 1.8099095925795468, "grad_norm": 0.02886943705379963, "learning_rate": 2.2253857305214233e-06, "loss": 0.0094, "step": 30830 }, { "epoch": 1.8104966537513207, "grad_norm": 0.5403358936309814, "learning_rate": 2.211763613750839e-06, "loss": 0.0103, "step": 30840 }, { "epoch": 1.811083714923095, "grad_norm": 0.21906422078609467, "learning_rate": 2.1981823741448805e-06, "loss": 0.0202, "step": 30850 }, { "epoch": 1.8116707760948692, "grad_norm": 0.0794738158583641, "learning_rate": 2.1846420233206823e-06, "loss": 0.0101, "step": 30860 }, { "epoch": 1.8122578372666431, "grad_norm": 0.23185820877552032, "learning_rate": 2.1711425728604073e-06, "loss": 0.0079, "step": 30870 }, { "epoch": 1.8128448984384171, "grad_norm": 0.17838993668556213, "learning_rate": 2.1576840343112414e-06, "loss": 0.0084, "step": 30880 }, { "epoch": 1.8134319596101913, "grad_norm": 0.7331791520118713, "learning_rate": 2.1442664191853645e-06, "loss": 0.0169, "step": 30890 }, { "epoch": 1.8140190207819655, "grad_norm": 0.7178912162780762, "learning_rate": 2.130889738959946e-06, "loss": 0.0155, "step": 30900 }, { "epoch": 1.8146060819537397, "grad_norm": 0.19310255348682404, "learning_rate": 2.1175540050771492e-06, "loss": 0.0053, "step": 30910 }, { "epoch": 1.8151931431255137, "grad_norm": 0.46262606978416443, "learning_rate": 2.1042592289441277e-06, "loss": 0.0129, "step": 30920 }, { "epoch": 1.8157802042972877, "grad_norm": 0.07041691243648529, "learning_rate": 2.0910054219329624e-06, "loss": 0.0109, "step": 30930 }, { "epoch": 1.816367265469062, "grad_norm": 0.5578110814094543, "learning_rate": 2.0777925953807288e-06, "loss": 0.019, "step": 30940 }, { "epoch": 1.8169543266408361, "grad_norm": 0.21449141204357147, "learning_rate": 2.0646207605894198e-06, "loss": 0.0073, "step": 30950 }, { "epoch": 1.81754138781261, "grad_norm": 0.6745051145553589, "learning_rate": 2.051489928825995e-06, "loss": 0.0222, "step": 30960 }, { "epoch": 1.818128448984384, "grad_norm": 1.6623717546463013, "learning_rate": 2.0384001113222972e-06, "loss": 0.0074, "step": 30970 }, { "epoch": 1.8187155101561583, "grad_norm": 0.32840752601623535, "learning_rate": 2.0253513192751373e-06, "loss": 0.0044, "step": 30980 }, { "epoch": 1.8193025713279325, "grad_norm": 1.139678716659546, "learning_rate": 2.0123435638461863e-06, "loss": 0.0131, "step": 30990 }, { "epoch": 1.8198896324997065, "grad_norm": 0.943651556968689, "learning_rate": 1.999376856162044e-06, "loss": 0.0088, "step": 31000 }, { "epoch": 1.8204766936714805, "grad_norm": 0.2593276798725128, "learning_rate": 1.986451207314194e-06, "loss": 0.0052, "step": 31010 }, { "epoch": 1.8210637548432547, "grad_norm": 2.5215015411376953, "learning_rate": 1.9735666283589972e-06, "loss": 0.0147, "step": 31020 }, { "epoch": 1.8216508160150289, "grad_norm": 1.2721784114837646, "learning_rate": 1.9607231303176653e-06, "loss": 0.0177, "step": 31030 }, { "epoch": 1.8222378771868029, "grad_norm": 0.1300169676542282, "learning_rate": 1.9479207241763055e-06, "loss": 0.0055, "step": 31040 }, { "epoch": 1.8228249383585768, "grad_norm": 1.896533489227295, "learning_rate": 1.9351594208858405e-06, "loss": 0.0066, "step": 31050 }, { "epoch": 1.823411999530351, "grad_norm": 1.5818156003952026, "learning_rate": 1.9224392313620665e-06, "loss": 0.011, "step": 31060 }, { "epoch": 1.8239990607021253, "grad_norm": 0.09740544855594635, "learning_rate": 1.909760166485586e-06, "loss": 0.007, "step": 31070 }, { "epoch": 1.8245861218738992, "grad_norm": 1.1511468887329102, "learning_rate": 1.8971222371018393e-06, "loss": 0.0043, "step": 31080 }, { "epoch": 1.8251731830456732, "grad_norm": 1.773013949394226, "learning_rate": 1.8845254540210743e-06, "loss": 0.0158, "step": 31090 }, { "epoch": 1.8257602442174474, "grad_norm": 0.41663751006126404, "learning_rate": 1.8719698280183328e-06, "loss": 0.0055, "step": 31100 }, { "epoch": 1.8263473053892216, "grad_norm": 1.1530892848968506, "learning_rate": 1.8594553698334793e-06, "loss": 0.0075, "step": 31110 }, { "epoch": 1.8269343665609956, "grad_norm": 0.29522261023521423, "learning_rate": 1.8469820901711344e-06, "loss": 0.0125, "step": 31120 }, { "epoch": 1.8275214277327696, "grad_norm": 0.6865074038505554, "learning_rate": 1.8345499997007243e-06, "loss": 0.0087, "step": 31130 }, { "epoch": 1.8281084889045438, "grad_norm": 0.49218010902404785, "learning_rate": 1.8221591090564038e-06, "loss": 0.0047, "step": 31140 }, { "epoch": 1.828695550076318, "grad_norm": 1.3413491249084473, "learning_rate": 1.8098094288371336e-06, "loss": 0.009, "step": 31150 }, { "epoch": 1.8292826112480922, "grad_norm": 1.8342851400375366, "learning_rate": 1.7975009696065859e-06, "loss": 0.014, "step": 31160 }, { "epoch": 1.8298696724198662, "grad_norm": 0.9174357652664185, "learning_rate": 1.785233741893183e-06, "loss": 0.0124, "step": 31170 }, { "epoch": 1.8304567335916402, "grad_norm": 0.7499503493309021, "learning_rate": 1.7730077561900926e-06, "loss": 0.0071, "step": 31180 }, { "epoch": 1.8310437947634144, "grad_norm": 0.5048448443412781, "learning_rate": 1.760823022955188e-06, "loss": 0.0036, "step": 31190 }, { "epoch": 1.8316308559351886, "grad_norm": 1.1059004068374634, "learning_rate": 1.748679552611071e-06, "loss": 0.0168, "step": 31200 }, { "epoch": 1.8322179171069626, "grad_norm": 0.14990508556365967, "learning_rate": 1.736577355545027e-06, "loss": 0.0073, "step": 31210 }, { "epoch": 1.8328049782787366, "grad_norm": 0.6041936874389648, "learning_rate": 1.7245164421090533e-06, "loss": 0.0091, "step": 31220 }, { "epoch": 1.8333920394505108, "grad_norm": 0.5979844927787781, "learning_rate": 1.7124968226198357e-06, "loss": 0.0023, "step": 31230 }, { "epoch": 1.833979100622285, "grad_norm": 0.55005943775177, "learning_rate": 1.7005185073587337e-06, "loss": 0.0125, "step": 31240 }, { "epoch": 1.834566161794059, "grad_norm": 0.25090792775154114, "learning_rate": 1.6885815065717625e-06, "loss": 0.0234, "step": 31250 }, { "epoch": 1.835153222965833, "grad_norm": 1.1573761701583862, "learning_rate": 1.676685830469621e-06, "loss": 0.0108, "step": 31260 }, { "epoch": 1.8357402841376071, "grad_norm": 1.82492196559906, "learning_rate": 1.6648314892276362e-06, "loss": 0.0212, "step": 31270 }, { "epoch": 1.8363273453093814, "grad_norm": 0.24990439414978027, "learning_rate": 1.6530184929857973e-06, "loss": 0.0056, "step": 31280 }, { "epoch": 1.8369144064811553, "grad_norm": 0.5845229029655457, "learning_rate": 1.6412468518487212e-06, "loss": 0.0055, "step": 31290 }, { "epoch": 1.8375014676529293, "grad_norm": 1.0427911281585693, "learning_rate": 1.629516575885659e-06, "loss": 0.0053, "step": 31300 }, { "epoch": 1.8380885288247035, "grad_norm": 3.758014440536499, "learning_rate": 1.617827675130451e-06, "loss": 0.0128, "step": 31310 }, { "epoch": 1.8386755899964777, "grad_norm": 1.4943057298660278, "learning_rate": 1.6061801595815774e-06, "loss": 0.0193, "step": 31320 }, { "epoch": 1.8392626511682517, "grad_norm": 0.25631681084632874, "learning_rate": 1.5945740392021013e-06, "loss": 0.0067, "step": 31330 }, { "epoch": 1.8398497123400257, "grad_norm": 1.366217851638794, "learning_rate": 1.5830093239196764e-06, "loss": 0.0094, "step": 31340 }, { "epoch": 1.8404367735118, "grad_norm": 0.3367091119289398, "learning_rate": 1.5714860236265506e-06, "loss": 0.0075, "step": 31350 }, { "epoch": 1.8410238346835741, "grad_norm": 0.2048913985490799, "learning_rate": 1.5600041481795336e-06, "loss": 0.0153, "step": 31360 }, { "epoch": 1.841610895855348, "grad_norm": 0.8786361813545227, "learning_rate": 1.5485637074000247e-06, "loss": 0.011, "step": 31370 }, { "epoch": 1.842197957027122, "grad_norm": 1.0005842447280884, "learning_rate": 1.5371647110739408e-06, "loss": 0.0092, "step": 31380 }, { "epoch": 1.8427850181988963, "grad_norm": 3.90608286857605, "learning_rate": 1.5258071689517872e-06, "loss": 0.0066, "step": 31390 }, { "epoch": 1.8433720793706705, "grad_norm": 0.1294427067041397, "learning_rate": 1.514491090748571e-06, "loss": 0.0108, "step": 31400 }, { "epoch": 1.8439591405424445, "grad_norm": 0.020502302795648575, "learning_rate": 1.5032164861438825e-06, "loss": 0.0039, "step": 31410 }, { "epoch": 1.8445462017142185, "grad_norm": 0.1300276815891266, "learning_rate": 1.4919833647817905e-06, "loss": 0.0105, "step": 31420 }, { "epoch": 1.8451332628859927, "grad_norm": 2.1017277240753174, "learning_rate": 1.4807917362709033e-06, "loss": 0.0141, "step": 31430 }, { "epoch": 1.8457203240577669, "grad_norm": 0.1465960144996643, "learning_rate": 1.4696416101843246e-06, "loss": 0.0031, "step": 31440 }, { "epoch": 1.846307385229541, "grad_norm": 2.96526837348938, "learning_rate": 1.4585329960596639e-06, "loss": 0.0175, "step": 31450 }, { "epoch": 1.846894446401315, "grad_norm": 0.14931446313858032, "learning_rate": 1.4474659033990313e-06, "loss": 0.0132, "step": 31460 }, { "epoch": 1.847481507573089, "grad_norm": 1.349448561668396, "learning_rate": 1.4364403416690042e-06, "loss": 0.0111, "step": 31470 }, { "epoch": 1.8480685687448632, "grad_norm": 0.6655970811843872, "learning_rate": 1.42545632030065e-06, "loss": 0.0072, "step": 31480 }, { "epoch": 1.8486556299166375, "grad_norm": 1.1792455911636353, "learning_rate": 1.4145138486894804e-06, "loss": 0.0102, "step": 31490 }, { "epoch": 1.8492426910884114, "grad_norm": 1.846889615058899, "learning_rate": 1.4036129361954974e-06, "loss": 0.0159, "step": 31500 }, { "epoch": 1.8498297522601854, "grad_norm": 1.8028771877288818, "learning_rate": 1.3927535921431255e-06, "loss": 0.0069, "step": 31510 }, { "epoch": 1.8504168134319596, "grad_norm": 1.691104531288147, "learning_rate": 1.381935825821251e-06, "loss": 0.0178, "step": 31520 }, { "epoch": 1.8510038746037338, "grad_norm": 0.3851316571235657, "learning_rate": 1.371159646483189e-06, "loss": 0.0076, "step": 31530 }, { "epoch": 1.8515909357755078, "grad_norm": 0.19145680963993073, "learning_rate": 1.360425063346682e-06, "loss": 0.0132, "step": 31540 }, { "epoch": 1.8521779969472818, "grad_norm": 0.8808480501174927, "learning_rate": 1.3497320855938855e-06, "loss": 0.0072, "step": 31550 }, { "epoch": 1.852765058119056, "grad_norm": 1.3409181833267212, "learning_rate": 1.3390807223713886e-06, "loss": 0.0086, "step": 31560 }, { "epoch": 1.8533521192908302, "grad_norm": 0.07805105298757553, "learning_rate": 1.328470982790142e-06, "loss": 0.0164, "step": 31570 }, { "epoch": 1.8539391804626042, "grad_norm": 2.4130473136901855, "learning_rate": 1.3179028759255475e-06, "loss": 0.0094, "step": 31580 }, { "epoch": 1.8545262416343782, "grad_norm": 0.09958692640066147, "learning_rate": 1.3073764108173459e-06, "loss": 0.0054, "step": 31590 }, { "epoch": 1.8551133028061524, "grad_norm": 0.8147817254066467, "learning_rate": 1.2968915964696904e-06, "loss": 0.0024, "step": 31600 }, { "epoch": 1.8557003639779266, "grad_norm": 1.387506365776062, "learning_rate": 1.2864484418510959e-06, "loss": 0.0168, "step": 31610 }, { "epoch": 1.8562874251497006, "grad_norm": 0.732915997505188, "learning_rate": 1.2760469558944277e-06, "loss": 0.0068, "step": 31620 }, { "epoch": 1.8568744863214746, "grad_norm": 0.8214514851570129, "learning_rate": 1.2656871474969357e-06, "loss": 0.0371, "step": 31630 }, { "epoch": 1.8574615474932488, "grad_norm": 1.31402587890625, "learning_rate": 1.2553690255201977e-06, "loss": 0.0104, "step": 31640 }, { "epoch": 1.858048608665023, "grad_norm": 0.9569791555404663, "learning_rate": 1.2450925987901595e-06, "loss": 0.0077, "step": 31650 }, { "epoch": 1.858635669836797, "grad_norm": 0.045723479241132736, "learning_rate": 1.234857876097062e-06, "loss": 0.006, "step": 31660 }, { "epoch": 1.859222731008571, "grad_norm": 0.022478388622403145, "learning_rate": 1.224664866195513e-06, "loss": 0.009, "step": 31670 }, { "epoch": 1.8598097921803451, "grad_norm": 1.3781839609146118, "learning_rate": 1.214513577804416e-06, "loss": 0.0212, "step": 31680 }, { "epoch": 1.8603968533521194, "grad_norm": 2.8057188987731934, "learning_rate": 1.204404019606986e-06, "loss": 0.0077, "step": 31690 }, { "epoch": 1.8609839145238933, "grad_norm": 3.708101987838745, "learning_rate": 1.194336200250762e-06, "loss": 0.0096, "step": 31700 }, { "epoch": 1.8615709756956675, "grad_norm": 0.13589777052402496, "learning_rate": 1.1843101283475655e-06, "loss": 0.0124, "step": 31710 }, { "epoch": 1.8621580368674415, "grad_norm": 0.33158472180366516, "learning_rate": 1.174325812473509e-06, "loss": 0.0143, "step": 31720 }, { "epoch": 1.8627450980392157, "grad_norm": 0.411041796207428, "learning_rate": 1.1643832611689943e-06, "loss": 0.0052, "step": 31730 }, { "epoch": 1.86333215921099, "grad_norm": 0.020132148638367653, "learning_rate": 1.1544824829386846e-06, "loss": 0.0041, "step": 31740 }, { "epoch": 1.863919220382764, "grad_norm": 0.40565600991249084, "learning_rate": 1.1446234862515225e-06, "loss": 0.0134, "step": 31750 }, { "epoch": 1.864506281554538, "grad_norm": 1.1989976167678833, "learning_rate": 1.1348062795407233e-06, "loss": 0.0033, "step": 31760 }, { "epoch": 1.865093342726312, "grad_norm": 0.33556655049324036, "learning_rate": 1.1250308712037306e-06, "loss": 0.0037, "step": 31770 }, { "epoch": 1.8656804038980863, "grad_norm": 0.6236353516578674, "learning_rate": 1.1152972696022445e-06, "loss": 0.0074, "step": 31780 }, { "epoch": 1.8662674650698603, "grad_norm": 0.0651530772447586, "learning_rate": 1.105605483062211e-06, "loss": 0.0112, "step": 31790 }, { "epoch": 1.8668545262416343, "grad_norm": 1.2081631422042847, "learning_rate": 1.0959555198738037e-06, "loss": 0.0097, "step": 31800 }, { "epoch": 1.8674415874134085, "grad_norm": 1.1077808141708374, "learning_rate": 1.0863473882914143e-06, "loss": 0.0242, "step": 31810 }, { "epoch": 1.8680286485851827, "grad_norm": 0.6262877583503723, "learning_rate": 1.076781096533669e-06, "loss": 0.0026, "step": 31820 }, { "epoch": 1.8686157097569567, "grad_norm": 0.48098450899124146, "learning_rate": 1.0672566527833827e-06, "loss": 0.004, "step": 31830 }, { "epoch": 1.8692027709287307, "grad_norm": 3.2791879177093506, "learning_rate": 1.0577740651876001e-06, "loss": 0.0049, "step": 31840 }, { "epoch": 1.8697898321005049, "grad_norm": 0.6023370027542114, "learning_rate": 1.048333341857538e-06, "loss": 0.0049, "step": 31850 }, { "epoch": 1.870376893272279, "grad_norm": 1.7936495542526245, "learning_rate": 1.0389344908686205e-06, "loss": 0.0225, "step": 31860 }, { "epoch": 1.870963954444053, "grad_norm": 0.7833817005157471, "learning_rate": 1.0295775202604495e-06, "loss": 0.0047, "step": 31870 }, { "epoch": 1.871551015615827, "grad_norm": 1.1967670917510986, "learning_rate": 1.020262438036801e-06, "loss": 0.0054, "step": 31880 }, { "epoch": 1.8721380767876012, "grad_norm": 0.14555278420448303, "learning_rate": 1.0109892521656283e-06, "loss": 0.0141, "step": 31890 }, { "epoch": 1.8727251379593755, "grad_norm": 0.08336268365383148, "learning_rate": 1.0017579705790314e-06, "loss": 0.0115, "step": 31900 }, { "epoch": 1.8733121991311494, "grad_norm": 1.0022594928741455, "learning_rate": 9.925686011732826e-07, "loss": 0.016, "step": 31910 }, { "epoch": 1.8738992603029234, "grad_norm": 0.15510699152946472, "learning_rate": 9.834211518087887e-07, "loss": 0.0074, "step": 31920 }, { "epoch": 1.8744863214746976, "grad_norm": 0.4988367259502411, "learning_rate": 9.743156303101185e-07, "loss": 0.0105, "step": 31930 }, { "epoch": 1.8750733826464718, "grad_norm": 0.9441335201263428, "learning_rate": 9.652520444659585e-07, "loss": 0.0054, "step": 31940 }, { "epoch": 1.8756604438182458, "grad_norm": 0.029945053160190582, "learning_rate": 9.562304020291346e-07, "loss": 0.0056, "step": 31950 }, { "epoch": 1.8762475049900198, "grad_norm": 0.023890936747193336, "learning_rate": 9.472507107165852e-07, "loss": 0.0122, "step": 31960 }, { "epoch": 1.876834566161794, "grad_norm": 0.824338436126709, "learning_rate": 9.383129782093713e-07, "loss": 0.0144, "step": 31970 }, { "epoch": 1.8774216273335682, "grad_norm": 0.6254005432128906, "learning_rate": 9.294172121526668e-07, "loss": 0.0128, "step": 31980 }, { "epoch": 1.8780086885053424, "grad_norm": 0.10618310421705246, "learning_rate": 9.205634201557456e-07, "loss": 0.0045, "step": 31990 }, { "epoch": 1.8785957496771164, "grad_norm": 0.02671687863767147, "learning_rate": 9.11751609791972e-07, "loss": 0.0127, "step": 32000 }, { "epoch": 1.8791828108488904, "grad_norm": 0.8190175890922546, "learning_rate": 9.029817885988001e-07, "loss": 0.0224, "step": 32010 }, { "epoch": 1.8797698720206646, "grad_norm": 1.1711387634277344, "learning_rate": 8.942539640777792e-07, "loss": 0.0147, "step": 32020 }, { "epoch": 1.8803569331924388, "grad_norm": 0.16896583139896393, "learning_rate": 8.855681436945206e-07, "loss": 0.0115, "step": 32030 }, { "epoch": 1.8809439943642128, "grad_norm": 0.1017298474907875, "learning_rate": 8.769243348787148e-07, "loss": 0.0029, "step": 32040 }, { "epoch": 1.8815310555359868, "grad_norm": 0.16621960699558258, "learning_rate": 8.683225450241139e-07, "loss": 0.0056, "step": 32050 }, { "epoch": 1.882118116707761, "grad_norm": 0.0552670955657959, "learning_rate": 8.597627814885323e-07, "loss": 0.0054, "step": 32060 }, { "epoch": 1.8827051778795352, "grad_norm": 0.0441642627120018, "learning_rate": 8.512450515938298e-07, "loss": 0.0136, "step": 32070 }, { "epoch": 1.8832922390513092, "grad_norm": 2.0039734840393066, "learning_rate": 8.427693626259114e-07, "loss": 0.0138, "step": 32080 }, { "epoch": 1.8838793002230831, "grad_norm": 2.0474815368652344, "learning_rate": 8.343357218347226e-07, "loss": 0.0106, "step": 32090 }, { "epoch": 1.8844663613948573, "grad_norm": 0.13035760819911957, "learning_rate": 8.25944136434248e-07, "loss": 0.0117, "step": 32100 }, { "epoch": 1.8850534225666316, "grad_norm": 1.109013557434082, "learning_rate": 8.175946136024792e-07, "loss": 0.0187, "step": 32110 }, { "epoch": 1.8856404837384055, "grad_norm": 0.22117137908935547, "learning_rate": 8.092871604814645e-07, "loss": 0.0098, "step": 32120 }, { "epoch": 1.8862275449101795, "grad_norm": 2.7652060985565186, "learning_rate": 8.01021784177225e-07, "loss": 0.009, "step": 32130 }, { "epoch": 1.8868146060819537, "grad_norm": 2.179360866546631, "learning_rate": 7.927984917598164e-07, "loss": 0.0184, "step": 32140 }, { "epoch": 1.887401667253728, "grad_norm": 0.016889028251171112, "learning_rate": 7.846172902632842e-07, "loss": 0.0059, "step": 32150 }, { "epoch": 1.887988728425502, "grad_norm": 1.3945127725601196, "learning_rate": 7.764781866856808e-07, "loss": 0.0157, "step": 32160 }, { "epoch": 1.888575789597276, "grad_norm": 0.011339795775711536, "learning_rate": 7.683811879890479e-07, "loss": 0.0072, "step": 32170 }, { "epoch": 1.88916285076905, "grad_norm": 0.8970181345939636, "learning_rate": 7.603263010993955e-07, "loss": 0.0197, "step": 32180 }, { "epoch": 1.8897499119408243, "grad_norm": 0.0005330070271156728, "learning_rate": 7.523135329067343e-07, "loss": 0.0072, "step": 32190 }, { "epoch": 1.8903369731125983, "grad_norm": 1.4529893398284912, "learning_rate": 7.443428902650262e-07, "loss": 0.0178, "step": 32200 }, { "epoch": 1.8909240342843723, "grad_norm": 2.6361594200134277, "learning_rate": 7.364143799922119e-07, "loss": 0.0255, "step": 32210 }, { "epoch": 1.8915110954561465, "grad_norm": 0.16234420239925385, "learning_rate": 7.285280088701996e-07, "loss": 0.0063, "step": 32220 }, { "epoch": 1.8920981566279207, "grad_norm": 0.2861548662185669, "learning_rate": 7.206837836448377e-07, "loss": 0.0066, "step": 32230 }, { "epoch": 1.8926852177996947, "grad_norm": 3.8739373683929443, "learning_rate": 7.128817110259312e-07, "loss": 0.0255, "step": 32240 }, { "epoch": 1.8932722789714689, "grad_norm": 0.5779154896736145, "learning_rate": 7.051217976872248e-07, "loss": 0.0137, "step": 32250 }, { "epoch": 1.8938593401432429, "grad_norm": 0.04500158876180649, "learning_rate": 6.974040502664092e-07, "loss": 0.0047, "step": 32260 }, { "epoch": 1.894446401315017, "grad_norm": 0.9955960512161255, "learning_rate": 6.897284753650924e-07, "loss": 0.004, "step": 32270 }, { "epoch": 1.8950334624867913, "grad_norm": 0.8616155982017517, "learning_rate": 6.820950795488223e-07, "loss": 0.0139, "step": 32280 }, { "epoch": 1.8956205236585653, "grad_norm": 0.07726386934518814, "learning_rate": 6.745038693470651e-07, "loss": 0.0088, "step": 32290 }, { "epoch": 1.8962075848303392, "grad_norm": 0.9140532612800598, "learning_rate": 6.669548512531986e-07, "loss": 0.0124, "step": 32300 }, { "epoch": 1.8967946460021134, "grad_norm": 0.20916344225406647, "learning_rate": 6.594480317245133e-07, "loss": 0.0058, "step": 32310 }, { "epoch": 1.8973817071738877, "grad_norm": 0.36542436480522156, "learning_rate": 6.519834171822003e-07, "loss": 0.0049, "step": 32320 }, { "epoch": 1.8979687683456616, "grad_norm": 0.034110747277736664, "learning_rate": 6.445610140113467e-07, "loss": 0.0138, "step": 32330 }, { "epoch": 1.8985558295174356, "grad_norm": 0.8014510273933411, "learning_rate": 6.371808285609515e-07, "loss": 0.0045, "step": 32340 }, { "epoch": 1.8991428906892098, "grad_norm": 0.019619951024651527, "learning_rate": 6.298428671438705e-07, "loss": 0.0068, "step": 32350 }, { "epoch": 1.899729951860984, "grad_norm": 2.050294876098633, "learning_rate": 6.225471360368773e-07, "loss": 0.0079, "step": 32360 }, { "epoch": 1.900317013032758, "grad_norm": 0.8113236427307129, "learning_rate": 6.152936414805854e-07, "loss": 0.0121, "step": 32370 }, { "epoch": 1.900904074204532, "grad_norm": 2.032452344894409, "learning_rate": 6.080823896795095e-07, "loss": 0.0104, "step": 32380 }, { "epoch": 1.9014911353763062, "grad_norm": 0.0728062316775322, "learning_rate": 6.009133868020156e-07, "loss": 0.0082, "step": 32390 }, { "epoch": 1.9020781965480804, "grad_norm": 0.902544379234314, "learning_rate": 5.93786638980337e-07, "loss": 0.0072, "step": 32400 }, { "epoch": 1.9026652577198544, "grad_norm": 0.01131579652428627, "learning_rate": 5.867021523105587e-07, "loss": 0.0061, "step": 32410 }, { "epoch": 1.9032523188916284, "grad_norm": 0.42741164565086365, "learning_rate": 5.796599328526219e-07, "loss": 0.0081, "step": 32420 }, { "epoch": 1.9038393800634026, "grad_norm": 0.25213170051574707, "learning_rate": 5.726599866303084e-07, "loss": 0.0107, "step": 32430 }, { "epoch": 1.9044264412351768, "grad_norm": 0.3988177180290222, "learning_rate": 5.657023196312394e-07, "loss": 0.0088, "step": 32440 }, { "epoch": 1.9050135024069508, "grad_norm": 0.24899500608444214, "learning_rate": 5.587869378068711e-07, "loss": 0.0043, "step": 32450 }, { "epoch": 1.9056005635787248, "grad_norm": 1.2634080648422241, "learning_rate": 5.519138470724938e-07, "loss": 0.0057, "step": 32460 }, { "epoch": 1.906187624750499, "grad_norm": 2.8402352333068848, "learning_rate": 5.450830533072271e-07, "loss": 0.0212, "step": 32470 }, { "epoch": 1.9067746859222732, "grad_norm": 0.12431987375020981, "learning_rate": 5.38294562353997e-07, "loss": 0.0057, "step": 32480 }, { "epoch": 1.9073617470940472, "grad_norm": 0.42505574226379395, "learning_rate": 5.315483800195531e-07, "loss": 0.0165, "step": 32490 }, { "epoch": 1.9079488082658211, "grad_norm": 3.167966842651367, "learning_rate": 5.248445120744516e-07, "loss": 0.0197, "step": 32500 }, { "epoch": 1.9085358694375953, "grad_norm": 0.5327710509300232, "learning_rate": 5.181829642530667e-07, "loss": 0.0088, "step": 32510 }, { "epoch": 1.9091229306093696, "grad_norm": 0.3622607886791229, "learning_rate": 5.115637422535513e-07, "loss": 0.0045, "step": 32520 }, { "epoch": 1.9097099917811438, "grad_norm": 2.9486169815063477, "learning_rate": 5.049868517378653e-07, "loss": 0.0071, "step": 32530 }, { "epoch": 1.9102970529529177, "grad_norm": 0.04305504634976387, "learning_rate": 4.984522983317641e-07, "loss": 0.0055, "step": 32540 }, { "epoch": 1.9108841141246917, "grad_norm": 0.1499515175819397, "learning_rate": 4.919600876247709e-07, "loss": 0.0122, "step": 32550 }, { "epoch": 1.911471175296466, "grad_norm": 0.01733092963695526, "learning_rate": 4.855102251702159e-07, "loss": 0.0127, "step": 32560 }, { "epoch": 1.9120582364682401, "grad_norm": 1.5027722120285034, "learning_rate": 4.791027164851803e-07, "loss": 0.0147, "step": 32570 }, { "epoch": 1.9126452976400141, "grad_norm": 1.6580020189285278, "learning_rate": 4.727375670505352e-07, "loss": 0.006, "step": 32580 }, { "epoch": 1.913232358811788, "grad_norm": 1.0303738117218018, "learning_rate": 4.6641478231090327e-07, "loss": 0.0088, "step": 32590 }, { "epoch": 1.9138194199835623, "grad_norm": 0.07857631891965866, "learning_rate": 4.6013436767468053e-07, "loss": 0.0075, "step": 32600 }, { "epoch": 1.9144064811553365, "grad_norm": 0.04092937707901001, "learning_rate": 4.538963285140141e-07, "loss": 0.0271, "step": 32610 }, { "epoch": 1.9149935423271105, "grad_norm": 1.4485965967178345, "learning_rate": 4.477006701648079e-07, "loss": 0.005, "step": 32620 }, { "epoch": 1.9155806034988845, "grad_norm": 1.9311720132827759, "learning_rate": 4.4154739792670594e-07, "loss": 0.0084, "step": 32630 }, { "epoch": 1.9161676646706587, "grad_norm": 0.26783835887908936, "learning_rate": 4.3543651706312026e-07, "loss": 0.016, "step": 32640 }, { "epoch": 1.916754725842433, "grad_norm": 2.5353481769561768, "learning_rate": 4.29368032801164e-07, "loss": 0.0187, "step": 32650 }, { "epoch": 1.9173417870142069, "grad_norm": 0.5464176535606384, "learning_rate": 4.233419503317182e-07, "loss": 0.0039, "step": 32660 }, { "epoch": 1.9179288481859809, "grad_norm": 0.03787325695157051, "learning_rate": 4.1735827480937075e-07, "loss": 0.0039, "step": 32670 }, { "epoch": 1.918515909357755, "grad_norm": 0.20810934901237488, "learning_rate": 4.114170113524496e-07, "loss": 0.0072, "step": 32680 }, { "epoch": 1.9191029705295293, "grad_norm": 1.6326884031295776, "learning_rate": 4.055181650430062e-07, "loss": 0.0101, "step": 32690 }, { "epoch": 1.9196900317013033, "grad_norm": 0.2731231153011322, "learning_rate": 3.996617409268044e-07, "loss": 0.0162, "step": 32700 }, { "epoch": 1.9202770928730772, "grad_norm": 0.012623190879821777, "learning_rate": 3.9384774401330924e-07, "loss": 0.0096, "step": 32710 }, { "epoch": 1.9208641540448514, "grad_norm": 0.04078976809978485, "learning_rate": 3.880761792757148e-07, "loss": 0.0152, "step": 32720 }, { "epoch": 1.9214512152166257, "grad_norm": 1.672556757926941, "learning_rate": 3.823470516508998e-07, "loss": 0.0093, "step": 32730 }, { "epoch": 1.9220382763883996, "grad_norm": 2.3060295581817627, "learning_rate": 3.766603660394663e-07, "loss": 0.0077, "step": 32740 }, { "epoch": 1.9226253375601736, "grad_norm": 0.5405573844909668, "learning_rate": 3.7101612730569004e-07, "loss": 0.009, "step": 32750 }, { "epoch": 1.9232123987319478, "grad_norm": 3.03564190864563, "learning_rate": 3.654143402775478e-07, "loss": 0.0158, "step": 32760 }, { "epoch": 1.923799459903722, "grad_norm": 0.33732870221138, "learning_rate": 3.598550097467068e-07, "loss": 0.0105, "step": 32770 }, { "epoch": 1.924386521075496, "grad_norm": 0.22910763323307037, "learning_rate": 3.543381404685131e-07, "loss": 0.0046, "step": 32780 }, { "epoch": 1.9249735822472702, "grad_norm": 1.3840593099594116, "learning_rate": 3.4886373716199184e-07, "loss": 0.021, "step": 32790 }, { "epoch": 1.9255606434190442, "grad_norm": 2.219992160797119, "learning_rate": 3.434318045098417e-07, "loss": 0.0074, "step": 32800 }, { "epoch": 1.9261477045908184, "grad_norm": 1.1197220087051392, "learning_rate": 3.380423471584515e-07, "loss": 0.0067, "step": 32810 }, { "epoch": 1.9267347657625926, "grad_norm": 2.139739513397217, "learning_rate": 3.3269536971784474e-07, "loss": 0.0207, "step": 32820 }, { "epoch": 1.9273218269343666, "grad_norm": 1.0604846477508545, "learning_rate": 3.2739087676173506e-07, "loss": 0.0157, "step": 32830 }, { "epoch": 1.9279088881061406, "grad_norm": 0.5695417523384094, "learning_rate": 3.2212887282748737e-07, "loss": 0.0152, "step": 32840 }, { "epoch": 1.9284959492779148, "grad_norm": 1.0476090908050537, "learning_rate": 3.169093624161179e-07, "loss": 0.0023, "step": 32850 }, { "epoch": 1.929083010449689, "grad_norm": 1.4658812284469604, "learning_rate": 3.1173234999229973e-07, "loss": 0.005, "step": 32860 }, { "epoch": 1.929670071621463, "grad_norm": 1.7977803945541382, "learning_rate": 3.0659783998435165e-07, "loss": 0.0041, "step": 32870 }, { "epoch": 1.930257132793237, "grad_norm": 0.08751683682203293, "learning_rate": 3.0150583678423825e-07, "loss": 0.0069, "step": 32880 }, { "epoch": 1.9308441939650112, "grad_norm": 1.0991337299346924, "learning_rate": 2.9645634474756435e-07, "loss": 0.0059, "step": 32890 }, { "epoch": 1.9314312551367854, "grad_norm": 1.1782103776931763, "learning_rate": 2.914493681935693e-07, "loss": 0.0196, "step": 32900 }, { "epoch": 1.9320183163085594, "grad_norm": 2.289116144180298, "learning_rate": 2.8648491140513266e-07, "loss": 0.024, "step": 32910 }, { "epoch": 1.9326053774803333, "grad_norm": 0.5296044945716858, "learning_rate": 2.815629786287577e-07, "loss": 0.0053, "step": 32920 }, { "epoch": 1.9331924386521075, "grad_norm": 2.5109994411468506, "learning_rate": 2.766835740745599e-07, "loss": 0.0136, "step": 32930 }, { "epoch": 1.9337794998238818, "grad_norm": 0.006016758270561695, "learning_rate": 2.718467019163118e-07, "loss": 0.0064, "step": 32940 }, { "epoch": 1.9343665609956557, "grad_norm": 1.2763035297393799, "learning_rate": 2.670523662913649e-07, "loss": 0.015, "step": 32950 }, { "epoch": 1.9349536221674297, "grad_norm": 0.33494487404823303, "learning_rate": 2.623005713007165e-07, "loss": 0.0099, "step": 32960 }, { "epoch": 1.935540683339204, "grad_norm": 0.2290242463350296, "learning_rate": 2.5759132100895975e-07, "loss": 0.0019, "step": 32970 }, { "epoch": 1.9361277445109781, "grad_norm": 0.5654285550117493, "learning_rate": 2.529246194443002e-07, "loss": 0.0056, "step": 32980 }, { "epoch": 1.9367148056827521, "grad_norm": 1.9769994020462036, "learning_rate": 2.4830047059853924e-07, "loss": 0.0152, "step": 32990 }, { "epoch": 1.937301866854526, "grad_norm": 0.3480927050113678, "learning_rate": 2.4371887842709606e-07, "loss": 0.0116, "step": 33000 }, { "epoch": 1.937301866854526, "eval_loss": 0.5065711140632629, "eval_runtime": 269.6342, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 33000 }, { "epoch": 1.9378889280263003, "grad_norm": 0.5516364574432373, "learning_rate": 2.391798468489803e-07, "loss": 0.008, "step": 33010 }, { "epoch": 1.9384759891980745, "grad_norm": 0.053052108734846115, "learning_rate": 2.3468337974678624e-07, "loss": 0.0105, "step": 33020 }, { "epoch": 1.9390630503698485, "grad_norm": 0.06822196394205093, "learning_rate": 2.3022948096672049e-07, "loss": 0.0059, "step": 33030 }, { "epoch": 1.9396501115416225, "grad_norm": 0.35682061314582825, "learning_rate": 2.258181543185467e-07, "loss": 0.0045, "step": 33040 }, { "epoch": 1.9402371727133967, "grad_norm": 1.4210172891616821, "learning_rate": 2.2144940357565203e-07, "loss": 0.0066, "step": 33050 }, { "epoch": 1.940824233885171, "grad_norm": 1.6971017122268677, "learning_rate": 2.1712323247496946e-07, "loss": 0.0136, "step": 33060 }, { "epoch": 1.941411295056945, "grad_norm": 0.323036253452301, "learning_rate": 2.1283964471703332e-07, "loss": 0.005, "step": 33070 }, { "epoch": 1.941998356228719, "grad_norm": 0.10017479211091995, "learning_rate": 2.0859864396593488e-07, "loss": 0.0145, "step": 33080 }, { "epoch": 1.942585417400493, "grad_norm": 0.019091343507170677, "learning_rate": 2.044002338493556e-07, "loss": 0.0039, "step": 33090 }, { "epoch": 1.9431724785722673, "grad_norm": 1.455611228942871, "learning_rate": 2.0024441795853388e-07, "loss": 0.0125, "step": 33100 }, { "epoch": 1.9437595397440415, "grad_norm": 0.015015044249594212, "learning_rate": 1.961311998482762e-07, "loss": 0.0092, "step": 33110 }, { "epoch": 1.9443466009158155, "grad_norm": 1.2915472984313965, "learning_rate": 1.9206058303695706e-07, "loss": 0.0321, "step": 33120 }, { "epoch": 1.9449336620875894, "grad_norm": 0.04976224899291992, "learning_rate": 1.8803257100649675e-07, "loss": 0.007, "step": 33130 }, { "epoch": 1.9455207232593636, "grad_norm": 0.757796049118042, "learning_rate": 1.840471672023947e-07, "loss": 0.0044, "step": 33140 }, { "epoch": 1.9461077844311379, "grad_norm": 0.18057653307914734, "learning_rate": 1.801043750336795e-07, "loss": 0.0069, "step": 33150 }, { "epoch": 1.9466948456029118, "grad_norm": 0.7786180973052979, "learning_rate": 1.7620419787294785e-07, "loss": 0.0034, "step": 33160 }, { "epoch": 1.9472819067746858, "grad_norm": 0.17464140057563782, "learning_rate": 1.723466390563311e-07, "loss": 0.0063, "step": 33170 }, { "epoch": 1.94786896794646, "grad_norm": 0.0034637979697436094, "learning_rate": 1.6853170188352306e-07, "loss": 0.0029, "step": 33180 }, { "epoch": 1.9484560291182342, "grad_norm": 1.092969536781311, "learning_rate": 1.6475938961774683e-07, "loss": 0.0114, "step": 33190 }, { "epoch": 1.9490430902900082, "grad_norm": 1.3824563026428223, "learning_rate": 1.610297054857657e-07, "loss": 0.0202, "step": 33200 }, { "epoch": 1.9496301514617822, "grad_norm": 0.7262981534004211, "learning_rate": 1.5734265267787763e-07, "loss": 0.0117, "step": 33210 }, { "epoch": 1.9502172126335564, "grad_norm": 1.4553768634796143, "learning_rate": 1.5369823434792652e-07, "loss": 0.0103, "step": 33220 }, { "epoch": 1.9508042738053306, "grad_norm": 0.94621342420578, "learning_rate": 1.5009645361327983e-07, "loss": 0.0074, "step": 33230 }, { "epoch": 1.9513913349771046, "grad_norm": 0.13350018858909607, "learning_rate": 1.465373135548287e-07, "loss": 0.0102, "step": 33240 }, { "epoch": 1.9519783961488786, "grad_norm": 0.6504955291748047, "learning_rate": 1.4302081721699334e-07, "loss": 0.0117, "step": 33250 }, { "epoch": 1.9525654573206528, "grad_norm": 1.3691706657409668, "learning_rate": 1.3954696760772323e-07, "loss": 0.0114, "step": 33260 }, { "epoch": 1.953152518492427, "grad_norm": 1.3105803728103638, "learning_rate": 1.3611576769848034e-07, "loss": 0.0072, "step": 33270 }, { "epoch": 1.953739579664201, "grad_norm": 0.9720445275306702, "learning_rate": 1.3272722042425577e-07, "loss": 0.005, "step": 33280 }, { "epoch": 1.954326640835975, "grad_norm": 0.3239278197288513, "learning_rate": 1.2938132868354768e-07, "loss": 0.0056, "step": 33290 }, { "epoch": 1.9549137020077492, "grad_norm": 0.13123829662799835, "learning_rate": 1.2607809533836669e-07, "loss": 0.0068, "step": 33300 }, { "epoch": 1.9555007631795234, "grad_norm": 0.01815650425851345, "learning_rate": 1.2281752321423589e-07, "loss": 0.0096, "step": 33310 }, { "epoch": 1.9560878243512974, "grad_norm": 0.23652943968772888, "learning_rate": 1.1959961510018546e-07, "loss": 0.0085, "step": 33320 }, { "epoch": 1.9566748855230713, "grad_norm": 0.10838207602500916, "learning_rate": 1.1642437374876913e-07, "loss": 0.0119, "step": 33330 }, { "epoch": 1.9572619466948455, "grad_norm": 0.31353235244750977, "learning_rate": 1.1329180187600874e-07, "loss": 0.0056, "step": 33340 }, { "epoch": 1.9578490078666198, "grad_norm": 3.3659510612487793, "learning_rate": 1.1020190216146086e-07, "loss": 0.008, "step": 33350 }, { "epoch": 1.958436069038394, "grad_norm": 0.6617619395256042, "learning_rate": 1.071546772481613e-07, "loss": 0.0132, "step": 33360 }, { "epoch": 1.959023130210168, "grad_norm": 0.6812106370925903, "learning_rate": 1.0415012974265281e-07, "loss": 0.0113, "step": 33370 }, { "epoch": 1.959610191381942, "grad_norm": 0.5823590755462646, "learning_rate": 1.0118826221497401e-07, "loss": 0.0166, "step": 33380 }, { "epoch": 1.9601972525537161, "grad_norm": 0.20258504152297974, "learning_rate": 9.82690771986372e-08, "loss": 0.0048, "step": 33390 }, { "epoch": 1.9607843137254903, "grad_norm": 1.6923710107803345, "learning_rate": 9.539257719067274e-08, "loss": 0.0156, "step": 33400 }, { "epoch": 1.9613713748972643, "grad_norm": 1.8547192811965942, "learning_rate": 9.25587646515791e-08, "loss": 0.013, "step": 33410 }, { "epoch": 1.9619584360690383, "grad_norm": 0.07265213876962662, "learning_rate": 8.976764200534504e-08, "loss": 0.0105, "step": 33420 }, { "epoch": 1.9625454972408125, "grad_norm": 0.18601560592651367, "learning_rate": 8.701921163944415e-08, "loss": 0.017, "step": 33430 }, { "epoch": 1.9631325584125867, "grad_norm": 0.12948229908943176, "learning_rate": 8.431347590483474e-08, "loss": 0.0081, "step": 33440 }, { "epoch": 1.9637196195843607, "grad_norm": 0.6142059564590454, "learning_rate": 8.165043711595987e-08, "loss": 0.017, "step": 33450 }, { "epoch": 1.9643066807561347, "grad_norm": 0.8429779410362244, "learning_rate": 7.903009755071967e-08, "loss": 0.017, "step": 33460 }, { "epoch": 1.9648937419279089, "grad_norm": 0.026174206286668777, "learning_rate": 7.645245945051005e-08, "loss": 0.0096, "step": 33470 }, { "epoch": 1.965480803099683, "grad_norm": 0.0425335131585598, "learning_rate": 7.391752502019512e-08, "loss": 0.0063, "step": 33480 }, { "epoch": 1.966067864271457, "grad_norm": 0.715065062046051, "learning_rate": 7.142529642810703e-08, "loss": 0.0179, "step": 33490 }, { "epoch": 1.966654925443231, "grad_norm": 1.5446109771728516, "learning_rate": 6.897577580606273e-08, "loss": 0.0136, "step": 33500 }, { "epoch": 1.9672419866150053, "grad_norm": 0.7537940740585327, "learning_rate": 6.656896524931955e-08, "loss": 0.0148, "step": 33510 }, { "epoch": 1.9678290477867795, "grad_norm": 0.08641387522220612, "learning_rate": 6.420486681663062e-08, "loss": 0.0134, "step": 33520 }, { "epoch": 1.9684161089585535, "grad_norm": 1.0592167377471924, "learning_rate": 6.188348253019505e-08, "loss": 0.011, "step": 33530 }, { "epoch": 1.9690031701303274, "grad_norm": 1.5867942571640015, "learning_rate": 5.960481437568555e-08, "loss": 0.0156, "step": 33540 }, { "epoch": 1.9695902313021016, "grad_norm": 0.37097081542015076, "learning_rate": 5.7368864302226324e-08, "loss": 0.0039, "step": 33550 }, { "epoch": 1.9701772924738759, "grad_norm": 0.4909408688545227, "learning_rate": 5.517563422241523e-08, "loss": 0.0098, "step": 33560 }, { "epoch": 1.9707643536456498, "grad_norm": 0.14873316884040833, "learning_rate": 5.3025126012301586e-08, "loss": 0.0137, "step": 33570 }, { "epoch": 1.9713514148174238, "grad_norm": 0.4595364034175873, "learning_rate": 5.091734151138061e-08, "loss": 0.0173, "step": 33580 }, { "epoch": 1.971938475989198, "grad_norm": 0.6176416277885437, "learning_rate": 4.8852282522615646e-08, "loss": 0.0093, "step": 33590 }, { "epoch": 1.9725255371609722, "grad_norm": 1.2074905633926392, "learning_rate": 4.6829950812421474e-08, "loss": 0.0108, "step": 33600 }, { "epoch": 1.9731125983327464, "grad_norm": 0.31188786029815674, "learning_rate": 4.48503481106588e-08, "loss": 0.0112, "step": 33610 }, { "epoch": 1.9736996595045204, "grad_norm": 0.3260418176651001, "learning_rate": 4.2913476110650887e-08, "loss": 0.0107, "step": 33620 }, { "epoch": 1.9742867206762944, "grad_norm": 0.07644791901111603, "learning_rate": 4.101933646915024e-08, "loss": 0.0089, "step": 33630 }, { "epoch": 1.9748737818480686, "grad_norm": 2.5628581047058105, "learning_rate": 3.9167930806377485e-08, "loss": 0.0145, "step": 33640 }, { "epoch": 1.9754608430198428, "grad_norm": 1.6306122541427612, "learning_rate": 3.7359260705993604e-08, "loss": 0.0101, "step": 33650 }, { "epoch": 1.9760479041916168, "grad_norm": 0.49523115158081055, "learning_rate": 3.559332771508883e-08, "loss": 0.009, "step": 33660 }, { "epoch": 1.9766349653633908, "grad_norm": 0.02993781492114067, "learning_rate": 3.387013334421596e-08, "loss": 0.0138, "step": 33670 }, { "epoch": 1.977222026535165, "grad_norm": 0.4060212969779968, "learning_rate": 3.2189679067368136e-08, "loss": 0.0185, "step": 33680 }, { "epoch": 1.9778090877069392, "grad_norm": 5.356170654296875, "learning_rate": 3.055196632196222e-08, "loss": 0.0087, "step": 33690 }, { "epoch": 1.9783961488787132, "grad_norm": 0.07070734351873398, "learning_rate": 2.8956996508883172e-08, "loss": 0.0097, "step": 33700 }, { "epoch": 1.9789832100504872, "grad_norm": 1.0718694925308228, "learning_rate": 2.7404770992423002e-08, "loss": 0.0066, "step": 33710 }, { "epoch": 1.9795702712222614, "grad_norm": 5.588756561279297, "learning_rate": 2.5895291100336282e-08, "loss": 0.0088, "step": 33720 }, { "epoch": 1.9801573323940356, "grad_norm": 3.1844701766967773, "learning_rate": 2.4428558123795743e-08, "loss": 0.0188, "step": 33730 }, { "epoch": 1.9807443935658096, "grad_norm": 1.5038096904754639, "learning_rate": 2.3004573317431112e-08, "loss": 0.0093, "step": 33740 }, { "epoch": 1.9813314547375835, "grad_norm": 0.77252197265625, "learning_rate": 2.1623337899279173e-08, "loss": 0.0096, "step": 33750 }, { "epoch": 1.9819185159093577, "grad_norm": 0.06425748765468597, "learning_rate": 2.0284853050828166e-08, "loss": 0.0116, "step": 33760 }, { "epoch": 1.982505577081132, "grad_norm": 0.6445738673210144, "learning_rate": 1.898911991699004e-08, "loss": 0.0096, "step": 33770 }, { "epoch": 1.983092638252906, "grad_norm": 0.47015705704689026, "learning_rate": 1.7736139606111534e-08, "loss": 0.0041, "step": 33780 }, { "epoch": 1.98367969942468, "grad_norm": 2.5312390327453613, "learning_rate": 1.6525913189974208e-08, "loss": 0.0138, "step": 33790 }, { "epoch": 1.9842667605964541, "grad_norm": 0.3456118106842041, "learning_rate": 1.5358441703777758e-08, "loss": 0.0174, "step": 33800 }, { "epoch": 1.9848538217682283, "grad_norm": 0.10266964137554169, "learning_rate": 1.42337261461567e-08, "loss": 0.0146, "step": 33810 }, { "epoch": 1.9854408829400023, "grad_norm": 0.05175187066197395, "learning_rate": 1.3151767479169241e-08, "loss": 0.0074, "step": 33820 }, { "epoch": 1.9860279441117763, "grad_norm": 1.0970159769058228, "learning_rate": 1.2112566628302846e-08, "loss": 0.014, "step": 33830 }, { "epoch": 1.9866150052835505, "grad_norm": 2.562270402908325, "learning_rate": 1.1116124482479784e-08, "loss": 0.0169, "step": 33840 }, { "epoch": 1.9872020664553247, "grad_norm": 0.7647679448127747, "learning_rate": 1.0162441894023822e-08, "loss": 0.0126, "step": 33850 }, { "epoch": 1.9877891276270987, "grad_norm": 0.010356120765209198, "learning_rate": 9.251519678710186e-09, "loss": 0.0221, "step": 33860 }, { "epoch": 1.9883761887988727, "grad_norm": 0.2596798539161682, "learning_rate": 8.383358615715598e-09, "loss": 0.0074, "step": 33870 }, { "epoch": 1.9889632499706469, "grad_norm": 2.041486978530884, "learning_rate": 7.557959447657137e-09, "loss": 0.0147, "step": 33880 }, { "epoch": 1.989550311142421, "grad_norm": 0.04968883469700813, "learning_rate": 6.775322880553381e-09, "loss": 0.0146, "step": 33890 }, { "epoch": 1.9901373723141953, "grad_norm": 0.2724190652370453, "learning_rate": 6.035449583868813e-09, "loss": 0.0224, "step": 33900 }, { "epoch": 1.9907244334859693, "grad_norm": 1.732768177986145, "learning_rate": 5.338340190469415e-09, "loss": 0.0155, "step": 33910 }, { "epoch": 1.9913114946577433, "grad_norm": 2.2499430179595947, "learning_rate": 4.6839952966559744e-09, "loss": 0.0198, "step": 33920 }, { "epoch": 1.9918985558295175, "grad_norm": 2.311940908432007, "learning_rate": 4.0724154621418766e-09, "loss": 0.0047, "step": 33930 }, { "epoch": 1.9924856170012917, "grad_norm": 0.39947935938835144, "learning_rate": 3.503601210053109e-09, "loss": 0.0057, "step": 33940 }, { "epoch": 1.9930726781730657, "grad_norm": 1.165054202079773, "learning_rate": 2.9775530269560146e-09, "loss": 0.0056, "step": 33950 }, { "epoch": 1.9936597393448396, "grad_norm": 0.10673221200704575, "learning_rate": 2.494271362807332e-09, "loss": 0.0051, "step": 33960 }, { "epoch": 1.9942468005166138, "grad_norm": 0.22739560902118683, "learning_rate": 2.0537566310097065e-09, "loss": 0.0213, "step": 33970 }, { "epoch": 1.994833861688388, "grad_norm": 0.06764887273311615, "learning_rate": 1.6560092083672817e-09, "loss": 0.0121, "step": 33980 }, { "epoch": 1.995420922860162, "grad_norm": 0.2568235695362091, "learning_rate": 1.3010294351023523e-09, "loss": 0.0098, "step": 33990 }, { "epoch": 1.996007984031936, "grad_norm": 0.19651329517364502, "learning_rate": 9.88817614860915e-10, "loss": 0.0094, "step": 34000 }, { "epoch": 1.9965950452037102, "grad_norm": 0.5874388813972473, "learning_rate": 7.193740147015682e-10, "loss": 0.009, "step": 34010 }, { "epoch": 1.9971821063754844, "grad_norm": 1.3699864149093628, "learning_rate": 4.926988651066111e-10, "loss": 0.0089, "step": 34020 }, { "epoch": 1.9977691675472584, "grad_norm": 0.2916640639305115, "learning_rate": 3.087923599598419e-10, "loss": 0.0061, "step": 34030 }, { "epoch": 1.9983562287190324, "grad_norm": 0.20287832617759705, "learning_rate": 1.6765465658541424e-10, "loss": 0.0116, "step": 34040 }, { "epoch": 1.9989432898908066, "grad_norm": 0.13390199840068817, "learning_rate": 6.928587569232647e-11, "loss": 0.0055, "step": 34050 }, { "epoch": 1.9995303510625808, "grad_norm": 1.4384433031082153, "learning_rate": 1.3686101441034992e-11, "loss": 0.0173, "step": 34060 }, { "epoch": 2.0, "step": 34068, "total_flos": 4.396799291960525e+17, "train_loss": 0.030897805340184815, "train_runtime": 22421.4322, "train_samples_per_second": 1.519, "train_steps_per_second": 1.519 } ], "logging_steps": 10, "max_steps": 34068, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.396799291960525e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }